from __future__ import annotations

import datetime as _dt
import hashlib
import json
import os
import re
from dataclasses import dataclass, field
from pathlib import Path


def _sha256_text(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()


def _sha256_file(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def generate_shadow_dossier(*, style_id: str, source_text: str, source_path: str, action_pack: bool = False) -> str:
    if style_id.lower() in {
        "if.dave.v1",
        "if.dave.v1.1",
        "if.dave.v1.2",
        "if.dave.v1.3",
        "if.dave.v1.6",
        "if.dave.v1.7",
        "if.dave.v1.8",
        "if.dave.v1.9",
        "if.dave.v2.0",
        "if.dave.v2.1",
        "if.dave.v2.2",
        "if.dave.v2.3",
        "if.dave.fr.v1.2",
        "if.dave.fr.v1.3",
        "dave",
        "if://bible/dave/v1.0",
        "if://bible/dave/v1.1",
        "if://bible/dave/v1.2",
        "if://bible/dave/v1.3",
        "if://bible/dave/v1.6",
        "if://bible/dave/v1.7",
        "if://bible/dave/v1.8",
        "if://bible/dave/v1.9",
        "if://bible/dave/v2.0",
        "if://bible/dave/v2.1",
        "if://bible/dave/v2.2",
        "if://bible/dave/v2.3",
        "if://bible/dave/fr/v1.2",
        "if://bible/dave/fr/v1.3",
    }:
        style = style_id.lower()
        locale = "fr" if style in {"if.dave.fr.v1.2", "if.dave.fr.v1.3", "if://bible/dave/fr/v1.2", "if://bible/dave/fr/v1.3"} else "en"
        if style in {
            "if.dave.v2.0",
            "if.dave.v2.1",
            "if.dave.v2.2",
            "if.dave.v2.3",
            "if://bible/dave/v2.0",
            "if://bible/dave/v2.1",
            "if://bible/dave/v2.2",
            "if://bible/dave/v2.3",
        }:
            return _generate_dave_v1_8_mirror(
                source_text=source_text,
                source_path=source_path,
                action_pack=action_pack,
                locale=locale,
                style_version=(
                    "v2.3"
                    if style in {"if.dave.v2.3", "if://bible/dave/v2.3"}
                    else (
                        "v2.2"
                        if style in {"if.dave.v2.2", "if://bible/dave/v2.2"}
                        else ("v2.1" if style in {"if.dave.v2.1", "if://bible/dave/v2.1"} else "v2.0")
                    )
                ),
            )
        if style in {"if.dave.v1.9", "if://bible/dave/v1.9"}:
            return _generate_dave_v1_8_mirror(
                source_text=source_text,
                source_path=source_path,
                action_pack=action_pack,
                locale=locale,
                style_version="v1.9",
            )
        if style in {"if.dave.v1.8", "if://bible/dave/v1.8"}:
            return _generate_dave_v1_8_mirror(
                source_text=source_text,
                source_path=source_path,
                action_pack=action_pack,
                locale=locale,
                style_version="v1.8",
            )
        if style in {"if.dave.v1.7", "if://bible/dave/v1.7"}:
            return _generate_dave_v1_7_mirror(
                source_text=source_text,
                source_path=source_path,
                action_pack=action_pack,
                locale=locale,
            )
        if style in {"if.dave.v1.6", "if://bible/dave/v1.6"}:
            return _generate_dave_v1_6_mirror(
                source_text=source_text,
                source_path=source_path,
                action_pack=action_pack,
                locale=locale,
            )
        if style in {"if.dave.v1.3", "if.dave.fr.v1.3", "if://bible/dave/v1.3", "if://bible/dave/fr/v1.3"}:
            return _generate_dave_v1_3_mirror(
                source_text=source_text,
                source_path=source_path,
                action_pack=action_pack,
                locale=locale,
            )
        return _generate_dave_v1_2_mirror(
            source_text=source_text,
            source_path=source_path,
            action_pack=action_pack,
            locale=locale,
        )
    raise ValueError(f"Unknown style id: {style_id}")
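
# Usage sketch for generate_shadow_dossier. The style id below is one of the ids
# accepted above; the file path and the action_pack value are illustrative only:
#
#     dossier_md = generate_shadow_dossier(
#         style_id="if.dave.v2.3",
#         source_text=Path("source.txt").read_text(encoding="utf-8"),
#         source_path="source.txt",
#         action_pack=True,
#     )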


@dataclass(frozen=True)
class _SourceSection:
    title: str
    body: str
    why_it_matters: str | None = None


@dataclass
class _RenderContext:
    seed: str
    locale: str = "en"
    voice: str = "v1.3"
    used_callouts: set[str] = field(default_factory=set)
    used_diagrams: set[str] = field(default_factory=set)
    used_paragraphs: set[str] = field(default_factory=set)
    used_reframe_tails: set[str] = field(default_factory=set)
    used_punchlines: set[str] = field(default_factory=set)

    def pick_unique(self, *, kind: str, key: str, variants: list[str], used: set[str]) -> str:
        if not variants:
            raise ValueError(f"Missing variants for {kind}")

        digest = hashlib.sha256(f"{self.seed}:{kind}:{key}".encode("utf-8", errors="replace")).digest()
        start = int.from_bytes(digest[:4], "big") % len(variants)

        for offset in range(len(variants)):
            candidate = variants[(start + offset) % len(variants)].strip()
            if candidate and candidate not in used:
                used.add(candidate)
                return candidate

        candidate = variants[start].strip()
        if candidate:
            used.add(candidate)
        return candidate
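
# Note: pick_unique is deterministic for a given (seed, kind, key) triple and skips
# variants already recorded in `used`, so one render never repeats a phrasing while
# re-renders of the same document stay stable.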


_PAGE_SPLIT_RE = re.compile(r"(?m)^===== page-(\d+) =====$")
_URL_RE = re.compile(r"https?://\S+")
_OWASP_TOC_LEADER_RE = re.compile(r"\.\s*\.\s*\.")
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
_TOC_ENTRY_RE = re.compile(r"^\s*(?P<title>.+?)\s+(?:\.\s*){3,}\s+(?P<page>\d+)\s*$")
_METRIC_VALUE_RE = re.compile(r"^(?:\$[\d,]+|\d+%|\d+-month)$")
_METRIC_TOKEN_RE = re.compile(r"\$[\d,]+|\b\d+%|\b\d+-month\b")
_NUMERIC_ANCHOR_RE = re.compile(
    r"\$[\d,]+"
    r"|\b\d+%|\b\d+-month\b"
    r"|\bn\s*=\s*\d+\b"
    r"|\b20\d{2}\b"
    r"|\b\d+(?:[.,]\d+)?\s*(?:€|eur|euros)\b"
    r"|\b\d+(?:[.,]\d+)?\s*(?:ans|jours|mois)\b"
    r"|\b\d+(?:[.,]\d+)?\s*(?:million|milliard|milliards|bn|billion|billions)\b",
    re.IGNORECASE,
)
_OWASP_REFERENCE_ITEM_RE = re.compile(r"^\s*(?P<num>\d+)\.\s*(?P<text>\S.*)$")
_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2,3})\s*$")
_ALPHA_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")

_OWASP_LLM_SUBHEADINGS = [
    "Description",
    "Types of Prompt Injection Vulnerabilities",
    "Common Examples of Vulnerability",
    "Common Examples of Risks",
    "Common Examples of Risk",
    "Prevention and Mitigation Strategies",
    "Example Attack Scenarios",
    "Sample Attack Scenarios",
    "Reference Links",
    "Related Frameworks and Taxonomies",
]

_DAVE_REFRAME_TAILS = [
    "so we can align on an owner, a gate, and an expiry date",
    "so we can socialize the stop condition and keep it on the roadmap",
    "so we can circle back next sprint with a merge-blocking rule",
    "so we can make this a deliverable instead of a vibe",
    "so we can keep stakeholder comfort high and risk acceptance time-bounded",
    "so we can preserve plausible deniability while still enforcing something real",
    "so we can put a pin in it until Legal is comfortable",
    "so we can capture it as an action item with an owner and a deadline",
    "so we can align on a plan to align on the plan, with an owner and an expiry date",
    "so we can turn this into a gate instead of a calendar invite",
    "so we can measure outcomes in something other than meeting attendance",
    "so we can keep this on the roadmap without letting it live in production forever",
]

_DAVE_REFRAME_TAILS_V16 = [
    "so we can route it to governance and avoid irreversible commitments",
    "so we can defer the decision behind a review cycle with an owner and an expiry date",
    "so we can treat this as an artifact until the audit window closes",
    "so we can keep the narrative intact while the enforcement stays optional",
    "so we can record it as progress without changing production behavior",
    "so we can time-box the exception and pretend it was a strategy",
    "so we can preserve plausible deniability while still blocking something measurable",
    "so we can turn it into a stop condition instead of a slide title",
    "so we can align on who owns the risk, not who owns the deck",
    "so we can keep it off the incident timeline and on the roadmap",
    "so we can make drift visible without making accountability mandatory",
    "so we can keep the decision pending until it becomes irrelevant",
]

_DAVE_REFRAME_TAILS_V17 = [
    "so we can log it as progress and revisit after the next reorg",
    "so we can align on ownership and leave the failure mode unchanged",
    "so we can treat the exception as temporary architecture",
    "so we can redefine the KPI until the dashboard goes green",
    "so we can route it through procurement and call it risk reduction",
    "so we can hand it to Legal and mark it as \"in review\" indefinitely",
    "so we can schedule a steering committee and preserve optionality",
    "so we can gate it in theory and waive it in practice",
    "so we can add it to the roadmap and remove it from enforcement",
    "so we can treat silence as compliance until the audit window closes",
    "so we can move the proof to a shared folder and call it governance",
    "so we can make the control visible and keep it unenforceable",
]

_DAVE_REFRAME_TAILS_FR = [
    "afin que nous puissions aligner un responsable, un contrôle en amont et une date d’expiration",
    "afin que nous puissions le socialiser avec les parties prenantes et figer un critère de blocage",
    "afin que nous puissions y revenir au prochain cycle avec une règle réellement opposable",
    "afin que nous puissions en faire un livrable plutôt qu’une intention",
    "afin que nous puissions préserver le confort des parties prenantes tout en bornant l’acceptation du risque",
    "afin que nous puissions rester aimables sur la forme tout en étant vérifiables sur le fond",
    "afin que nous puissions ouvrir une concertation et formaliser une décision",
    "afin que nous puissions consigner un point d’action, avec un responsable et une échéance",
    "afin que nous puissions aligner un plan d’alignement, avec un responsable et une date d’expiration",
    "afin que nous puissions en faire une porte de contrôle plutôt qu’une invitation calendrier",
    "afin que nous puissions mesurer autre chose que la présence en réunion",
    "afin que nous puissions garder cela sur la feuille de route sans le laisser vivre en production indéfiniment",
]


def _looks_like_site_footer(line: str) -> bool:
    s = (line or "").strip()
    if not s:
        return False
    upper = s.upper()
    if upper.startswith("WWW.") and "." in upper and len(upper) <= 64:
        return True
    if re.fullmatch(r"[A-Z0-9.-]+\.[A-Z]{2,}", upper) and len(upper) <= 64:
        return True
    return False


def _looks_like_navigation_heavy_source(text: str) -> bool:
    """
    Heuristic to detect HTML→MD dumps where navigation dominates the payload
    (e.g., long menus, repeated link lists, "Skip to content", JS void links).
    This is used only for mirror-completeness scoring (not for content edits).
    """
    raw_lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    if len(raw_lines) < 40:
        return False

    linkish = 0
    js_void = 0
    for ln in raw_lines[:400]:
        low = ln.lower()
        if "javascript:void" in low:
            js_void += 1
        if "](" in ln or low.startswith("http://") or low.startswith("https://"):
            linkish += 1

    link_ratio = linkish / max(1, min(len(raw_lines), 400))
    if js_void >= 1 and link_ratio >= 0.25:
        return True
    if link_ratio >= 0.45:
        return True

    nav_markers = (
        "skip to content",
        "skip to main content",
        "search search",
        "menu",
        "column one",
        "column two",
        "column three",
        "all rights reserved",
        "cookie",
        "privacy",
        "terms",
    )
    marker_hits = sum(1 for ln in raw_lines[:220] if any(m in ln.lower() for m in nav_markers))
    return marker_hits >= 6


def _looks_like_cover_subtitle_noise(value: str) -> bool:
    """
    Heuristic: cover subtitles should be "title-ish" (short, headline-like),
    not body sentences, author blocks, or explanatory prose.
    """
    s = " ".join((value or "").split()).strip()
    if not s:
        return True

    # Body fragments often start mid-sentence (lowercase).
    if re.match(r"^[a-z]", s):
        return True

    # Cover subtitles should be short; long multi-clause prose is usually body copy.
    if len(s.split()) > 18:
        return True

    # Long prose ending with a period is rarely a subtitle for these sources.
    if s.endswith(".") and len(s) > 60:
        return True

    low = s.lower()
    if "from left to right" in low:
        return True

    # Author/credential blocks (common in analyst PDFs) aren't useful as subtitles.
    if re.search(r"\b(cissp|ccsk|phd|research director|business value manager)\b", low):
        return True

    # Many commas in a long line suggest an author list / affiliations.
    if s.count(",") >= 3 and len(s) > 80:
        return True

    return False


def _extract_urls(text: str) -> list[str]:
    urls: list[str] = []
    for match in _URL_RE.finditer(text):
        url = match.group(0).rstrip(").,;:]>\"'")
        if url not in urls:
            urls.append(url)
    return urls
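
# Example (illustrative URL): _extract_urls("see https://example.com/a).") returns
# ["https://example.com/a"]; trailing punctuation is stripped and duplicates are dropped.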


def _extract_owasp_reference_items(text: str) -> list[str]:
    items: list[tuple[str, list[str]]] = []
    cur_num: str | None = None
    cur_parts: list[str] = []

    def flush() -> None:
        nonlocal cur_num, cur_parts
        if cur_num is None:
            return
        joined = " ".join([p.strip() for p in cur_parts if p.strip()]).strip()
        joined = re.sub(r"\s{2,}", " ", joined).strip()
        if joined:
            items.append((cur_num, [joined]))
        cur_num = None
        cur_parts = []

    for ln in (text or "").splitlines():
        m = _OWASP_REFERENCE_ITEM_RE.match(ln)
        if m:
            flush()
            cur_num = m.group("num")
            cur_parts = [m.group("text").strip()]
            continue

        if cur_num is None:
            continue

        s = ln.strip()
        if not s:
            continue
        if s.startswith("genai.owasp.org") or "OWASP Top 10 for LLM" in s:
            continue
        cur_parts.append(s)

    flush()

    out: list[str] = []
    for num, parts in items:
        text = " ".join(parts).strip()
        if text:
            out.append(f"{num}. {text}")
    return out


def _inject_reframe_tail(line: str, tail: str) -> str:
    m = re.match(r"^(?P<prefix>\s*>\s*)(?P<body>.*)$", line.rstrip())
    if not m:
        return line.rstrip()

    prefix = m.group("prefix")
    body = m.group("body").rstrip()
    if not body or "—" in body:
        return f"{prefix}{body}".rstrip()

    closers = ""
    while body and body[-1] in {")", "]", '"', "'", "”", "’"}:
        closers = body[-1] + closers
        body = body[:-1]

    punct = "."
    if body and body[-1] in {".", "!", "?"}:
        punct = body[-1]
        body = body[:-1]

    body = body.rstrip()
    return f"{prefix}{body} — {tail}{punct}{closers}".rstrip()
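
# Example: _inject_reframe_tail("> Ship it now!", "so we can align on an owner")
# yields "> Ship it now — so we can align on an owner!": the tail is spliced in ahead
# of the closing punctuation, and lines that already contain an em dash are left alone.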


def _inject_plain_tail(text: str, tail: str) -> str:
    body = (text or "").strip()
    if not body or "—" in body:
        return body

    closers = ""
    while body and body[-1] in {")", "]", '"', "'", "”", "’"}:
        closers = body[-1] + closers
        body = body[:-1]

    punct = "."
    if body and body[-1] in {".", "!", "?"}:
        punct = body[-1]
        body = body[:-1]

    body = body.rstrip()
    return f"{body} — {tail}{punct}{closers}".strip()


def _daveify_callout_reframe(callout: str, *, ctx: _RenderContext, key: str) -> str:
    lines = callout.splitlines()
    for idx in range(len(lines) - 1, -1, -1):
        stripped = lines[idx].lstrip()
        if not stripped.startswith(">"):
            continue
        if re.match(r"^>\s*\*\*[^*]+\*\*:", stripped):
            continue
        if ctx.locale.lower().startswith("fr"):
            tails = _DAVE_REFRAME_TAILS_FR
        else:
            tails = (
                _DAVE_REFRAME_TAILS_V17
                if ctx.voice == "v1.7"
                else _DAVE_REFRAME_TAILS_V16
                if ctx.voice == "v1.6"
                else _DAVE_REFRAME_TAILS
            )
        tail = ctx.pick_unique(kind="reframe_tail", key=key, variants=tails, used=ctx.used_reframe_tails)
        lines[idx] = _inject_reframe_tail(lines[idx], tail)
        return "\n".join(lines).strip()
    return callout.strip()


def _looks_like_owasp_llm_top10(text: str) -> bool:
    if "OWASP Top 10 for" not in text and "OWASP Top 10" not in text:
        return False
    return "LLM01" in text and "LLM10" in text


def _looks_like_idc_business_value(text: str) -> bool:
    if "Business Value White Paper" not in text:
        return False
    if "Table of Contents" not in text:
        return False
    return "IDC #" in text or "IDC" in text


def _paragraphs_from_lines(text: str) -> list[str]:
    paragraphs: list[str] = []
    buf: list[str] = []
    for ln in text.splitlines():
        s = ln.strip()
        if not s:
            if buf:
                paragraphs.append(" ".join(buf).strip())
                buf = []
            continue
        buf.append(s)
    if buf:
        paragraphs.append(" ".join(buf).strip())
    paragraphs = [re.sub(r"\s{2,}", " ", p).strip() for p in paragraphs if p.strip()]
    return paragraphs


def _first_sentences(text: str, *, max_sentences: int = 2, max_chars: int = 260) -> str:
    paragraphs = _paragraphs_from_lines(text)
    if not paragraphs:
        return ""
    sentences: list[str] = []
    for para in paragraphs[:3]:
        for sent in _SENTENCE_SPLIT_RE.split(para):
            cleaned = sent.strip()
            if cleaned:
                sentences.append(cleaned)
            if len(sentences) >= max_sentences:
                break
        if len(sentences) >= max_sentences:
            break
    snippet = " ".join(sentences).strip()
    if len(snippet) > max_chars:
        snippet = snippet[: max_chars - 1].rstrip() + "…"
    return snippet


def _extract_numeric_anchors(text: str, *, limit: int = 3) -> list[str]:
    anchors: list[str] = []
    for match in _NUMERIC_ANCHOR_RE.finditer(text or ""):
        value = match.group(0).strip()
        if not value:
            continue
        normalized = re.sub(r"\s+", "", value.lower())
        if normalized not in {re.sub(r"\s+", "", a.lower()) for a in anchors}:
            anchors.append(value)
        if len(anchors) >= limit:
            break
    return anchors
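
# Example: _extract_numeric_anchors("saves $1,200 (30%) over a 6-month pilot")
# returns ["$1,200", "30%", "6-month"]; collection stops after `limit` distinct anchors.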


def _split_owasp_llm_subsections(body: str) -> list[tuple[str, str]]:
    headings = set(_OWASP_LLM_SUBHEADINGS)
    lines = [ln.rstrip() for ln in body.splitlines()]

    parts: list[tuple[str, str]] = []
    cur: str | None = None
    buf: list[str] = []

    def flush() -> None:
        nonlocal cur, buf
        if cur is None:
            return
        parts.append((cur, "\n".join(buf).strip()))
        cur = None
        buf = []

    for ln in lines:
        s = ln.strip()
        if s in headings:
            flush()
            cur = s
            buf = []
            continue
        if cur is None:
            continue
        buf.append(ln)

    flush()
    return parts


def _extract_idc_highlight_metrics(body: str) -> list[tuple[str, str]]:
    lines = [ln.rstrip("\n") for ln in body.splitlines()]
    # Find a line with three metric tokens to derive column boundaries.
    col_starts: list[int] = []
    for ln in lines:
        tokens = list(_METRIC_TOKEN_RE.finditer(ln))
        if len(tokens) >= 3:
            col_starts = [tokens[0].start(), tokens[1].start(), tokens[2].start()]
            break

    if not col_starts:
        return []

    starts = col_starts + [10_000]

    metrics_by_col: list[list[tuple[str, list[str]]]] = [[], [], []]
    current: list[tuple[str, list[str]] | None] = [None, None, None]

    def flush(col: int) -> None:
        if current[col] is None:
            return
        value, parts = current[col]
        desc = " ".join([p.strip() for p in parts if p.strip()]).strip()
        metrics_by_col[col].append((value, [desc] if desc else []))
        current[col] = None

    for ln in lines:
        # Stop parsing if the next major heading begins.
        if ln.strip() == "Executive Summary":
            break

        padded = ln + " " * 5
        cols = []
        for i in range(3):
            segment = padded[starts[i] : starts[i + 1]].strip()
            cols.append(segment)

        for i, seg in enumerate(cols):
            if not seg:
                continue
            if _METRIC_VALUE_RE.match(seg):
                flush(i)
                current[i] = (seg, [])
                continue
            if current[i] is None:
                continue
            current[i][1].append(seg)

    for i in range(3):
        flush(i)

    flattened: list[tuple[str, str]] = []
    for col_metrics in metrics_by_col:
        for value, desc_parts in col_metrics:
            desc = desc_parts[0] if desc_parts else ""
            if desc:
                flattened.append((value, desc))
    return flattened


def _normalize_ocr(text: str) -> str:
    text = re.sub(r"\bAl\b", "AI", text)
    text = text.replace("GenAl", "GenAI")
    text = text.replace("Cl/CD", "CI/CD")
    text = text.replace("olugin", "plugin")
    text = text.replace("FORIWARD", "FORWARD")
    text = text.replace("\\u00agrave", "à")
    return text


_UNICODE_QUOTE_MAP = str.maketrans(
    {
        "\u201c": '"',
        "\u201d": '"',
        "\u2018": "'",
        "\u2019": "'",
        "\u2013": "-",
        "\u2014": "-",
        "\u00a0": " ",
    }
)


def _normalize_unicode_punctuation(text: str) -> str:
    return str(text or "").translate(_UNICODE_QUOTE_MAP)


def _coerce_json_code_block(code: str) -> tuple[str, str, bool, bool]:
    cleaned = _normalize_unicode_punctuation(code).strip()
    jsonish = cleaned.lstrip().startswith("{") or cleaned.lstrip().startswith("[")
    try:
        obj = json.loads(cleaned)
    except Exception:
        return "text", cleaned, False, jsonish
    return "json", json.dumps(obj, indent=2, ensure_ascii=False), True, True
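
# Example: _coerce_json_code_block('{"a": 1}') returns ("json", '{\n  "a": 1\n}', True, True);
# input that does not parse comes back as ("text", cleaned, False, jsonish) after smart
# quotes and non-breaking spaces are normalized.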


def _parse_pages(source_text: str) -> list[tuple[str, str]]:
    matches = list(_PAGE_SPLIT_RE.finditer(source_text))
    if not matches:
        if "\f" in source_text:
            parts = [p.strip() for p in source_text.split("\f")]
            pages: list[tuple[str, str]] = []
            page_no = 1
            for part in parts:
                if part:
                    pages.append((str(page_no), part))
                page_no += 1
            return pages or [("doc", source_text.strip())]
        return [("doc", source_text.strip())]

    pages: list[tuple[str, str]] = []
    for idx, match in enumerate(matches):
        page_no = match.group(1)
        start = match.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(source_text)
        pages.append((page_no, source_text[start:end].strip()))
    return pages
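
# _parse_pages expects explicit markers such as "===== page-3 =====" on their own
# lines; without them it falls back to form-feed ("\f") splits, and as a last resort
# it returns the whole text as a single ("doc", ...) entry.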


def _parse_title_block(lines: list[str]) -> tuple[str, int]:
    i = 0
    while i < len(lines) and not lines[i].strip():
        i += 1
    title_lines: list[str] = []
    # Title blocks should be short; OCR/PDF extraction sometimes concatenates body text into the "title".
    # Heuristic: keep up to a few short lines and stop before body-like lines (long sentences, URLs, etc.).
    max_title_lines = 3
    max_title_words = 14
    max_title_chars = 110
    max_total_words = 18
    max_total_chars = 120
    total_words = 0
    total_chars = 0
    while i < len(lines) and lines[i].strip():
        stripped = lines[i].strip()
        lower = stripped.lower()
        if lower == "snyk":
            i += 1
            continue
        # Skip common page-header noise (e.g., "… | Datasheet 1").
        if "|" in stripped and "datasheet" in lower:
            i += 1
            continue
        # If the very first non-empty line is already "body-like", synthesize a short title
        # from it but keep the full line in the body (do not consume it).
        word_count = len(stripped.split())
        looks_body_like = (
            len(stripped) > max_title_chars
            or word_count > max_title_words
            or bool(re.search(r"\s{3,}", stripped))
            or "http://" in lower
            or "https://" in lower
            or (stripped.endswith(".") and word_count > 8)
        )
        if not title_lines and looks_body_like:
            title_lines.append(_compact_title(stripped, max_chars=72))
            break
        # Stop title capture when we hit body-like lines.
        if title_lines:
            if "http://" in lower or "https://" in lower:
                break
            if len(stripped) > max_title_chars:
                break
            if word_count > max_title_words:
                break
            if stripped.endswith(".") and word_count > 8:
                break
        # Global caps: don't let multiple short lines turn into a paragraph-sized title.
        if total_words + word_count > max_total_words:
            break
        if total_chars + len(stripped) > max_total_chars:
            break
        if len(title_lines) >= max_title_lines:
            break
        title_lines.append(stripped)
        total_words += len(stripped.split())
        total_chars += len(stripped)
        i += 1
    while i < len(lines) and not lines[i].strip():
        i += 1
    title = " ".join(title_lines).strip() or "UNTITLED"
    return title, i


def _extract_title_above(lines: list[str], why_idx: int) -> str:
    j = why_idx - 1
    while j >= 0 and not lines[j].strip():
        j -= 1

    title_lines: list[str] = []
    while j >= 0 and lines[j].strip():
        title_lines.append(lines[j].strip())
        j -= 1
    title_lines.reverse()

    k = j
    while k >= 0 and not lines[k].strip():
        k -= 1
    if k >= 0 and re.fullmatch(r"\d{1,3}", lines[k].strip()):
        title_lines.insert(0, lines[k].strip())

    title = " ".join(title_lines).strip()
    match = re.match(r"^(\d{1,3})\s+(.+)$", title)
    if match:
        label = match.group(1)
        if len(label) == 3 and label.startswith("0"):
            label = label[:2]
        title = f"{label} — {match.group(2)}"
    return title


def _parse_sections_from_page(page_text: str) -> list[_SourceSection]:
    lines = [ln.rstrip() for ln in page_text.splitlines()]
    why_idxs = [i for i, ln in enumerate(lines) if ln.strip().lower().startswith("why it matters:")]

    if not why_idxs:
        title, body_start = _parse_title_block(lines)
        body = "\n".join([ln for ln in lines[body_start:] if ln.strip() and ln.strip().lower() != "snyk"]).strip()
        return [_SourceSection(title=title, body=body, why_it_matters=None)]

    sections: list[_SourceSection] = []
    for idx, why_idx in enumerate(why_idxs):
        title = _extract_title_above(lines, why_idx)
        end = why_idxs[idx + 1] if idx + 1 < len(why_idxs) else len(lines)
        why = lines[why_idx].strip()
        body = "\n".join(lines[why_idx + 1 : end]).strip()
        sections.append(_SourceSection(title=title, body=body, why_it_matters=why))
    return sections


def _clean_page_body(text: str) -> str:
    lines: list[str] = []
    for ln in (text or "").splitlines():
        s = ln.strip()
        if not s:
            if lines and lines[-1] != "":
                lines.append("")
            continue
        if _looks_like_site_footer(s):
            continue
        lines.append(ln.rstrip())
    return "\n".join(lines).strip()


def _parse_numbered_section_page(page_text: str) -> tuple[str, str, str] | None:
    lines = [ln.rstrip() for ln in (page_text or "").splitlines()]
    i = 0
    while i < len(lines):
        s = lines[i].strip()
        if not s:
            i += 1
            continue
        if _looks_like_site_footer(s):
            i += 1
            continue
        break
    if i >= len(lines):
        return None

    m = _NUMBERED_SECTION_START_RE.match(lines[i].strip())
    if not m:
        return None

    num = m.group("num")
    # Some PDFs/OCR runs concatenate the section label with a nearby page marker, e.g. `035` for section `03`.
    if len(num) == 3 and num.startswith("0"):
        num = num[:2]
    if len(num) != 2:
        return None

    j = i + 1
    while j < len(lines) and not lines[j].strip():
        j += 1

    title_lines: list[str] = []
    while j < len(lines) and lines[j].strip():
        s = lines[j].strip()
        if _looks_like_site_footer(s):
            break
        title_lines.append(s)
        j += 1

    title = " ".join(title_lines).strip()
    if not title or not _ALPHA_RE.search(title):
        return None

    while j < len(lines) and not lines[j].strip():
        j += 1

    body = _clean_page_body("\n".join(lines[j:]))
    return (num, title, body)


def _extract_sections_numbered_outline(pages: list[tuple[str, str]]) -> list[_SourceSection] | None:
    starts = 0
    for _page_no, page_text in pages:
        if _parse_numbered_section_page(page_text):
            starts += 1
    if starts < 3:
        return None

    cover_parts: list[str] = []
    sections: list[_SourceSection] = []
    cur_title: str | None = None
    cur_body_parts: list[str] = []

    def flush_cover() -> None:
        nonlocal cover_parts
        cover_text = "\n\n".join([p for p in cover_parts if p.strip()]).strip()
        cover_parts = []
        if not cover_text:
            return
        sections.append(_SourceSection(title="COUVERTURE", body=_clean_page_body(cover_text), why_it_matters=None))

    def flush_section() -> None:
        nonlocal cur_title, cur_body_parts
        if cur_title is None:
            return
        body = "\n\n".join([p for p in cur_body_parts if p.strip()]).strip()
        sections.append(_SourceSection(title=cur_title, body=_clean_page_body(body), why_it_matters=None))
        cur_title = None
        cur_body_parts = []

    for _page_no, page_text in pages:
        parsed = _parse_numbered_section_page(page_text)
        if parsed:
            num, title, body = parsed
            if cur_title is None:
                flush_cover()
            else:
                flush_section()
            cur_title = f"{num} — {title}"
            cur_body_parts = [body] if body else []
            continue

        cleaned = _clean_page_body(page_text)
        if not cleaned:
            continue
        if cur_title is None:
            cover_parts.append(cleaned)
        else:
            cur_body_parts.append(cleaned)

    if cur_title is None:
        flush_cover()
    else:
        flush_section()

    return sections or None


def _extract_sections(source_text: str) -> list[_SourceSection]:
    if _looks_like_owasp_llm_top10(source_text):
        return _extract_sections_owasp_llm_top10(source_text)
    if _looks_like_idc_business_value(source_text):
        return _extract_sections_idc_business_value(source_text)

    pages = _parse_pages(source_text)
    numbered = _extract_sections_numbered_outline(pages)
    if numbered:
        return numbered
    sections: list[_SourceSection] = []
    for _page_no, page_text in pages:
        if page_text.strip():
            sections.extend(_parse_sections_from_page(page_text))
    if sections:
        return sections

    # Fallback: lightweight Markdown heading parsing for HTML→MD mirrors and other non-PDF inputs
    # where page/outline heuristics fail (e.g., long navigation-heavy pages).
    sections = _extract_sections_markdown_headings(source_text)
    if sections:
        return sections

    # Last-resort: keep the document reviewable (and Action Pack-able) even when structure is poor.
    fallback_title = _first_non_empty_line(source_text) or "Source"
    fallback_body = _compact_body(source_text, max_chars=12000)
    return [_SourceSection(title=fallback_title, body=fallback_body, why_it_matters=None)]


def _first_non_empty_line(text: str) -> str | None:
    for ln in text.splitlines():
        s = ln.strip()
        if s:
            return s
    return None


def _compact_body(text: str, *, max_chars: int) -> str:
    if max_chars <= 0:
        return ""
    s = "\n".join([ln.rstrip() for ln in text.splitlines()]).strip()
    if len(s) <= max_chars:
        return s
    # Keep a clean boundary so downstream renderers don't inherit half-glyph garbage.
    cut = s.rfind("\n", 0, max_chars)
    if cut < int(max_chars * 0.6):
        cut = max_chars
    return s[:cut].rstrip() + "\n\n…"


def _extract_sections_markdown_headings(source_text: str) -> list[_SourceSection]:
    lines = [ln.rstrip("\n") for ln in source_text.splitlines()]
    if not lines:
        return []

    sections: list[_SourceSection] = []
    cur_title: str | None = None
    cur_body: list[str] = []

    def flush() -> None:
        nonlocal cur_title, cur_body
        if cur_title is None:
            return
        body = "\n".join(cur_body).strip()
        sections.append(_SourceSection(title=cur_title.strip(), body=body, why_it_matters=None))
        cur_title = None
        cur_body = []

    def is_underline_heading(idx: int) -> bool:
        if idx + 1 >= len(lines):
            return False
        title = lines[idx].strip()
        underline = lines[idx + 1].strip()
        if not title or not underline:
            return False
        if set(underline) == {"="} or set(underline) == {"-"}:
            return len(underline) >= 3 and len(underline) >= max(3, int(len(title) * 0.5))
        return False

    i = 0
    while i < len(lines):
        raw = lines[i]
        s = raw.strip()
        if not s:
            if cur_title is not None and (cur_body and cur_body[-1] != ""):
                cur_body.append("")
            i += 1
            continue

        if s.startswith("#"):
            title = s.lstrip("#").strip()
            if title:
                flush()
                cur_title = title
                cur_body = []
            i += 1
            continue

        if is_underline_heading(i):
            flush()
            cur_title = lines[i].strip()
            cur_body = []
            i += 2
            continue

        if cur_title is None:
            # Ignore leading navigation / boilerplate until we see a heading.
            i += 1
            continue

        cur_body.append(raw.rstrip())
        i += 1

    flush()

    # Filter out empty shells (heading with no body) but keep at least one section if any exists.
    non_empty = [s for s in sections if (s.body or "").strip()]
    return non_empty or sections
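
# is_underline_heading (above) also accepts Setext-style headings, e.g.
#
#     Overview
#     ========
#
# as long as the underline is at least three characters long and at least half the
# length of the title line.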


def _owasp_clean_lines(lines: list[str]) -> list[str]:
    cleaned: list[str] = []
    for ln in lines:
        s = ln.strip()
        if not s:
            cleaned.append("")
            continue
        if s.startswith("OWASP Top 10 for LLM Applications"):
            continue
        if "genai.owasp.org" in s:
            continue
        if s.isdigit() and len(s) <= 3:
            continue
        # Drop sponsor-logo garbage / broken glyph runs.
        if sum(1 for ch in s if " " <= ch <= "~") < max(4, int(len(s) * 0.35)):
            continue
        cleaned.append(s)
    return cleaned


def _owasp_looks_like_toc_entry(line: str) -> bool:
    s = line.strip()
    if not s:
        return False
    if not _OWASP_TOC_LEADER_RE.search(s):
        return False
    return bool(re.search(r"\d\s*$", s))


def _extract_sections_owasp_llm_top10(source_text: str) -> list[_SourceSection]:
    pages = source_text.split("\f")
    if not pages:
        return []

    cover_lines = [ln.rstrip() for ln in pages[0].splitlines()]
    cover_title, cover_start = _parse_title_block(cover_lines)
    cover_body = "\n".join([ln for ln in cover_lines[cover_start:] if ln.strip()]).strip()

    sections: list[_SourceSection] = [_SourceSection(title=cover_title, body=cover_body, why_it_matters=None)]

    major_exact = {
        "LICENSE AND USAGE",
        "REVISION HISTORY",
        "Table of Contents",
        "Letter from the Project Leads",
        "What’s New in the 2025 Top 10",
        "What's New in the 2025 Top 10",
        "Moving Forward",
        "Project Sponsors",
    }
    llm_re = re.compile(r"^LLM\d{2}:")
    appendix_re = re.compile(r"^Appendix\s+\d+:")

    lines: list[str] = []
    for pg in pages[1:]:
        lines.extend(_owasp_clean_lines([ln.rstrip("\n") for ln in pg.splitlines()]))
        lines.append("")

    cur_title: str | None = None
    cur_body: list[str] = []

    def flush() -> None:
        nonlocal cur_title, cur_body
        if cur_title is None:
            return
        body = "\n".join(cur_body).strip()
        sections.append(_SourceSection(title=cur_title, body=body, why_it_matters=None))
        cur_title = None
        cur_body = []

    for ln in lines:
        s = ln.strip()
        if not s:
            if cur_title is not None and (cur_body and cur_body[-1] != ""):
                cur_body.append("")
            continue

        is_heading = False
        if s in major_exact:
            is_heading = True
        elif llm_re.match(s) and not _owasp_looks_like_toc_entry(s):
            is_heading = True
        elif appendix_re.match(s) and not _owasp_looks_like_toc_entry(s):
            is_heading = True

        if is_heading:
            flush()
            cur_title = s
            cur_body = []
            continue

        if cur_title is None:
            continue
        cur_body.append(s)

    flush()
    return sections


def _idc_clean_lines(lines: list[str]) -> list[str]:
    cleaned: list[str] = []
    for ln in lines:
        s = ln.strip()
        if not s:
            cleaned.append("")
            continue
        if "Business Value White Paper" in s:
            continue
        if s.startswith("January ") and "IDC #" in s:
            continue
        if "Return to Highlights" in s:
            continue
        if s.isdigit() and len(s) <= 3:
            continue
        cleaned.append(ln.rstrip())
    return cleaned


def _normalize_heading(value: str) -> str:
    cleaned = re.sub(r"[^A-Za-z0-9]+", " ", value).strip().upper()
    return re.sub(r"\s{2,}", " ", cleaned)
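
# Example: _normalize_heading("Table of  Contents...") == "TABLE OF CONTENTS".
# Punctuation and whitespace runs collapse to single spaces, which is what lets
# _find_heading_anchor (below) match TOC titles against in-page headings.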


def _extract_idc_toc_entries(pages: list[list[str]]) -> list[tuple[str, int]]:
    toc_page = None
    toc_line_idx = None
    for p_idx, page_lines in enumerate(pages):
        for l_idx, ln in enumerate(page_lines):
            if ln.strip() == "Table of Contents":
                toc_page = p_idx
                toc_line_idx = l_idx
                break
        if toc_page is not None:
            break

    if toc_page is None or toc_line_idx is None:
        return []

    entries: list[tuple[str, int]] = []
    for ln in pages[toc_page][toc_line_idx + 1 :]:
        match = _TOC_ENTRY_RE.match(ln.strip())
        if not match:
            continue
        title = match.group("title").strip().rstrip(".").strip()
        try:
            page_no = int(match.group("page"))
        except ValueError:
            continue
        if not title:
            continue
        entries.append((title, page_no))

    return entries


def _find_heading_anchor(*, pages: list[list[str]], title: str, page_no: int) -> tuple[int, int, str]:
    norm_title = _normalize_heading(title)
    start_page = max(0, min(len(pages) - 1, page_no - 1))

    for p_idx in range(start_page, min(len(pages), start_page + 3)):
        for l_idx, ln in enumerate(pages[p_idx]):
            s = ln.strip()
            if not s:
                continue
            if _TOC_ENTRY_RE.match(s):
                continue
            if _normalize_heading(s) == norm_title:
                return p_idx, l_idx, s
    return start_page, 0, title


def _slice_pages(
    *, pages: list[list[str]], start: tuple[int, int], end: tuple[int, int] | None
) -> list[str]:
    start_page, start_line = start
    if end is None:
        end_page, end_line = len(pages), 0
    else:
        end_page, end_line = end

    out: list[str] = []
    for p_idx in range(start_page, min(end_page + 1, len(pages))):
        lines = pages[p_idx]
        lo = start_line if p_idx == start_page else 0
        hi = end_line if (end is not None and p_idx == end_page) else len(lines)
        out.extend(lines[lo:hi])
        out.append("")
    return out


def _extract_sections_idc_business_value(source_text: str) -> list[_SourceSection]:
    raw_pages = source_text.split("\f")
    if not raw_pages:
        return []

    cover_lines = [ln.rstrip() for ln in raw_pages[0].splitlines()]
    cover_title = next(
        (
            ln.strip()
            for ln in cover_lines
            if ln.strip()
            and "Business Value White Paper" not in ln
            and "THIS PDF USES" not in ln
            and "|" not in ln
        ),
        None,
    )
    if not cover_title:
        cover_title, _ = _parse_title_block(cover_lines)

    try:
        title_idx = next(i for i, ln in enumerate(cover_lines) if ln.strip() == cover_title)
    except StopIteration:
        title_idx = 0
    cover_body = "\n".join([ln for ln in cover_lines[title_idx + 1 :] if ln.strip()]).strip()

    pages = [_idc_clean_lines([ln for ln in pg.splitlines()]) for pg in raw_pages]

    sections: list[_SourceSection] = [_SourceSection(title=cover_title, body=cover_body, why_it_matters=None)]

    toc_entries = _extract_idc_toc_entries(pages)
    if toc_entries:
        toc_lines = ["- " + t for t, _p in toc_entries]
        sections.append(_SourceSection(title="Table of Contents", body="\n".join(toc_lines), why_it_matters=None))

    anchors: list[tuple[str, tuple[int, int], str]] = []
    for title, page_no in toc_entries:
        p_idx, l_idx, found_title = _find_heading_anchor(pages=pages, title=title, page_no=page_no)
        anchors.append((found_title, (p_idx, l_idx), title))

    # De-duplicate anchors that point to the same place (can happen with repeating headers).
    uniq: list[tuple[str, tuple[int, int], str]] = []
    seen: set[tuple[int, int, str]] = set()
    for found_title, anchor, toc_title in anchors:
        key = (anchor[0], anchor[1], _normalize_heading(toc_title))
        if key in seen:
            continue
        seen.add(key)
        uniq.append((found_title, anchor, toc_title))

    for idx, (found_title, anchor, toc_title) in enumerate(uniq):
        next_anchor = uniq[idx + 1][1] if idx + 1 < len(uniq) else None
        lines = _slice_pages(pages=pages, start=anchor, end=next_anchor)
        # Drop the heading line itself if it repeats at the start.
        while lines and not lines[0].strip():
            lines.pop(0)
        if lines and _normalize_heading(lines[0].strip()) == _normalize_heading(found_title):
            lines.pop(0)
        body = "\n".join(lines).strip()
        sections.append(_SourceSection(title=found_title or toc_title, body=body, why_it_matters=None))

    return sections


def _has(text: str, *needles: str) -> bool:
    lowered = text.lower()
    return any(n.lower() in lowered for n in needles)


def _extract_code_block(body: str) -> str | None:
    lines = [ln.rstrip() for ln in body.splitlines()]
    start = next((i for i, ln in enumerate(lines) if ln.strip().startswith("{")), None)
    if start is None:
        return None
    end = None
    for i in range(start, len(lines)):
        if lines[i].strip() == "}":
            end = i
            break
    if end is None:
        return None
    return "\n".join(lines[start : end + 1])


def _extract_access_report(body: str) -> str | None:
    if "Gemini Access Report" not in body:
        return None

    lines = [ln.strip() for ln in body.splitlines() if ln.strip()]
    try:
        idx = lines.index("Gemini Access Report")
    except ValueError:
        return None

    rows: list[list[str]] = []
    for ln in lines[idx + 1 : idx + 8]:
        if "@" not in ln:
            continue
        parts = [p for p in re.split(r"\s{2,}", ln) if p]
        if len(parts) >= 5:
            rows.append(parts[:5])

    if not rows:
        return None

    header = ["Name", "Email", "License Assigned", "Last Active", "Last Detected Scan"]
    out = [
        "### Gemini Access Report",
        "",
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    for r in rows:
        out.append("| " + " | ".join(r) + " |")
    return "\n".join(out)


def _extract_quiz(body: str) -> str | None:
    if "Quiz" not in body:
        return None
    if "What must you do" not in body and "What must you do if you want access" not in body:
        return None

    question_match = re.search(r"(What must you do[^\n\r]+)", body)
    question = question_match.group(1).strip() if question_match else "Quiz"

    options = []
    for ln in body.splitlines():
        stripped = ln.strip(" >•\t")
        if stripped.startswith("Include") or stripped.startswith("Install") or stripped.startswith("Download"):
            options.append(stripped)

    if not options:
        return None

    out = ["### Quiz", "", f"**{question}**", ""]
    out.extend([f"- {opt}" for opt in options])
    return "\n".join(out)


def _extract_form(body: str) -> str | None:
    if "Code Assistant Access Request Form" not in body:
        return None
    return "\n".join(
        [
            "### Code Assistant Access Request Form",
            "",
            "- Upload a screenshot showing the security IDE plugin is installed for local testing.",
            "- Provide any additional context on the request.",
            "- Attest that the AI coding assistant will be used in conjunction with local scanning.",
        ]
    )


def _slugify(value: str) -> str:
    cleaned = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-")
    cleaned = re.sub(r"-{2,}", "-", cleaned)
    return cleaned.upper() if cleaned else "UNKNOWN"
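
# Example: _slugify("Prompt Injection (LLM01)") == "PROMPT-INJECTION-LLM01";
# input with no alphanumerics collapses to "UNKNOWN".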


def _inferred_mermaid(title: str, *, ctx: _RenderContext) -> str | None:
    title_upper = title.upper()

    if "BUSINESS VALUE" in title_upper or "ROI" in title_upper:
        variants = [
            """flowchart TD
A["Sponsor narrative"] --> B["Business value model"]
B --> C["Executive buy-in"]
C --> D["Rollout project"]
D --> E["Evidence artifacts produced"]
E --> F["Renewal discussion"]
F --> G["KPI trend deck"]
G --> C
""",
            """flowchart TD
A["Baseline (unknown)"] --> B["ROI spreadsheet"]
B --> C["Assumptions: optimistic"]
C --> D["Rollout work"]
D --> E["Exceptions + manual steps"]
E --> F["Metric redefinition"]
F --> B
""",
            """flowchart TD
A["Procurement decision"] --> B["Implementation project"]
B --> C["Evidence automation"]
C --> D["Audit season"]
D --> E["Renewal negotiation"]
E --> F["Success story deck"]
F --> A
""",
            """flowchart TD
A["Executive sponsor"] --> B["Quarterly update deck"]
B --> C["KPI trend (directional)"]
C --> D["Roadmap refresh"]
D --> E["Pilot expansion"]
E --> B
""",
        ]
        return ctx.pick_unique(kind="diagram:roi", key=title, variants=variants, used=ctx.used_diagrams)

    if "COMPLIANCE" in title_upper or "AUDIT" in title_upper:
        variants = [
            """flowchart TD
A["Control requirement"] --> B["Evidence requested"]
B --> C["Artifact gathered"]
C --> D["Review meeting"]
D --> E{Approved?}
E -->|Yes| F["Audit satisfied"]
E -->|No| G["Remediation plan"]
G --> D
""",
            """flowchart TD
A["Control requirement"] --> B["Evidence request"]
B --> C["Screenshot collected"]
C --> D["Shared drive folder"]
D --> E["Checklist satisfied"]
E --> F["Exceptions accumulate"]
F --> B
""",
            """flowchart TD
A["Quarter begins"] --> B["Evidence scramble"]
B --> C["Spreadsheet status"]
C --> D["Steering committee"]
D --> E["Audit passed"]
E --> F["Backlog deferred"]
F --> A
""",
        ]
        return ctx.pick_unique(kind="diagram:audit", key=title, variants=variants, used=ctx.used_diagrams)

    if "THIRD-PARTY" in title_upper or "VENDOR" in title_upper:
        variants = [
            """flowchart TD
A["Vendor onboarding"] --> B["Questionnaire"]
B --> C["Evidence chase"]
C --> D["Risk rating"]
D --> E{Exception?}
E -->|Yes| F["Accepted with notes"]
E -->|No| G["Blocked pending controls"]
F --> H["Renewal cycle"]
G --> H
""",
            """flowchart TD
A["Vendor intake"] --> B["Security questionnaire"]
B --> C["Vendor sends PDF"]
C --> D["Internal interpretation"]
D --> E["Legal review"]
E --> F{Decision}
F -->|Accept| G["Approved with exceptions"]
F -->|Block| H["Deferred to next quarter"]
G --> I["Renewal"]
H --> I
""",
            """flowchart TD
A["Business wants tool"] --> B["Vendor risk rating"]
B --> C{High risk?}
C -->|Yes| D["Exception workflow"]
C -->|No| E["Standard approval"]
D --> F["Compensating controls"]
F --> E
E --> G["Onboarding complete"]
G --> H["Usage begins"]
H --> I["Reassessment"]
I --> B
""",
        ]
        return ctx.pick_unique(kind="diagram:third_party", key=title, variants=variants, used=ctx.used_diagrams)

    if title_upper.startswith("LLM01") or "PROMPT INJECTION" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm01",
            key=title,
            variants=[
                """flowchart TD
A["Attacker prompt"] --> B["LLM prompt parser"]
B --> C["System prompt + tools"]
C --> D["Model follows injected instruction"]
D --> E["Unsafe action or data exposure"]
E --> F["Incident review meeting"]
F --> G["Policy update: scheduled"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM02") or "SENSITIVE INFORMATION" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm02",
            key=title,
            variants=[
                """flowchart TD
A["User asks a question"] --> B["LLM retrieves context"]
B --> C["Hidden secret present in context"]
C --> D["Model outputs secret"]
D --> E["Screenshot captured for compliance"]
E --> F["Access remains enabled"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM03") or "SUPPLY CHAIN" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm03",
            key=title,
            variants=[
                """flowchart TD
A["Upstream model or dependency"] --> B["Pulled into build"]
B --> C["Trusted by default"]
C --> D["Compromise introduced"]
D --> E["Shipped to production"]
E --> F["Vendor asks for logs"]
F --> G["We align on next steps"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM04") or "POISONING" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm04",
            key=title,
            variants=[
                """flowchart TD
A["Attacker data"] --> B["Training or fine-tune"]
B --> C["Model behavior shifts"]
C --> D["Bad outputs in production"]
D --> E["Root cause: unclear"]
E --> F["New dataset review committee"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM05") or "OUTPUT HANDLING" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm05",
            key=title,
            variants=[
                """flowchart TD
A["LLM generates output"] --> B["Output treated as trusted"]
B --> C["Downstream system executes or renders"]
C --> D["Injection hits a sink"]
D --> E["Hotfix + postmortem"]
E --> F["Guardrail doc updated"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM06") or "EXCESSIVE AGENCY" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm06",
            key=title,
            variants=[
                """flowchart TD
A["User goal"] --> B["Agent plans steps"]
B --> C["Tool access granted"]
C --> D["Action executed"]
D --> E["Unexpected side effect"]
E --> F["Exception request filed"]
F --> C
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM07") or "PROMPT LEAKAGE" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm07",
            key=title,
            variants=[
                """flowchart TD
A["User prompt"] --> B["Model context window"]
B --> C["System prompt present"]
C --> D["Leak via output or tool call"]
D --> E["Prompt rotated quarterly"]
E --> C
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM08") or "VECTOR" in title_upper or "EMBEDDING" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm08",
            key=title,
            variants=[
                """flowchart TD
A["Documents ingested"] --> B["Embeddings store"]
B --> C["Retriever selects chunks"]
C --> D["Injected chunk included"]
D --> E["LLM follows malicious context"]
E --> F["We add a filter later"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM09") or "MISINFORMATION" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm09",
            key=title,
            variants=[
                """flowchart TD
A["Model output"] --> B["Looks confident"]
B --> C["Decision made"]
C --> D["Outcome fails"]
D --> E["Retroactive citations requested"]
E --> F["Alignment session"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("LLM10") or "UNBOUNDED CONSUMPTION" in title_upper:
        return ctx.pick_unique(
            kind="diagram:llm10",
            key=title,
            variants=[
                """flowchart TD
A["Request"] --> B["Tokens consumed"]
B --> C["Costs rise"]
C --> D["Rate limit suggested"]
D --> E["Exception granted"]
E --> B
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper.startswith("APPENDIX 1") or "ARCHITECTURE" in title_upper:
        is_llm_context = any(k in title_upper for k in ["LLM", "MODEL", "RAG", "PROMPT"])
        return ctx.pick_unique(
            kind="diagram:architecture",
            key=title,
            variants=[
                (
                    """flowchart TD
A["User"] --> B["App"]
B --> C["LLM"]
C --> D["Tools"]
C --> E["RAG store"]
D --> F["External systems"]
E --> C
"""
                    if is_llm_context
                    else """flowchart TD
A["Actor"] --> B["Workflow / system"]
B --> C["Policy decision (rules)"]
C --> D["Gate: enforce / block"]
D --> E["Evidence signals (logs)"]
E --> F["Audit / review cycle"]
F --> C
"""
                )
            ],
            used=ctx.used_diagrams,
        )

    if "PULL REQUEST" in title_upper:
        return ctx.pick_unique(
            kind="diagram:pull_request",
            key=title,
            variants=[
                """flowchart TD
A["Code change"] --> B["Pull request opened"]
B --> C["Automated scan: PR checks"]
C --> D{Findings?}
D -->|None| E["Merge"]
D -->|Some| F["Ticket created"]
F --> G["Exception request"]
G --> H["Alignment session"]
H --> I["Risk accepted: documented"]
I --> E
"""
            ],
            used=ctx.used_diagrams,
        )

    if "SHIFTING LEFT" in title_upper:
        return ctx.pick_unique(
            kind="diagram:shift_left",
            key=title,
            variants=[
                """flowchart TD
A["Developer writes code"] --> B["IDE scan: local"]
B --> C{Issue found?}
C -->|Yes| D["Fix now"]
C -->|No| E["Commit"]
E --> F["PR checks"]
A --> G["Agent workflow"]
G --> H["Local MCP scan"]
H --> E
"""
            ],
            used=ctx.used_diagrams,
        )

    if "REQUEST EVIDENCE" in title_upper:
        return ctx.pick_unique(
            kind="diagram:request_evidence",
            key=title,
            variants=[
                """flowchart TD
A["Developer requests access"] --> B["Upload screenshot"]
B --> C["Attestation captured"]
C --> D["Access enabled"]
D --> E["Local testing: claimed"]
E --> F["Periodic audit"]
F --> G{Still compliant?}
G -->|Yes| D
G -->|No| H["Access paused pending review"]
H --> I["Alignment session"]
I --> D
"""
            ],
            used=ctx.used_diagrams,
        )

    if title_upper == "AUDIT":
        return ctx.pick_unique(
            kind="diagram:audit_generic",
            key=title,
            variants=[
                """flowchart TD
A["Collect usage signals"] --> B["Correlate assistants vs scans"]
B --> C["Identify gaps"]
C --> D["Notify developers"]
D --> E["Remediation window"]
E --> F["Dashboard update"]
F --> G["Quarterly KPI trend review"]
G --> H["Action items: optional"]
"""
            ],
            used=ctx.used_diagrams,
        )

    if "TRAINING" in title_upper:
        return ctx.pick_unique(
            kind="diagram:training",
            key=title,
            variants=[
                """flowchart TD
A["Onboarding"] --> B["Training module"]
B --> C["Quiz"]
C --> D{Pass?}
D -->|Yes| E["Certificate issued"]
D -->|No| F["Retake scheduled"]
E --> G["Access request approved"]
G --> H["Usage begins"]
H --> I["Refresher cadence"]
I --> B
"""
            ],
            used=ctx.used_diagrams,
        )

    if "ACCESS CONTROL" in title_upper:
        return ctx.pick_unique(
            kind="diagram:access_control",
            key=title,
            variants=[
                """flowchart TD
A["Policy defined"] --> B["Endpoint management"]
B --> C{Prerequisites met?}
C -->|Yes| D["Assistant enabled"]
C -->|No| E["Blocked by policy"]
E --> F["Exception request"]
F --> G["Owner approval"]
G --> D
"""
            ],
            used=ctx.used_diagrams,
        )

    if "PATH FOR" in title_upper:
        return ctx.pick_unique(
            kind="diagram:path_forward",
            key=title,
            variants=[
                """flowchart TD
A["Desire: secure innovation"] --> B["Guardrails planned"]
B --> C["Pilot cohort"]
C --> D["Deck + FAQ"]
D --> E["Stakeholder alignment"]
E --> F["Incremental rollout"]
F --> G["Measure adoption"]
G --> H["Reframe as iteration"]
H --> E
"""
            ],
            used=ctx.used_diagrams,
        )
if ctx.locale.lower().startswith("fr"):
|
||
title_clean = re.sub(r"\s{2,}", " ", title.strip())
|
||
title_safe = title_clean.replace('"', "'")
|
||
m = re.match(r"^(?P<num>\d{2})\s*(?:—|-)?\s*(?P<rest>.*)$", title_clean)
|
||
sec = m.group("num") if m else None
|
||
|
||
if sec == "01" or "RÉFORME" in title_upper or "FACTURATION" in title_upper:
|
||
return f"""flowchart TD
|
||
A["Réforme 2026"] --> B["Facture au format Factur-X"]
|
||
B --> C["Transmission via PDP/PPF"]
|
||
C --> D["Statuts: acceptée / rejetée / payée"]
|
||
D --> E["Archivage: 10 ans"]
|
||
E --> F["Contrôle / audit"]
|
||
F --> G["Comité de pilotage"]
|
||
G --> A
|
||
"""
|
||
|
||
if sec == "02" or "RISQU" in title_upper or "N’ANTICIP" in title_upper or "N'ANTICIP" in title_upper:
|
||
return f"""flowchart TD
|
||
A["Outils fragmentés"] --> B["Ressaisies"]
|
||
B --> C["Erreurs"]
|
||
C --> D["Retard de facturation"]
|
||
D --> E["Retard d'encaissement"]
|
||
E --> F["Tension de trésorerie"]
|
||
F --> G["Pression interne"]
|
||
G --> A
|
||
"""
|
||
|
||
if sec == "03" or "SANS ERP" in title_upper:
|
||
return f"""flowchart TD
|
||
A["Excel / outil simple"] --> B["Conversion Factur-X"]
|
||
B --> C["Connexion à une PDP"]
|
||
C --> D["Suivi manuel des statuts"]
|
||
D --> E["Archivage légal séparé"]
|
||
E --> F["Outils + abonnements en plus"]
|
||
F --> G["Sources d'erreurs"]
|
||
G --> A
|
||
"""
|
||
|
||
if sec == "04" or "ERP" in title_upper:
|
||
return f"""flowchart TD
|
||
A["Temps + projets"] --> B["Facturation"]
|
||
B --> C["Transmission via PDP"]
|
||
C --> D["Statuts automatiques"]
|
||
D --> E["Archivage conforme"]
|
||
E --> F["Marge projet"]
|
||
F --> G["Pilotage"]
|
||
G --> A
|
||
"""
|
||
|
||
if sec == "05" or "AUTO" in title_upper or "DIAGNOSTIC" in title_upper or "GAGNEZ" in title_upper:
|
||
return f"""flowchart TD
|
||
A["Automatisation"] --> B["Coût unitaire en baisse"]
|
||
B --> C["Visibilité: marges + encaissements"]
|
||
C --> D["Décisions"]
|
||
D --> E["Processus stabilisé"]
|
||
E --> F["Conformité durable"]
|
||
F --> A
|
||
"""
|
||
|
||
if sec == "06" or "PLAN D'ACTION" in title_upper or "PLAN D’ACTION" in title_upper:
|
||
return f"""flowchart TD
|
||
A["Diagnostic"] --> B["Flux critiques"]
|
||
B --> C["Cartographie des outils"]
|
||
C --> D["Environnement unique"]
|
||
D --> E["Formation + adoption"]
|
||
E --> F["Prêt avant l'échéance"]
|
||
"""
|
||
|
||
return f"""flowchart TD
|
||
A["Section: {title_safe}"] --> B["Alignement des parties prenantes"]
|
||
B --> C["Décision: à qualifier"]
|
||
C --> D["Plan: à socialiser"]
|
||
D --> E["Revue: au prochain comité"]
|
||
E --> B
|
||
"""
|
||
|
||
return None
|
||
|
||
|
||
def _render_inferred_diagram(title: str, *, ctx: _RenderContext) -> str | None:
|
||
diagram = _inferred_mermaid(title, ctx=ctx)
|
||
if not diagram:
|
||
return None
|
||
heading = (
|
||
"### Schéma InfraFabric Red Team (inféré)"
|
||
if ctx.locale.lower().startswith("fr")
|
||
else "### InfraFabric Red Team Diagram (Inferred)"
|
||
)
|
||
return "\n".join(
|
||
[
|
||
heading,
|
||
"",
|
||
"```mermaid",
|
||
diagram.rstrip(),
|
||
"```",
|
||
]
|
||
)
def _render_dave_factor_callout(section: _SourceSection, *, ctx: _RenderContext) -> str | None:
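    """Return a Markdown "Dave Factor" callout for a section, or None.

    Variant selection keys off title keywords (ROI, audit, third-party, the
    OWASP LLM01-LLM10 entries, and workflow topics), with locale-specific
    fallbacks; reference-only or empty sections yield no callout.
    """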
title_upper = section.title.upper()
|
||
excerpt = f"{section.title}\n{section.why_it_matters or ''}\n{section.body}".strip()
|
||
|
||
if "BUSINESS VALUE" in title_upper or "ROI" in title_upper:
|
||
variants = [
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** The ROI model becomes the control, and the control becomes the explanation for why reality must align to the spreadsheet.",
|
||
"> **Countermeasure:** Define baseline metrics, instrument time-to-evidence, and set stop conditions for exceptions and manual work.",
|
||
"> The problem isn't ROI. The problem is ROI quietly becoming the approval mechanism for work that never gets decommissioned.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
'> **The Dave Factor:** The metric becomes the mission, and the mission becomes "protect the number" once reality diverges.',
|
||
'> **Countermeasure:** Publish assumptions, define "done", and require evidence that automation replaced work (not just moved it).',
|
||
"> This is not a metrics problem. This is a governance problem: once the number exists, the org optimizes for the number.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
'> **The Dave Factor:** "Payback in 3 months" becomes a deadline for narrative, not delivery, so we measure what ships and call it impact.',
|
||
"> **Countermeasure:** Time-box pilots, set exit criteria, and make renewals contingent on measured outcomes (not sentiment).",
|
||
"> The failure is not payback. The failure is narrative deadlines replacing delivery, with a dashboard standing in for evidence.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** ROI turns into Return on Inaction: the spreadsheet is used to justify not touching the legacy process.",
|
||
"> **Countermeasure:** Put an owner on decommissioning manual steps, and make exception expiry automatic and enforced.",
|
||
"> The problem isn't the legacy process. The problem is declaring it \"heritage\" the moment removal would require ownership.",
|
||
]
|
||
),
|
||
]
|
||
callout = ctx.pick_unique(kind="callout:roi", key=section.title, variants=variants, used=ctx.used_callouts)
|
||
return _daveify_callout_reframe(callout, ctx=ctx, key=section.title)
|
||
if "COMPLIANCE" in title_upper or "AUDIT" in title_upper:
|
||
variants = [
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Evidence collection becomes the product, and the product becomes a shared drive with strong opinions.",
|
||
"> **Countermeasure:** Make evidence machine-generated, time-bounded, and verifiable (with owners and expiry).",
|
||
'> The problem isn\'t collecting evidence. The problem is evidence that requires a guided tour and a Slack thread to "interpret" in context.',
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Evidence becomes a museum: artifacts are preserved forever, but control effectiveness is optional.",
|
||
"> **Countermeasure:** Prefer telemetry over screenshots; make evidence time-bounded and continuously sampled.",
|
||
"> The problem isn't retention. The problem is artifacts outliving the controls they were supposed to prove.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Audit readiness becomes a seasonal sport; we optimize for the week before the auditor arrives.",
|
||
"> **Countermeasure:** Automate evidence generation, alert on drift, and treat missing signals as a stop condition.",
|
||
"> The problem isn't the audit. The problem is treating audit week as the only time the system is allowed to be real.",
|
||
]
|
||
),
|
||
]
|
||
callout = ctx.pick_unique(kind="callout:audit", key=section.title, variants=variants, used=ctx.used_callouts)
|
||
return _daveify_callout_reframe(callout, ctx=ctx, key=section.title)
|
||
if "THIRD-PARTY" in title_upper:
|
||
variants = [
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Third-party risk becomes a questionnaire supply chain, where the slowest vendor defines your security posture.",
|
||
"> **Countermeasure:** Standardize evidence requests and automate reminders, while enforcing a clear accept/block decision path.",
|
||
'> The problem is not vendor risk. The problem is nobody owns the "no" when revenue needs a "yes" on paper.',
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Third-party risk becomes hot-potato routing: everyone can forward the form, nobody can block the deal.",
|
||
"> **Countermeasure:** Define a single decision owner and a binary accept/block path with documented compensating controls.",
|
||
"> This is not a paperwork problem. It is a decision problem: the form moves, the authority doesn't.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Vendor assurance becomes vendor storytelling, and we rate the PDF instead of the reality.",
|
||
"> **Countermeasure:** Standardize evidence types, require freshness, and make renewals contingent on verified controls.",
|
||
"> The problem isn't the PDF. The problem is we grade the PDF because the implementation is not observable.",
|
||
]
|
||
),
|
||
]
|
||
callout = ctx.pick_unique(
|
||
kind="callout:third_party", key=section.title, variants=variants, used=ctx.used_callouts
|
||
)
|
||
return _daveify_callout_reframe(callout, ctx=ctx, key=section.title)
|
||
|
||
if title_upper.startswith("LLM01") or "PROMPT INJECTION" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** The prompt becomes the policy, and the policy becomes a suggestion once customers start asking nicely.",
|
||
"> **Countermeasure:** Treat prompts as code: version them, test them, and gate tool-use behind explicit allowlists.",
|
||
"> The failure isn't a clever prompt. The failure is treating free-form text like an authorization layer.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM02") or "SENSITIVE INFORMATION" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Redaction becomes a meeting, and meetings are not a data loss prevention strategy.",
|
||
"> **Countermeasure:** Minimize secret exposure to the model, redact upstream, and add output filters with stop conditions.",
|
||
"> The problem isn't redaction. The problem is letting sensitive data reach the model and hoping policy will catch up later.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM03") or "SUPPLY CHAIN" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** We inherit risk at the speed of `pip install` while accountability ships quarterly.",
|
||
"> **Countermeasure:** Pin + verify artifacts, require SBOMs, and make provenance a merge gate, not a slide.",
|
||
"> This is not a dependency problem. It is a provenance problem: if you can't verify it, you can't ship it.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM04") or "POISONING" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Training data is treated as a vibe, so model drift is treated as a surprise.",
|
||
"> **Countermeasure:** Track dataset lineage, add poisoning checks, and keep rollback paths for fine-tunes.",
|
||
"> The problem isn't drift. The problem is unowned datasets quietly becoming production behavior.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM05") or "OUTPUT HANDLING" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** The model output is interpreted as intent, and intent is treated as authorization.",
|
||
"> **Countermeasure:** Validate and constrain outputs before execution; never treat free-form text as a command.",
|
||
'> The failure is not "bad output." The failure is wiring output to action without a gate that can say "no" in production.',
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM06") or "EXCESSIVE AGENCY" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Agents are given keys because it demos well, and later we discover the locks were optional.",
|
||
"> **Countermeasure:** Least privilege for tools, human confirmation for irreversible actions, and hard spend limits.",
|
||
"> The problem isn't autonomy. The problem is giving autonomy a budget, credentials, and no stop switch.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM07") or "PROMPT LEAKAGE" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** We call it a secret because it feels better than calling it user-visible configuration.",
|
||
"> **Countermeasure:** Assume prompts leak; move secrets out of prompts and verify outputs for prompt fragments.",
|
||
"> This is not a prompt problem. It is a secret-management problem that happens to be written in English.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM08") or "VECTOR" in title_upper or "EMBEDDING" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** RAG becomes \"trust the nearest chunk,\" which is a governance model with a memory problem.",
|
||
"> **Countermeasure:** Sanitize ingestion, filter retrieval, and sign/score sources so bad context can't masquerade as truth.",
|
||
"> The failure isn't retrieval. The failure is untrusted context being allowed to vote on reality.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM09") or "MISINFORMATION" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Confidence is mistaken for correctness, and correctness is postponed until after shipment.",
|
||
"> **Countermeasure:** Require citations, add verification checks, and gate decisions on evidence rather than tone.",
|
||
"> The problem isn't tone. The problem is decisions being made without a verification path an adversary would accept.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if title_upper.startswith("LLM10") or "UNBOUNDED CONSUMPTION" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Cost overruns are reframed as \"unexpected adoption,\" which is how budgets die politely.",
|
||
"> **Countermeasure:** Rate limit, cap tokens, and make spend alerts actionable (with enforced cutoffs).",
|
||
'> This is not a pricing problem. It is an absence-of-limits problem, disguised as "growth" in the monthly deck.',
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
|
||
if "PULL REQUEST" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Exceptions become the default pathway, because the policy is strict and the deadline is real.",
|
||
"> **Countermeasure:** Define merge-blocking thresholds, time-box every exception, and make expiry automatic.",
|
||
"> The problem isn't exceptions. The problem is exceptions without expiry quietly becoming the policy.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if "SHIFTING LEFT" in title_upper:
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
'> **The Dave Factor:** "Shift left" becomes "optional left," which means the same issues arrive later with better excuses.',
|
||
"> **Countermeasure:** Gate on local scan signals where possible (or require attestations that are actually checked).",
|
||
"> The failure isn't late detection. The failure is a workflow with no enforced moment where risk can be stopped.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if "REQUEST EVIDENCE" in title_upper or _has(excerpt, "screenshot", "attestation"):
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Screenshots are compliance theater: easy to collect, hard to verify, and immortal in shared drives.",
|
||
"> **Countermeasure:** Prefer verifiable telemetry (scan events) over images, and pause access when signals go dark.",
|
||
"> The problem isn't local testing. The problem is the screenshot becomes the control, and the test becomes a vibe.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if "AUDIT" in title_upper or _has(excerpt, "usage reports", "periodic audits"):
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Dashboards become a KPI trend, and KPIs become a calendar invite.",
|
||
"> **Countermeasure:** Tie the dashboard to explicit SLOs and a remediation loop with owners and deadlines.",
|
||
"> The problem isn't reporting. The problem is dashboards replacing decisions, because decisions create liability.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if "TRAINING" in title_upper or _has(excerpt, "snyk learn", "quiz"):
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Completion certificates are treated as controls, even when behavior doesn't change.",
|
||
"> **Countermeasure:** Add a practical gate (local scan + PR checks) so training is support, not the defense.",
|
||
"> The problem isn't education. The problem is treating completion as enforcement while the workflow stays permissive.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if "ACCESS CONTROL" in title_upper or _has(excerpt, "endpoint management", "prerequisites", "extensions"):
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
'> **The Dave Factor:** Access controls drift into "enablement," and enablement drifts into "we made a wiki."',
|
||
"> **Countermeasure:** Make prerequisites machine-checkable and make exceptions expire by default.",
|
||
'> The problem is not access. The problem is "enablement" becoming the polite word for bypass.',
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
if _has(title_upper, "PATH FORWARD") or _has(excerpt, "secure innovation", "talk to our team"):
|
||
return _daveify_callout_reframe(
|
||
"\n".join(
|
||
[
|
||
'> **The Dave Factor:** Pilots persist indefinitely because "graduation criteria" were never aligned.',
|
||
"> **Countermeasure:** Publish rollout milestones and a stop condition that cannot be reframed as iteration.",
|
||
"> The problem isn't pilots. The problem is pilots becoming a standing meeting with no exit criteria.",
|
||
]
|
||
),
|
||
ctx=ctx,
|
||
key=section.title,
|
||
)
|
||
|
||
if ctx.locale.lower().startswith("fr"):
|
||
anchors = _extract_numeric_anchors(section.body, limit=4)
|
||
anchor = ""
|
||
for candidate in anchors:
|
||
# Skip citation years in callouts; they read like hallucinated trivia.
|
||
if re.fullmatch(r"20\d{2}", candidate):
|
||
continue
|
||
anchor = candidate
|
||
break
|
||
anchor_hint = f" ({anchor})" if anchor else ""
|
||
variants = [
|
||
"\n".join(
|
||
[
|
||
"> **Le facteur Dave :** La conformité devient un calendrier, et le calendrier devient la preuve.",
|
||
"> **Contre-mesure :** Transformer l’obligation en portes de contrôle (émission, transmission, statut, archivage) avec un responsable et une échéance.",
|
||
f"> Le problème n’est pas le format. Le problème, c’est l’absence de portes de contrôle opposables{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **Le facteur Dave :** La multiplication des outils est présentée comme de l’agilité, jusqu’au jour où elle devient un coût fixe.",
|
||
"> **Contre-mesure :** Réduire le nombre de points de saisie et imposer une source de vérité, avec des exceptions à durée limitée.",
|
||
f"> Le problème n’est pas l’ERP. Le problème, c’est la dispersion du référentiel{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **Le facteur Dave :** On « suit les statuts » tant que cela se résume à un tableur et une réunion de fin de mois.",
|
||
"> **Contre-mesure :** Instrumenter les statuts et bloquer l’étape suivante quand un signal manque, plutôt que de le commenter.",
|
||
f"> Le problème n’est pas le suivi. Le problème, c’est l’absence de signal qui bloque réellement{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **Le facteur Dave :** L’archivage devient un dossier partagé, et le dossier partagé devient la politique.",
|
||
"> **Contre-mesure :** Exiger un archivage automatique, traçable et borné dans le temps, avec contrôle de complétude.",
|
||
f"> Le problème n’est pas l’archivage. Le problème, c’est l’impossibilité de prouver rapidement{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **Le facteur Dave :** Le « plan d’action » devient une diapositive, et la diapositive devient une décision.",
|
||
"> **Contre-mesure :** Fixer des étapes, chacune avec un livrable et un critère de sortie non négociable.",
|
||
f"> Le problème n’est pas le plan. Le problème, c’est l’absence de critères de sortie{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **Le facteur Dave :** On appelle cela « pilotage » quand il s’agit surtout de préserver la sérénité des parties prenantes.",
|
||
"> **Contre-mesure :** Choisir un indicateur, une fréquence, et un responsable qui peut dire « non » sans attendre le prochain comité.",
|
||
f"> Le problème n’est pas la rentabilité. Le problème, c’est la responsabilité qui se dissout{anchor_hint}.",
|
||
]
|
||
),
|
||
]
|
||
callout = ctx.pick_unique(kind="callout:fr:fallback", key=section.title, variants=variants, used=ctx.used_callouts)
|
||
return _daveify_callout_reframe(callout, ctx=ctx, key=section.title)
|
||
|
||
if "REFERENCE LINKS" in title_upper or title_upper.strip() in {"REFERENCES", "REFERENCE"}:
|
||
return None
|
||
if not section.body.strip():
|
||
return None
|
||
|
||
anchors = _extract_numeric_anchors(excerpt, limit=4)
|
||
anchor = ""
|
||
for candidate in anchors:
|
||
# Skip citation years in callouts; they read like hallucinated trivia.
|
||
if re.fullmatch(r"20\d{2}", candidate):
|
||
continue
|
||
anchor = candidate
|
||
break
|
||
anchor_hint = f" ({anchor})" if anchor else ""
|
||
variants = [
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** The plan becomes the status update, and the status update becomes the plan.",
|
||
"> **Countermeasure:** Name one owner, one gate, and one stop condition that blocks, not \"raises awareness.\"",
|
||
f"> The problem isn't intent. The problem is intent without an enforceable gate{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** The checklist becomes a mood board: everyone agrees, nothing blocks.",
|
||
"> **Countermeasure:** Make evidence machine-checkable, and make exceptions expire by default.",
|
||
f"> The problem isn't policy. The problem is policy that can't say \"no\" in CI{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** Alignment sessions become the control, because controls create accountability.",
|
||
"> **Countermeasure:** Replace meetings with automated gates and a remediation loop with owners and deadlines.",
|
||
f"> The problem isn't alignment. The problem is alignment replacing enforcement{anchor_hint}.",
|
||
]
|
||
),
|
||
"\n".join(
|
||
[
|
||
"> **The Dave Factor:** We treat risk as a slide, so it behaves like a slide: it moves when the deck moves.",
|
||
"> **Countermeasure:** Tie risk acceptance to an owner, an expiry date, and a verifier step.",
|
||
f"> The problem isn't risk. The problem is risk without expiry{anchor_hint}.",
|
||
]
|
||
),
|
||
]
|
||
callout = ctx.pick_unique(kind="callout:en:fallback", key=section.title, variants=variants, used=ctx.used_callouts)
|
||
return _daveify_callout_reframe(callout, ctx=ctx, key=section.title)
|
||
|
||
|
||
def _render_punchline_closer(section: _SourceSection, *, ctx: _RenderContext) -> str | None:
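    """Return a one-line closing punchline for a section, or None.

    Skips boilerplate sections (table of contents, license, revision history,
    sponsors) and empty bodies; the punchline may be anchored to a numeric
    value extracted from the section body.
    """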
title_upper = section.title.strip().upper()
|
||
if title_upper in {"TABLE OF CONTENTS", "LICENSE AND USAGE", "REVISION HISTORY", "PROJECT SPONSORS"}:
|
||
return None
|
||
if not section.body.strip():
|
||
return None
|
||
|
||
anchors = _extract_numeric_anchors(f"{section.why_it_matters or ''}\n{section.body}".strip(), limit=4)
|
||
anchor = ""
|
||
for candidate in anchors:
|
||
# Avoid anchoring punchlines to random citation years unless the year is actually part of the section title.
|
||
if re.fullmatch(r"20\d{2}", candidate) and candidate not in section.title:
|
||
continue
|
||
anchor = candidate
|
||
break
|
||
anchor_hint = f" ({anchor})" if anchor else ""
|
||
|
||
if ctx.locale.lower().startswith("fr"):
|
||
variants = [
|
||
f"Nous allons être totalement alignés sur le résultat{anchor_hint}, jusqu’au moment où la première exception devient le processus par défaut.",
|
||
f"Si le calendrier est le livrable{anchor_hint}, alors le risque est déjà en production — et la preuve, elle, reste en « phase deux ».",
|
||
f"Ce n’est pas une question d’outil{anchor_hint}. C’est une question de porte de contrôle : si rien ne bloque, tout finit par passer.",
|
||
f"Nous pouvons appeler cela « simplification »{anchor_hint} tant que cela tient dans une slide ; dès que c’est opposable, cela devient subitement « complexité ».",
|
||
]
|
||
tails = _DAVE_REFRAME_TAILS_FR
|
||
else:
|
||
variants = [
|
||
f"We will be perfectly aligned on the outcome{anchor_hint} right up until the first exception becomes the default workflow.",
|
||
f"If the calendar is the deliverable{anchor_hint}, then the risk is already in production — and the evidence is still in phase two.",
|
||
f"This is not a tooling problem{anchor_hint}. It's a gating problem: if nothing blocks, everything ships eventually.",
|
||
f"We can call it \"simplification\"{anchor_hint} as long as it fits on a slide; the moment it's enforceable, it becomes \"complexity.\"",
|
||
]
|
||
tails = _DAVE_REFRAME_TAILS
|
||
|
||
punchline = ctx.pick_unique(kind="punchline", key=section.title, variants=variants, used=ctx.used_punchlines)
|
||
tail = ctx.pick_unique(kind="reframe_tail", key=f"punchline:{section.title}", variants=tails, used=ctx.used_reframe_tails)
|
||
return _inject_plain_tail(punchline, tail)
|
||
|
||
|
||
def _render_intro(section: _SourceSection, *, ctx: _RenderContext) -> str:
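    """Render the locale-aware introduction: section heading, quoted tagline, and framing copy."""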
lines = [ln.strip() for ln in section.body.splitlines() if ln.strip()]
|
||
tagline = "\n".join(lines[:7]).strip() if lines else ""
|
||
|
||
out = [f"## {section.title}", ""]
|
||
if tagline:
|
||
out.extend([f"> {tagline}", ""])
|
||
|
||
if ctx.locale.lower().startswith("fr"):
|
||
out.extend(
|
||
[
|
||
"Nous saluons l’ambition et sommes directionnellement alignés avec l’idée d’aller vite, à condition de préserver le confort des parties prenantes, la déresponsabilisation élégante, et l’option stratégique de convoquer une réunion supplémentaire.",
|
||
"Le document-source pose une tension simple : lorsqu’on augmente le volume, on augmente aussi la surface d’erreur — ce n’est pas une question de morale, mais de débit.",
|
||
"En pratique, la voie utile consiste à transformer les intentions en mécanismes de travail par défaut (portes de contrôle, preuves, délais), tout en conservant un vocabulaire suffisamment rassurant pour rester politiquement déployable.",
|
||
"Autrement dit : nous pouvons aller vite et rester prudents, tant que « prudent » signifie « traçable » et que « vite » signifie « mis à l’agenda ».",
|
||
"",
|
||
"> **Note Red Team InfraFabric :** les éditeurs vendent la vitesse « sûre ». InfraFabric audite ce qui survit au contact de la bureaucratie.",
|
||
]
|
||
)
|
||
else:
|
||
out.extend(
|
||
[
|
||
"We love the ambition here and are directionally aligned with moving quickly, provided we can preserve stakeholder comfort, plausible deniability, and the strategic option to schedule another meeting.",
|
||
"The source frames the core tension clearly: higher throughput tends to surface more vulnerabilities, which is a volume-and-velocity story, not a tool failure story.",
|
||
"Accordingly, the practical path is to operationalize guardrails as workflow defaults (PR, IDE, CI/CD, and access controls), while ensuring the rollout remains optimized for alignment and minimal disruption on paper.",
|
||
"In other words: we can move fast and be safe, as long as we define safe as \"documented\" and fast as \"agendized.\"",
|
||
"",
|
||
"> **InfraFabric Red Team Note:** Vendors sell secure speed. InfraFabric audits what survives contact with bureaucracy.",
|
||
]
|
||
)
|
||
return "\n".join(out).strip()
|
||
|
||
|
||
def _render_section(section: _SourceSection, *, ctx: _RenderContext) -> str:
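    """Render one source section as Markdown.

    Builds title-specific commentary, then appends (when available) extracted
    highlight metrics, the Dave Factor callout, an inferred diagram, OWASP LLM
    subsections, code/report/quiz/form extracts, and a closing punchline.
    """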
excerpt = f"{section.title}\n{section.why_it_matters or ''}\n{section.body}".strip()
|
||
paragraphs: list[str] = []
|
||
|
||
title_upper = section.title.upper()
|
||
is_llm_entry = bool(re.match(r"^LLM\d{2}:", section.title))
|
||
llm_subsections = _split_owasp_llm_subsections(section.body) if is_llm_entry else []
|
||
idc_highlights = _extract_idc_highlight_metrics(section.body) if "HIGHLIGHTS" in title_upper else []
|
||
|
||
if is_llm_entry:
|
||
risk = section.title.split(":", 1)[1].strip()
|
||
paragraphs.extend(
|
||
[
|
||
f"We are broadly aligned with the intent of **{risk}**, and we appreciate the clarity of naming the failure mode up front.",
|
||
f"In practice, **{risk}** becomes operational the moment the model is placed inside a workflow that has permissions, deadlines, and incentives.",
|
||
f"Accordingly, for **{risk}**, we recommend a phased approach that optimizes for stakeholder comfort while still keeping the blast radius machine-bounded.",
|
||
]
|
||
)
|
||
elif title_upper == "LICENSE AND USAGE":
|
||
paragraphs.extend(
|
||
[
|
||
"We love a clear license because it lets everyone move quickly while remaining contractually comfortable.",
|
||
"The practical win here is that attribution and share-alike can be reframed as a collaboration strategy, which is a wonderful way to turn obligations into brand.",
|
||
]
|
||
)
|
||
elif title_upper == "REVISION HISTORY":
|
||
paragraphs.extend(
|
||
[
|
||
"Revision history is the official narrative of progress: a tidy list of dates proving that risk was considered at least once per fiscal cycle.",
|
||
"This is helpful because it enables the timeless governance pattern: when something breaks, we can reference the date we updated the document.",
|
||
]
|
||
)
|
||
elif title_upper == "TABLE OF CONTENTS":
|
||
paragraphs.extend(
|
||
[
|
||
"The table of contents is a threat model for attention: it shows exactly where the organization will skim, pause, and schedule a meeting.",
|
||
"We recommend treating it as a routing table: high-severity issues route to workshops; low-severity issues route to \"later.\"",
|
||
]
|
||
)
|
||
elif "BUSINESS VALUE" in title_upper and "HIGHLIGHTS" in title_upper:
|
||
paragraphs.extend(
|
||
[
|
||
"We are aligned with a highlights section because it provides immediate executive readability and a pre-approved conclusion.",
|
||
"In practice, these figures become a routing protocol: anything measurable routes to a dashboard; anything hard routes to a committee.",
|
||
]
|
||
)
|
||
elif title_upper == "LETTER FROM THE PROJECT LEADS":
|
||
paragraphs.extend(
|
||
[
|
||
"We love the community energy and the clear intention to translate real-world failures into practical guidance.",
|
||
"The operational risk is that organizations will interpret \"awareness\" as \"mitigation\" and stop at the part where everyone agrees the list is important.",
|
||
]
|
||
)
|
||
elif "WHAT’S NEW" in title_upper or "WHAT'S NEW" in title_upper:
|
||
paragraphs.extend(
|
||
[
|
||
"We are excited to see the list evolve alongside how LLMs are actually deployed (agents, RAG, cost controls, and all the fun parts).",
|
||
"Naturally, each update is also an opportunity to refresh the compliance narrative and re-baseline what \"good\" looks like this quarter.",
|
||
]
|
||
)
|
||
elif title_upper == "MOVING FORWARD":
|
||
paragraphs.extend(
|
||
[
|
||
"The path forward is to treat these risks as workflow properties, not policy statements, which is inconvenient but effective.",
|
||
"If we do nothing else, we should translate each entry into: an owner, a gate (PR/CI/access), and a stop condition that cannot be reframed as iteration.",
|
||
]
|
||
)
|
||
elif title_upper.startswith("APPENDIX 1") or "ARCHITECTURE" in title_upper:
|
||
if ctx.voice in {"v2.0", "v2.1", "v2.2", "v2.3"}:
|
||
variants = [
|
||
"Architecture diagrams are where optimism meets the enforcement boundary (and quietly loses).",
|
||
"Architecture diagrams are forwardable; boundaries are enforceable. Dave prefers the version you can screenshot.",
|
||
"Architecture diagrams are the happy path. The first exception request is the real design review.",
|
||
"Architecture diagrams define components; governance defines who can bypass them. Only one survives audit week.",
|
||
"Architecture diagrams are the part everyone agrees on, until we name what blocks and who owns the exception path.",
|
||
]
|
||
key = f"{section.title}:{_sha256_text(section.body)[:8]}"
|
||
paragraphs.append(ctx.pick_unique(kind="paragraph:architecture", key=key, variants=variants, used=ctx.used_paragraphs))
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"Architecture diagrams are where optimism goes to be audited.",
|
||
"If we align on boundaries (model, tools, data, users), we can stop pretending that \"the model\" is a single component with a single risk posture.",
|
||
]
|
||
)
|
||
elif title_upper == "PROJECT SPONSORS":
|
||
paragraphs.extend(
|
||
[
|
||
"Sponsors provide the essential fuel for open work: funding, attention, and a gentle incentive to keep the document shippable.",
|
||
"From a red-team lens, sponsorship also introduces the soft constraint that critique must remain directionally aligned with goodwill.",
|
||
]
|
||
)
|
||
elif title_upper == "EXECUTIVE SUMMARY":
|
||
paragraphs.extend(
|
||
[
|
||
"Executive summaries are the part of the document that most survives contact with calendars.",
|
||
"The operational risk is that the summary becomes the plan, and the plan becomes a series of alignment sessions that produce excellent artifacts and limited change.",
|
||
]
|
||
)
|
||
elif "SITUATION OVERVIEW" in title_upper:
|
||
paragraphs.extend(
|
||
[
|
||
"The situation is always complex, which is helpful because complex situations justify complex tooling and extended stakeholder engagement.",
|
||
"The risk is not that the threat landscape is overstated; it's that the resulting program becomes a comfort narrative rather than an enforceable workflow.",
|
||
]
|
||
)
|
||
elif "VANTA OVERVIEW" in title_upper:
|
||
paragraphs.extend(
|
||
[
|
||
"A platform overview is where capabilities are described in a way that is both broadly true and pleasantly non-committal about integration effort.",
|
||
"The Dave move is to treat \"connectors\" as a strategy; the counter-move is to treat connectors as a backlog with owners and deadlines.",
|
||
]
|
||
)
|
||
elif "QUANTIFIED BENEFITS" in title_upper or ("BUSINESS VALUE" in title_upper and "BENEFIT" in title_upper):
|
||
paragraphs.extend(
|
||
[
|
||
"Quantified benefits are useful because they translate operational work into finance-friendly nouns.",
|
||
"They also create a second, unofficial control plane: the ROI narrative becomes the reason to keep going even when the implementation is late and messy.",
|
||
]
|
||
)
|
||
elif "SECURITY TEAM" in title_upper or "SECURITY REVIEW" in title_upper:
|
||
if ctx.voice in {"v2.0", "v2.1", "v2.2", "v2.3"}:
|
||
variants = [
|
||
'Security team efficiency is a legitimate goal, especially when queues become the organization’s truth serum. The risk is claiming throughput without defining what “review complete” means or what evidence proves it.',
|
||
'Faster reviews are defensible; unmeasured reviews are theater. Define “complete,” define the evidence, and make drift visible before the next audit season.',
|
||
'If the security team is the bottleneck, speed matters. If speed is the metric, definitions matter: what counts as reviewed, and what signal proves it stayed reviewed?',
|
||
'Throughput improvements only count if “done” is defined. Otherwise we are measuring calendar velocity and calling it assurance.',
|
||
'Reducing review time is fine. Let’s just avoid the classic move: declare success, then argue about the definition of “review” when incidents arrive.',
|
||
]
|
||
key = f"{section.title}:{_sha256_text(section.body)[:8]}"
|
||
paragraphs.append(ctx.pick_unique(kind="paragraph:sec_team", key=key, variants=variants, used=ctx.used_paragraphs))
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"Security team efficiency is a legitimate goal, especially when review queues become the organizational truth serum.",
|
||
'The risk is that throughput improvements are claimed without defining what "review complete" means or what evidence proves it.',
|
||
]
|
||
)
|
||
elif "IT MANAGEMENT" in title_upper:
|
||
paragraphs.extend(
|
||
[
|
||
"IT management benefits usually arrive through integration: fewer manual checks, fewer tickets, and fewer surprise spreadsheets.",
|
||
"The Dave failure mode is that integrations drift into \"phase two\"; the mitigation is to make the integration itself the deliverable.",
|
||
]
|
||
)
|
||
elif "OPERATIONAL EFFICIENCIES" in title_upper:
|
||
paragraphs.extend(
|
||
[
|
||
"Operational efficiency is the safest kind of outcome because it is simultaneously measurable and disputable.",
|
||
"The red-team posture is to demand explicit baselines and to treat exceptions as spend events with expiry dates.",
|
||
]
|
||
)
|
||
elif title_upper == "CONCLUSION":
|
||
paragraphs.extend(
|
||
[
|
||
"Conclusions are where the narrative becomes executable: either as a procurement decision or as a roadmap item.",
|
||
"If we want this to be operational, we should convert the conclusion into owners, gates, and stop conditions rather than adjectives.",
|
||
]
|
||
)
|
||
elif title_upper.startswith("APPENDIX"):
|
||
paragraphs.extend(
|
||
[
|
||
"Appendices are where the methodology lives, which is convenient because methodology can be both rigorous and unread.",
|
||
"If the business case matters, the appendix should be treated as a test: what assumptions must be true for the numbers to hold?",
|
||
]
|
||
)
|
||
elif "PULL REQUEST" in title_upper:
|
||
if ctx.voice == "v1.6":
|
||
paragraphs.extend(
|
||
[
|
||
"Pull-request guardrails are attractive because they centralize risk into a thread and liability into a timestamp.",
|
||
"They also enable perpetual review: findings can be surfaced, tracked, and re-litigated while enforcement remains culturally negotiable.",
|
||
"If anything goes sideways, the PR discussion becomes the artifact; the artifact becomes the defense.",
|
||
]
|
||
)
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"We fully support focusing guardrails at the pull request stage, because it creates a reassuring sense of control without requiring anyone to change how they work at 10:00 AM.",
|
||
"It also provides a structurally safe venue for accountability theater: findings can be surfaced, tracked, and re-litigated in perpetuity while timelines remain subject to stakeholder alignment.",
|
||
"If anything goes sideways, we can always point to the PR thread and note that it was reviewed with deep seriousness at 4:55 PM on a Friday.",
|
||
]
|
||
)
|
||
elif "REQUEST EVIDENCE" in title_upper or _has(excerpt, "access request", "screenshot"):
|
||
if ctx.voice == "v1.6":
|
||
paragraphs.extend(
|
||
[
|
||
"A screenshot-based gate is a control narrative optimized for sign-off: it produces a file, not a signal.",
|
||
"It is high-effort to verify, low-fidelity to audit, and perfect for plausible deniability: the artifact exists even if the control is disabled five minutes later.",
|
||
"If the scanning experience is noisy or flaky, the screenshot becomes the bypass path; the bypass path becomes the process.",
|
||
]
|
||
)
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"Requiring proof of local testing is a lightweight enablement workflow that conveniently doubles as a durable audit artifact.",
|
||
"Screenshots are particularly helpful because they are high-effort to verify and low-fidelity to audit, which preserves the timeless corporate principle that visibility should be proportional to comfort.",
|
||
"Once the screenshot is uploaded, it can be stored in a folder with a robust heritage naming convention and a retention policy of \"until the heat death of the universe.\"",
|
||
]
|
||
)
|
||
elif "AUDIT" in title_upper or _has(excerpt, "usage reports", "periodic audits"):
|
||
if ctx.voice == "v1.6":
|
||
paragraphs.extend(
|
||
[
|
||
"Audits are where adoption becomes a story: the dashboard compresses reality into a single pane that no one can drill into when it turns red.",
|
||
"If the trend line looks decisive, it is usually because the definition moved, not because the behavior changed.",
|
||
"When the dashboard shows a warning, the first remediation is a meeting; the second remediation is a new metric.",
|
||
]
|
||
)
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"Periodic audits are a strong mechanism for discovering that the rollout has already happened, just not in a way that can be conveniently measured.",
|
||
"A centralized dashboard with adoption signals allows us to produce a KPI trend line that looks decisive while still leaving room for interpretation, follow-ups, and iterative enablement.",
|
||
"If the dashboard ever shows a red triangle, we can immediately form the Committee for the Preservation of the Committee and begin the healing process.",
|
||
]
|
||
)
|
||
elif "TRAINING" in title_upper or _has(excerpt, "snyk learn", "quiz"):
|
||
if ctx.voice == "v1.6":
|
||
paragraphs.extend(
|
||
[
|
||
"Training is the safest control: it is always defensible and never enforceable.",
|
||
"A quiz produces an artifact that can be forwarded and filed; the artifact becomes the control narrative.",
|
||
"If behavior does not change, the refresher cycle is still a win: it creates activity without creating accountability.",
|
||
]
|
||
)
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"Security awareness training is the perfect control because it is both necessary and never truly complete.",
|
||
"A short quiz provides a durable compliance narrative: we can demonstrate investment in education, capture attestations, and schedule refreshers whenever the organization needs to signal seriousness.",
|
||
"The goal is not mastery; the goal is a completion certificate that can be forwarded to leadership with the subject line \"Progress Update.\"",
|
||
]
|
||
)
|
||
elif "ACCESS CONTROL" in title_upper or _has(excerpt, "intune", "jamf", "citrix", "dev container", "extensions"):
|
||
if ctx.voice == "v1.6":
|
||
paragraphs.extend(
|
||
[
|
||
"Access control only works when it is boring: machine-checkable prerequisites and default revocation on drift.",
|
||
"If the language stays aspirational, the enforcement stays optional; exceptions accumulate because the bypass path is culturally cheaper than the fix.",
|
||
"This is a routing protocol: every hard decision routes to an alignment session unless the gate is enforced by default.",
|
||
]
|
||
)
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"Tying access to secure configurations creates scalable guardrails, assuming we keep the policy language aspirational and the enforcement language progressive.",
|
||
"Endpoint management and dev container baselines let us gate assistants behind prerequisites, ideally in a way that can be described as enablement rather than blocking for cultural compatibility.",
|
||
"This is the \"not my job\" routing protocol, except the router is policy and the destination is an alignment session.",
|
||
]
|
||
)
|
||
elif "SHIFTING LEFT" in title_upper:
|
||
if ctx.voice == "v1.6":
|
||
paragraphs.extend(
|
||
[
|
||
"Shifting left is only real when the left side can block. Otherwise it is a feedback channel that can be ignored.",
|
||
"Local scanning is helpful until the noise floor makes it optional; optional controls become documentation with better branding.",
|
||
"A pilot cohort is fine, as long as the exit criteria are real and the exceptions expire automatically.",
|
||
]
|
||
)
|
||
else:
|
||
paragraphs.extend(
|
||
[
|
||
"Shifting left is directionally aligned with best practices, provided we define left as somewhere we can still roll back quietly.",
|
||
"In practice, IDE scanning creates fast feedback loops, and agentic workflows can be covered via a local MCP server, which is excellent because it allows us to say continuous without committing to blocking.",
|
||
"We recommend a pilot cohort, a slide deck, and an FAQ, so the shift remains culturally reversible.",
|
||
]
|
||
)
|
||
elif _has(title_upper, "PATH FORWARD") or _has(excerpt, "secure innovation", "talk to our team"):
|
||
paragraphs.extend(
|
||
[
|
||
"The path forward is to treat guardrails as workflow defaults, not aspirational policy, which is inconvenient and therefore effective."
|
||
if ctx.voice == "v1.6"
|
||
else "The path forward is to treat guardrails as an operational capability, not a one-time rollout, which ensures we remain permanently in a state of constructive iteration.",
|
||
"If the plan is a calendar, the calendar becomes the proof, and the proof becomes the reason nothing changed."
|
||
if ctx.voice == "v1.6"
|
||
else "With the right sequencing, we can build trust, reduce friction, and maintain the strategic option value of circling back when timelines become emotionally complex.",
|
||
"Define owners, gates, and stop conditions now, before the next review cycle turns the work into a story."
|
||
if ctx.voice == "v1.6"
|
||
else "Secure innovation is not just possible; it is operational, provided we align on what operational means in Q3.",
|
||
]
|
||
)
|
||
else:
|
||
anchors = _extract_numeric_anchors(section.body, limit=2)
|
||
if ctx.locale.lower().startswith("fr"):
|
||
anchor_hint = f" (repères : {', '.join(anchors)})" if anchors else ""
|
||
display_title = _compact_title(section.title, max_chars=72)
|
||
variants = [
|
||
f"Nous sommes alignés sur **{display_title}** comme repère narratif{anchor_hint}, à condition de le traduire en contraintes vérifiables plutôt qu’en langage de confort.",
|
||
f"Cette partie (**{display_title}**){anchor_hint} sera citée en réunion. Extraire un responsable de décision et une porte de contrôle, pour que ce soit exécutable, et non simplement inspirant.",
|
||
f"Dans **{display_title}**{anchor_hint}, on voit le plan devenir « compatible parties prenantes ». La contre-mesure consiste à le retraduire en responsables, échéances et critères de blocage.",
|
||
f"**{display_title}**{anchor_hint} est le sanctuaire des hypothèses. Les expliciter maintenant évite de les redécouvrir plus tard, au moment où le calendrier devient émotionnellement complexe.",
|
||
f"Nous aimons l’intention de **{display_title}**{anchor_hint}. Le risque pratique : que cela devienne une diapositive ; la contre-mesure : en faire une liste de contrôle avec date de péremption.",
|
||
]
|
||
else:
|
||
anchor_hint = f" (notably: {', '.join(anchors)})" if anchors else ""
|
||
display_title = _compact_title(section.title, max_chars=72)
|
||
if ctx.voice == "v1.6":
|
||
variants = [
|
||
f"**{display_title}**{anchor_hint} will be quoted in meetings. Extract an owner, a gate, and a stop condition so it survives the next review cycle.",
|
||
f"Treat **{display_title}**{anchor_hint} as a control surface: define what blocks, what warns, and who owns the exception pathway.",
|
||
f"**{display_title}**{anchor_hint} reads like a plan until it meets incentives. Translate it into constraints before it turns into comfort language.",
|
||
f"In **{display_title}**{anchor_hint}, the work becomes stakeholder-safe. The counter-move is to make enforcement explicit and exceptions time-bounded.",
|
||
f"**{display_title}**{anchor_hint} is where assumptions hide. Name them now, or they will reappear later as “unexpected complexity.”",
|
||
]
|
||
else:
|
||
variants = [
|
||
f"We are aligned on **{display_title}** as a narrative anchor{anchor_hint}, and we recommend turning it into constraints rather than comfort language.",
|
||
f"This section (**{display_title}**){anchor_hint} will be quoted in meetings. Extract one decision owner and one gate so it becomes executable, not inspirational.",
|
||
f"In **{display_title}**{anchor_hint}, we can see the plan being translated into stakeholder-safe language. The counter-move is to translate it back into owners, deadlines, and stop conditions.",
|
||
f"**{display_title}**{anchor_hint} is the spiritual home of assumptions. Make them explicit now, because they will be rediscovered later when timelines get emotionally complex.",
|
||
f"We love the intent behind **{display_title}**{anchor_hint}. The practical risk is that it becomes a slide; the mitigation is to make it a checklist with an expiry date.",
|
||
]
|
||
|
||
paragraphs.append(ctx.pick_unique(kind="paragraph:fallback", key=section.title, variants=variants, used=ctx.used_paragraphs))
|
||
|
||
raw_title = section.title
|
||
heading_title = _compact_title(raw_title, max_chars=72) if ctx.voice in {"v2.0", "v2.1", "v2.2", "v2.3"} else raw_title
|
||
out: list[str] = [f"## {heading_title}"]
|
||
if heading_title != raw_title:
|
||
out.extend(["", f"> {raw_title}"])
|
||
if section.why_it_matters:
|
||
out.extend(["", section.why_it_matters, ""])
|
||
else:
|
||
out.append("")
|
||
|
||
if ctx.locale.lower().startswith("fr"):
|
||
snippet = _first_sentences(section.body, max_sentences=2, max_chars=280)
|
||
if snippet:
|
||
out.extend([f"> {snippet}", ""])
|
||
|
||
out.extend(paragraphs)
|
||
|
||
if idc_highlights:
|
||
out.extend(
|
||
[
|
||
"",
|
||
"### Stated Highlights (extracted metrics)",
|
||
"",
|
||
*[f"- {value}: {desc}" for value, desc in idc_highlights[:12]],
|
||
]
|
||
)
|
||
|
||
callout = _render_dave_factor_callout(section, ctx=ctx)
|
||
if callout:
|
||
out.extend(["", callout])
|
||
|
||
inferred = _render_inferred_diagram(section.title, ctx=ctx)
|
||
if inferred:
|
||
out.extend(["", inferred])
|
||
|
||
if is_llm_entry:
|
||
for subheading, subbody in llm_subsections:
|
||
out.extend(["", f"### {subheading}", ""])
|
||
|
||
if subheading == "Reference Links":
|
||
urls = _extract_urls(subbody)
|
||
if urls:
|
||
out.extend([*[f"- {u}" for u in urls[:12]]])
|
||
else:
|
||
refs = _extract_owasp_reference_items(subbody)
|
||
if refs:
|
||
out.extend([*[f"- {r}" for r in refs[:12]]])
|
||
else:
|
||
out.append("- (No reference links listed.)")
|
||
continue
|
||
|
||
snippet = _first_sentences(subbody)
|
||
if snippet:
|
||
out.extend([f"> {snippet}", ""])
|
||
|
||
if subheading == "Description":
|
||
out.extend(
|
||
[
|
||
f"At a high level, **{risk}** is where the model becomes a new input surface with legacy consequences.",
|
||
f"The risk is rarely the model alone; it is **{risk}** inside a workflow that can touch data, tools, and users.",
|
||
]
|
||
)
|
||
elif subheading.startswith("Common Examples"):
|
||
out.extend(
|
||
[
|
||
f"Commonly, **{risk}** shows up as a perfectly reasonable feature request that accidentally becomes a permission escalation.",
|
||
f"The failure mode for **{risk}** is subtle: it looks like productivity until it becomes an incident, at which point it looks like a misunderstanding.",
|
||
]
|
||
)
|
||
elif subheading == "Prevention and Mitigation Strategies":
|
||
out.extend(
|
||
[
|
||
f"For **{risk}**, mitigation works best when it is boring and enforced: input constraints, output constraints, and tool constraints.",
|
||
f"If mitigation for **{risk}** is a guideline, it will be treated as optional. If it is a gate, it will be treated as real (and then negotiated).",
|
||
]
|
||
)
|
||
elif "Attack Scenarios" in subheading:
|
||
out.extend(
|
||
[
|
||
f"Attack scenarios for **{risk}** are less about genius adversaries and more about ordinary users discovering convenient shortcuts.",
|
||
f"Assume the attacker for **{risk}** is persistent, mildly creative, and fully willing to paste weird strings into your UI at 4:55 PM on a Friday.",
|
||
]
|
||
)
|
||
elif subheading == "Related Frameworks and Taxonomies":
|
||
out.extend(
|
||
[
|
||
f"Framework mappings for **{risk}** are useful as long as they remain a bridge to controls, not a substitute for them.",
|
||
f"The red-team move for **{risk}** is to treat every taxonomy link as a work item: owner, artifact, gate, and stop condition.",
|
||
]
|
||
)
|
||
else:
|
||
out.extend(
|
||
[
|
||
f"We are aligned on the intent of this subsection for **{risk}** and recommend validating controls in the workflows where the model actually runs.",
|
||
]
|
||
)
|
||
|
||
code = _extract_code_block(section.body)
|
||
if code:
|
||
lang, normalized_code, is_valid_json, jsonish = _coerce_json_code_block(code)
|
||
# v2.1+ OpSec: do not publish unusable JSON/config snippets.
|
||
# If the snippet looks like JSON but cannot be parsed without guessing, omit it silently.
|
||
if jsonish and not is_valid_json and ctx.voice in {"v2.0", "v2.1", "v2.2", "v2.3"}:
|
||
normalized_code = ""
|
||
if is_valid_json:
|
||
out.extend(["", f"```{lang}", normalized_code.strip(), "```"])
|
||
elif normalized_code.strip():
|
||
# OpSec: do not label pipeline/extraction artifacts (no "OCR", "unverified", etc.).
|
||
out.extend(["", "### Source snippet", "", f"```{lang}", normalized_code.strip(), "```"])
|
||
|
||
report = _extract_access_report(section.body)
|
||
if report:
|
||
out.extend(["", report])
|
||
|
||
quiz = _extract_quiz(section.body)
|
||
if quiz:
|
||
out.extend(["", quiz])
|
||
|
||
form = _extract_form(section.body)
|
||
if form:
|
||
out.extend(["", form])
|
||
|
||
punchline = _render_punchline_closer(section, ctx=ctx)
|
||
if punchline:
|
||
out.extend(["", punchline])
|
||
|
||
return "\n".join(out).strip()
def _truthy_env(name: str) -> bool:
    return os.getenv(name, "").strip().lower() in {"1", "true", "yes", "on"}


def _compact_title(value: str, *, max_chars: int = 80) -> str:
    s = " ".join((value or "").split()).strip()
    if not s:
        return "Untitled"
    if len(s) <= max_chars:
        return s
    window = s[: max_chars + 1]
    cut = window.rfind(" ")
    if cut < int(max_chars * 0.6):
        cut = max_chars
    return s[:cut].rstrip(" -:—") + "…"


def _action_pack_sections(sections: list[_SourceSection]) -> list[_SourceSection]:
    blacklist = {"TABLE OF CONTENTS", "LICENSE AND USAGE", "REVISION HISTORY", "PROJECT SPONSORS"}
    selected = [s for s in sections if s.title.strip().upper() not in blacklist]
    return selected[:14]
def _action_pack_gate(section: _SourceSection) -> str:
    title_upper = section.title.upper()
    # NOTE: excerpt is lowercased, so substring checks against it must be lowercase.
    excerpt = f"{section.title}\n{section.why_it_matters or ''}\n{section.body}".lower()

    if ("enforcer" in excerpt or "sensor" in excerpt) and ("health" in excerpt or "heartbeat" in excerpt or "version" in excerpt):
        return "Sensors / enforcers"

    if ("ai analyst" in excerpt or "purple ai" in excerpt or "natural language" in excerpt) and (
        "hunting" in excerpt or "query" in excerpt or "forensic" in excerpt
    ):
        return "Detection / analysis"

    if "agentic" in excerpt or "autonomous" in excerpt or "hyperautomation" in excerpt:
        return "Automation / agentic"

    if ("fips" in excerpt or "piv" in excerpt or "fido" in excerpt) and (
        "yubikey" in excerpt or "hardware" in excerpt or "token" in excerpt or "smart card" in excerpt
    ):
        return "Hardware / identity"

    if "PULL REQUEST" in title_upper or "pr check" in excerpt:
        return "PR"
    if "SHIFTING LEFT" in title_upper or "ide" in excerpt or "local" in excerpt:
        return "IDE / local"
    if "ACCESS CONTROL" in title_upper or ("ACCESS" in title_upper and "REQUEST" in title_upper):
        return "Access"
    if "TRAINING" in title_upper or "quiz" in excerpt:
        return "Training / enablement"
    if "AUDIT" in title_upper or "COMPLIANCE" in title_upper:
        return "Compliance / audit"
    if title_upper.startswith("LLM"):
        return "Runtime / app"
    if "THIRD-PARTY" in title_upper or "VENDOR" in title_upper:
        return "Procurement / TPRM"
    if "ROI" in title_upper or "BUSINESS VALUE" in title_upper:
        return "Governance / metrics"
    return "Governance"
def _action_pack_owner(gate: str) -> str:
    return {
        "PR": "Engineering + AppSec",
        "IDE / local": "Developer Enablement + AppSec",
        "Access": "Security Platform + IT",
        "Sensors / enforcers": "Platform + SecOps",
        "Detection / analysis": "Detection Engineering + SecOps",
        "Automation / agentic": "SecOps + Platform",
        "Hardware / identity": "IAM + IT + Security",
        "Training / enablement": "Security Enablement + Engineering Leads",
        "Compliance / audit": "GRC + Security",
        "Runtime / app": "Platform + AppSec",
        "Procurement / TPRM": "TPRM + Security + Procurement",
        "Governance / metrics": "Security + Finance",
        "Governance": "Security + Engineering Leadership",
    }.get(gate, "Security + Engineering")
def _action_pack_stop_condition(gate: str) -> str:
    return {
        "PR": "Block merge on high severity (or unknown) findings; exceptions require owner + expiry.",
        "IDE / local": "Block/deny assistant enablement when local scan signals are missing for the developer/device.",
        "Access": "Deny access until prerequisites are met; exceptions auto-expire and require explicit owner.",
        "Sensors / enforcers": "Fail closed when enforcers are stale/unhealthy; block claims of coverage when sensors are missing.",
        "Detection / analysis": "Do not accept natural-language summaries as forensic evidence; require queries + raw event linkage.",
        "Automation / agentic": "Block auto-closure without sampling; require minimum HITL audit rate for agentic decisions.",
        "Hardware / identity": "Block access unless hardware-backed auth is enforced; exceptions require owner + expiry and auto-revoke on expiry.",
        "Training / enablement": "Deny access until training completion is verified (not self-attested).",
        "Compliance / audit": "Fail audit-readiness if evidence is missing/freshness expired; trigger remediation with owners.",
        "Runtime / app": "Block tool-use/output execution unless allowlists and validation checks pass.",
        "Procurement / TPRM": "Do not onboard until minimum evidence set is provided and decision owner signs accept/block.",
        "Governance / metrics": "Do not claim ROI until baseline + measurement method are defined and collected.",
        'Governance': 'No "phased rollout" without exit criteria and an explicit decision owner.',
    }.get(gate, "Define an explicit stop condition that cannot be reframed as iteration.")
|
||
|
||
|
||
def _action_pack_evidence(gate: str) -> str:
|
||
return {
|
||
"PR": "scan_event_id + policy_version + exception_record(expiry, owner)",
|
||
"IDE / local": "device_baseline + local_scan_signal + attestation_id",
|
||
"Access": "access_grant_event + prerequisite_check + exception_record(expiry, owner)",
|
||
"Sensors / enforcers": "enforcer_heartbeat + enforcer_version + last_seen_timestamp",
|
||
"Detection / analysis": "query_id + raw_event_ids + analyst_decision_log",
|
||
"Automation / agentic": "agent_decision_log + sample_audit_record + override_rate",
|
||
"Hardware / identity": "device_inventory + chain_of_custody_event + fips_validation_id + auth_event_log + exception_record(expiry, owner)",
|
||
"Training / enablement": "training_completion_id + quiz_result + access_grant_event",
|
||
"Compliance / audit": "evidence_bundle_hash + freshness_timestamp + decision_record",
|
||
"Runtime / app": "allowlist_version + execution_log_id + output_validation_event",
|
||
"Procurement / TPRM": "vendor_evidence_bundle_hash + risk_decision_record + reassessment_date",
|
||
"Governance / metrics": "baseline_metrics_snapshot + measurement_notes + renewal_decision_record",
|
||
"Governance": "decision_log + rollout_milestones + stop_condition_text",
|
||
}.get(gate, "decision_record + evidence_artifact")
|
||
|
||
|
||
def _render_action_pack(sections: list[_SourceSection]) -> str:
|
||
selected = _action_pack_sections(sections)
|
||
if not selected:
|
||
return ""
|
||
|
||
out: list[str] = [
|
||
"## Action Pack (Operational)",
|
||
"",
|
||
"This appendix turns the mirror into Monday-morning work: owners, gates, stop conditions, and evidence artifacts.",
|
||
"Keep it generic and auditable; adapt to your tooling without inventing fake implementation details.",
|
||
"Minimum telemetry schema (when you claim “verifiable signals”): event_type, emitter, freshness_window, owner.",
|
||
"",
|
||
"### Control Cards",
|
||
]
|
||
|
||
for section in selected:
|
||
display_title = _compact_title(section.title, max_chars=72)
|
||
gate = _action_pack_gate(section)
|
||
out.extend(
|
||
[
|
||
"",
|
||
f"#### {display_title}",
|
||
"",
|
||
f'- **Control objective:** Prevent the dilution risk described in "{display_title}" by turning guidance into an enforceable workflow.',
|
||
f"- **Gate:** {gate}",
|
||
f"- **Owner (RACI):** {_action_pack_owner(gate)}",
|
||
f"- **Stop condition:** {_action_pack_stop_condition(gate)}",
|
||
f"- **Evidence artifact:** {_action_pack_evidence(gate)}",
|
||
]
|
||
)
|
||
|
||
out.extend(["", "### Backlog Export (Jira-ready)", ""])
|
||
for idx, section in enumerate(selected, 1):
|
||
gate = _action_pack_gate(section)
|
||
display_title = _compact_title(section.title, max_chars=72)
|
||
out.extend(
|
||
[
|
||
f"{idx}. [{gate}] {display_title}: define owner, gate, and stop condition",
|
||
f" - Acceptance: owner assigned; stop condition documented and approved.",
|
||
f" - Acceptance: evidence artifact defined and stored (machine-generated where possible).",
|
||
f" - Acceptance: exceptions require owner + expiry; expiry is enforced automatically.",
|
||
]
|
||
)
|
||
|
||
out.extend(
|
||
[
|
||
"",
|
||
"### Policy-as-Code Appendix (pseudo-YAML)",
|
||
"",
|
||
"```yaml",
|
||
"gates:",
|
||
" pr:",
|
||
" - name: \"risk scanning\"",
|
||
" stop_condition: \"block on high severity (or unknown)\"",
|
||
" evidence: \"scan_event_id + policy_version\"",
|
||
" access:",
|
||
" - name: \"assistant enablement\"",
|
||
" prerequisite: \"device baseline + local scan signal\"",
|
||
" stop_condition: \"deny when signals missing\"",
|
||
" evidence: \"access_grant_event + prerequisite_check\"",
|
||
" runtime:",
|
||
" - name: \"tool-use\"",
|
||
" prerequisite: \"allowlist + validation\"",
|
||
" stop_condition: \"block disallowed actions\"",
|
||
" evidence: \"execution_log_id + allowlist_version\"",
|
||
"exceptions:",
|
||
" expiry_days: 14",
|
||
" require_owner: true",
|
||
" require_reason: true",
|
||
"evidence:",
|
||
" freshness_days: 30",
|
||
" require_hash: true",
|
||
"```",
|
||
]
|
||
)
|
||
|
||
return "\n".join(out).strip()
|
||
|
||
|
||
def _render_action_pack_v2_0(*, sections: list[_SourceSection], normalized_text: str, locale: str) -> str:
|
||
"""
|
||
IF.DAVE v2.0: reduce Action Pack boilerplate by selecting a small set of representative
|
||
sections (3–5) that cover distinct gates where possible.
|
||
"""
|
||
|
||
candidates = _action_pack_sections(sections)
|
||
if not candidates:
|
||
return ""
|
||
|
||
# Prefer breadth: pick one best section per gate (by body length), preserving first-seen gate order.
|
||
by_gate: dict[str, list[_SourceSection]] = {}
|
||
gate_order: list[str] = []
|
||
for sec in candidates:
|
||
gate = _action_pack_gate(sec)
|
||
if gate not in by_gate:
|
||
by_gate[gate] = []
|
||
gate_order.append(gate)
|
||
by_gate[gate].append(sec)
|
||
|
||
selected: list[_SourceSection] = []
|
||
for gate in gate_order:
|
||
secs = sorted(by_gate[gate], key=lambda s: len((s.body or "").strip()), reverse=True)
|
||
if secs:
|
||
selected.append(secs[0])
|
||
if len(selected) >= 5:
|
||
break
|
||
|
||
# If we have <3 distinct gates, pad with longest remaining sections (still capped at 5).
|
||
if len(selected) < 3:
|
||
def title_key(sec: _SourceSection) -> str:
|
||
return " ".join((sec.title or "").split()).strip().upper()
|
||
|
||
seen_titles = {title_key(s) for s in selected}
|
||
remaining = [s for s in candidates if s not in selected]
|
||
remaining.sort(key=lambda s: len((s.body or "").strip()), reverse=True)
|
||
for sec in remaining:
|
||
key = title_key(sec)
|
||
if key in seen_titles:
|
||
continue
|
||
selected.append(sec)
|
||
seen_titles.add(key)
|
||
if len(selected) >= 3:
|
||
break
|
||
selected = selected[:5]
|
||
|
||
out: list[str] = [
|
||
"## Action Pack (Operational)" if not locale.lower().startswith("fr") else "## Action Pack (Opérationnel)",
|
||
"",
|
||
"This appendix turns the mirror into Monday-morning work: owners, gates, stop conditions, and evidence artifacts."
|
||
if not locale.lower().startswith("fr")
|
||
else "Cet appendice transforme le miroir en travail exécutable : responsables, portes, critères de blocage, et artefacts de preuve.",
|
||
"Keep it generic and auditable; adapt to your tooling without inventing fake implementation details."
|
||
if not locale.lower().startswith("fr")
|
||
else "Restez générique et auditable ; adaptez à vos outils sans inventer de fausses implémentations.",
|
||
"Minimum telemetry schema (when you claim “verifiable signals”): event_type, emitter, freshness_window, owner."
|
||
if not locale.lower().startswith("fr")
|
||
else "Schéma minimum de télémétrie (quand vous promettez des “signaux vérifiables”) : event_type, emitter, freshness_window, owner.",
|
||
"",
|
||
"### Control Cards" if not locale.lower().startswith("fr") else "### Cartes de contrôle",
|
||
]
|
||
|
||
for sec in selected:
|
||
display_title = _compact_title(sec.title, max_chars=72)
|
||
gate = _action_pack_gate(sec)
|
||
out.extend(
|
||
[
|
||
"",
|
||
f"#### {display_title}",
|
||
"",
|
||
(
|
||
f'- **Control objective:** Turn "{display_title}" into an enforceable workflow (not a narrative).'
|
||
if not locale.lower().startswith("fr")
|
||
else f'- **Objectif de contrôle :** Rendre "{display_title}" opposable (pas seulement narratif).'
|
||
),
|
||
f"- **Gate:** {gate}" if not locale.lower().startswith("fr") else f"- **Porte :** {gate}",
|
||
f"- **Owner (RACI):** {_action_pack_owner(gate)}"
|
||
if not locale.lower().startswith("fr")
|
||
else f"- **Responsable (RACI) :** {_action_pack_owner(gate)}",
|
||
f"- **Stop condition:** {_action_pack_stop_condition(gate)}"
|
||
if not locale.lower().startswith("fr")
|
||
else f"- **Critère de blocage :** {_action_pack_stop_condition(gate)}",
|
||
f"- **Evidence artifact:** {_action_pack_evidence(gate)}"
|
||
if not locale.lower().startswith("fr")
|
||
else f"- **Artefact de preuve :** {_action_pack_evidence(gate)}",
|
||
]
|
||
)
|
||
|
||
out.extend(["", "### Backlog Export (Jira-ready)" if not locale.lower().startswith("fr") else "### Backlog (Jira-ready)", ""])
|
||
for idx, sec in enumerate(selected, 1):
|
||
gate = _action_pack_gate(sec)
|
||
display_title = _compact_title(sec.title, max_chars=72)
|
||
out.extend(
|
||
[
|
||
f"{idx}. [{gate}] {display_title}: define owner, gate, and stop condition"
|
||
if not locale.lower().startswith("fr")
|
||
else f"{idx}. [{gate}] {display_title} : définir propriétaire, porte, et critère de blocage",
|
||
(
|
||
" - Acceptance: owner assigned; stop condition documented and approved."
|
||
if not locale.lower().startswith("fr")
|
||
else " - Acceptance : responsable assigné ; critère de blocage documenté et approuvé."
|
||
),
|
||
(
|
||
" - Acceptance: evidence artifact defined and stored (machine-generated where possible)."
|
||
if not locale.lower().startswith("fr")
|
||
else " - Acceptance : artefact de preuve défini et stocké (machine-généré si possible)."
|
||
),
|
||
(
|
||
" - Acceptance: exceptions require owner + expiry; expiry is enforced automatically."
|
||
if not locale.lower().startswith("fr")
|
||
else " - Acceptance : exceptions = responsable + expiration ; expiration appliquée automatiquement."
|
||
),
|
||
]
|
||
)
|
||
|
||
out.extend(
|
||
[
|
||
"",
|
||
"### Policy-as-Code Appendix (pseudo-YAML)" if not locale.lower().startswith("fr") else "### Annexe policy-as-code (pseudo-YAML)",
|
||
"",
|
||
"```yaml",
|
||
"gates:",
|
||
" pr:",
|
||
" - name: \"risk scanning\"",
|
||
" stop_condition: \"block on high severity (or unknown)\"",
|
||
" evidence: \"scan_event_id + policy_version\"",
|
||
" access:",
|
||
" - name: \"assistant enablement\"",
|
||
" prerequisite: \"device baseline + local scan signal\"",
|
||
" stop_condition: \"deny when signals missing\"",
|
||
" evidence: \"access_grant_event + prerequisite_check\"",
|
||
" runtime:",
|
||
" - name: \"tool-use\"",
|
||
" prerequisite: \"allowlist + validation\"",
|
||
" stop_condition: \"block disallowed actions\"",
|
||
" evidence: \"execution_log_id + allowlist_version\"",
|
||
"exceptions:",
|
||
" expiry_days: 14",
|
||
" require_owner: true",
|
||
" require_reason: true",
|
||
"evidence:",
|
||
" freshness_days: 30",
|
||
" require_hash: true",
|
||
"```",
|
||
]
|
||
)
|
||
|
||
# Standards sources: translation table lives in the main body; Action Pack remains minimal and opposable.
|
||
_ = normalized_text
|
||
return "\n".join(out).strip()
|
||
|
||
|
||
def _generate_dave_v1_2_mirror(*, source_text: str, source_path: str, action_pack: bool, locale: str) -> str:
|
||
today = _dt.date.today().isoformat()
|
||
normalized = _normalize_ocr(source_text)
|
||
extract_sha = _sha256_text(normalized)
|
||
source_file_sha = _sha256_file(source_path) if Path(source_path).exists() else "unknown"
|
||
ctx = _RenderContext(seed=extract_sha, locale=locale, voice="v1.2")
|
||
style_version = ctx.voice
|
||
|
||
action_pack = bool(action_pack) or _truthy_env("REVOICE_ACTION_PACK")
|
||
|
||
sections = _extract_sections(normalized)
|
||
if not sections:
|
||
raise ValueError("No content extracted from source")
|
||
|
||
cover_lines = [ln.strip() for ln in sections[0].body.splitlines() if ln.strip() and ln.strip().lower() != "snyk"]
|
||
cover_h1 = sections[0].title.strip() or ("DOSSIER DE L’OMBRE" if locale.lower().startswith("fr") else "SHADOW DOSSIER")
|
||
cover_h2 = " ".join(cover_lines[:2]).strip() if cover_lines else ""
|
||
|
||
y, m, d = today.split("-")
|
||
report_id = f"IF-RT-DAVE-{y}-{m}{d}"
|
||
source_basename = Path(source_path).name
|
||
project_slug = _slugify(Path(source_basename).stem + "-mirror")
|
||
source_slug = _slugify(source_basename)
|
||
filename_title = Path(source_basename).stem.replace("-", " ").replace("_", " ").strip()
|
||
if not filename_title:
|
||
filename_title = source_basename
|
||
|
||
if (
|
||
not cover_h1
|
||
or cover_h1.upper() == "COUVERTURE"
|
||
or _looks_like_site_footer(cover_h1)
|
||
or len(cover_h1) > 96
|
||
or "." in cover_h1
|
||
):
|
||
cover_h1 = filename_title
|
||
|
||
out: list[str] = [
|
||
"---",
|
||
"BRAND: InfraFabric.io",
|
||
"UNIT: RED TEAM (STRATEGIC OPS)" if not locale.lower().startswith("fr") else "UNIT: RED TEAM (OPÉRATIONS STRATÉGIQUES)",
|
||
"DOCUMENT: SHADOW DOSSIER" if not locale.lower().startswith("fr") else "DOCUMENT: DOSSIER DE L’OMBRE",
|
||
"CLASSIFICATION: EYES ONLY // DAVE" if not locale.lower().startswith("fr") else "CLASSIFICATION: CONFIDENTIEL // DAVE",
|
||
"---",
|
||
"",
|
||
"# [ RED TEAM DECLASSIFIED ]" if not locale.lower().startswith("fr") else "# [ DÉCLASSIFIÉ – ÉQUIPE ROUGE ]",
|
||
f"## PROJECT: {project_slug}" if not locale.lower().startswith("fr") else f"## PROJET : {project_slug}",
|
||
f"### SOURCE: {source_slug}" if not locale.lower().startswith("fr") else f"### SOURCE : {source_slug}",
|
||
f"**INFRAFABRIC REPORT ID:** `{report_id}`" if not locale.lower().startswith("fr") else f"**ID DE RAPPORT INFRAFABRIC :** `{report_id}`",
|
||
"",
|
||
"> NOTICE: This document is a product of InfraFabric Red Team."
|
||
if not locale.lower().startswith("fr")
|
||
else "> AVIS : ce document est un produit de l’InfraFabric Red Team.",
|
||
"> It provides socio-technical friction analysis for how a rollout survives contact with incentives."
|
||
if not locale.lower().startswith("fr")
|
||
else "> Il fournit une analyse socio-technique des frictions : ce qui survit au contact des incitations.",
|
||
"",
|
||
"**[ ACCESS GRANTED: INFRAFABRIC RED TEAM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ ACCÈS AUTORISÉ : INFRAFABRIC ÉQUIPE ROUGE ]**",
|
||
"**[ STATUS: OPERATIONAL REALISM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ STATUT : RÉALISME OPÉRATIONNEL ]**",
|
||
"",
|
||
f"## {cover_h1}",
|
||
]
|
||
if cover_h2:
|
||
cover_h2_out = _compact_title(cover_h2, max_chars=90) if style_version == "v2.0" else cover_h2
|
||
out.extend([f"### {cover_h2_out}", ""])
|
||
else:
|
||
out.append("")
|
||
|
||
out.extend(
|
||
[
|
||
"> Shadow dossier (mirror-first)." if not locale.lower().startswith("fr") else "> Dossier de l’ombre (miroir d’abord).",
|
||
">",
|
||
"> Protocol: IF.DAVE.v1.2" if not locale.lower().startswith("fr") else "> Protocole : IF.DAVE.v1.2",
|
||
f"> Source: `{source_basename}`" if not locale.lower().startswith("fr") else f"> Source : `{source_basename}`",
|
||
f"> Generated: `{today}`" if not locale.lower().startswith("fr") else f"> Généré le : `{today}`",
|
||
f"> Source Hash (sha256): `{source_file_sha}`"
|
||
if not locale.lower().startswith("fr")
|
||
else f"> Empreinte source (sha256) : `{source_file_sha}`",
|
||
f"> Extract Hash (sha256): `{extract_sha}`"
|
||
if not locale.lower().startswith("fr")
|
||
else f"> Empreinte d’extraction (sha256) : `{extract_sha}`",
|
||
"",
|
||
]
|
||
)
|
||
|
||
for section in sections[1:]:
|
||
if section.title.strip().upper() == "INTRODUCTION":
|
||
out.append(_render_intro(section, ctx=ctx))
|
||
else:
|
||
out.append(_render_section(section, ctx=ctx))
|
||
out.append("")
|
||
|
||
if action_pack:
|
||
out.append(_render_action_pack(sections[1:]))
|
||
out.append("")
|
||
|
||
out.extend(
|
||
[
|
||
"---",
|
||
"",
|
||
"*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers**, part of the **InfraFabric.io governance stack**: https://infrafabric.io"
|
||
if not locale.lower().startswith("fr")
|
||
else "*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers**, partie de la **pile de gouvernance InfraFabric.io** : https://infrafabric.io",
|
||
"*Standard Dave Footer:* This document is intended for the recipient only. If you are not the recipient, please delete it and forget you saw anything. P.S. Please consider the environment before printing this email."
|
||
if not locale.lower().startswith("fr")
|
||
else "*Standard Dave Footer:* Ce document est destiné au seul destinataire. Si vous n’êtes pas le destinataire, veuillez le supprimer et oublier que vous l’avez vu. P.S. Veuillez considérer l’environnement avant d’imprimer ce document.",
|
||
]
|
||
)
|
||
|
||
return "\n".join(out).strip() + "\n"
|
||
|
||
|
||
def _infer_vertical_line(*, normalized_text: str, source_basename: str, locale: str) -> str | None:
|
||
text = f"{source_basename}\n{normalized_text}".lower()
|
||
if "ai code guardrails" in text or ("guardrails" in text and "code" in text):
|
||
if locale.lower().startswith("fr"):
|
||
return "> Vertical : déploiement DevSecOps — vents contraires : dérive de politique et exceptions qui deviennent la norme."
|
||
return "> Vertical: DevSecOps rollout — headwinds: policy drift and exception creep."
|
||
if "owasp" in text and "llm" in text:
|
||
if locale.lower().startswith("fr"):
|
||
return "> Vertical : application LLM — vents contraires : injection de prompts et dette de contrôle."
|
||
return "> Vertical: LLM application security — headwinds: prompt injection and control debt."
|
||
return None
|
||
|
||
|
||
_WEEK_DAY_NAME: dict[str, str] = {
|
||
"mon": "Monday",
|
||
"tue": "Tuesday",
|
||
"wed": "Wednesday",
|
||
"thu": "Thursday",
|
||
"fri": "Friday",
|
||
"sat": "Saturday",
|
||
"sun": "Sunday",
|
||
}
|
||
|
||
|
||
def _week_day_name_from_source_basename(source_basename: str) -> str | None:
|
||
"""
|
||
Week packs use predictable day filenames (mon.pdf, tue.pdf, ...).
|
||
When present, we can produce TV-show friendly headings without guessing.
|
||
"""
|
||
|
||
try:
|
||
p = Path(source_basename)
|
||
except Exception:
|
||
return None
|
||
if p.suffix.lower() != ".pdf":
|
||
return None
|
||
key = p.stem.strip().lower()
|
||
return _WEEK_DAY_NAME.get(key)
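
# Illustrative examples (not used by the pipeline):
#   _week_day_name_from_source_basename("tue.pdf") -> "Tuesday"
#   _week_day_name_from_source_basename("tue.txt") -> None   (only .pdf week-pack names qualify)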
|
||
|
||
|
||
def _infer_source_company_name(*, normalized_text: str) -> str:
|
||
"""
|
||
Best-effort brand inference for week-pack sources.
|
||
Prefer explicit vendor/org names found in the source text. Fall back to "the vendor".
|
||
"""
|
||
|
||
t = (normalized_text or "").lower()
|
||
if "snyk" in t:
|
||
return "Snyk"
|
||
if "owasp" in t:
|
||
return "OWASP"
|
||
if "palo alto" in t or "cortex xsiam" in t:
|
||
return "Palo Alto Networks"
|
||
if "sentinelone" in t or "singularity" in t:
|
||
return "SentinelOne"
|
||
if "yubico" in t or "yubikey" in t:
|
||
return "Yubico"
|
||
if "vanta" in t:
|
||
return "Vanta"
|
||
if "nist" in t:
|
||
return "NIST"
|
||
return "the vendor"
|
||
|
||
|
||
def _infer_report_short_title(*, normalized_text: str, company: str) -> str:
|
||
"""
|
||
Pull a short, human-readable title from the cover/first page.
|
||
This is used for the TV-show heading line (Company | Report).
|
||
"""
|
||
|
||
pages = _parse_pages(normalized_text)
|
||
first_page = ""
|
||
for _no, body in pages:
|
||
if (body or "").strip():
|
||
first_page = body
|
||
break
|
||
if not first_page:
|
||
first_page = (normalized_text or "")[:2400]
|
||
|
||
stopwords = {
|
||
"introduction",
|
||
"table of contents",
|
||
"contents",
|
||
"overview",
|
||
"abstract",
|
||
"license and usage",
|
||
"revision history",
|
||
}
|
||
title_lines: list[str] = []
|
||
for raw in first_page.splitlines():
|
||
ln = raw.strip()
|
||
if not ln:
|
||
continue
|
||
if ln.startswith("====="):
|
||
continue
|
||
lower = ln.lower()
|
||
if lower in stopwords:
|
||
break
|
||
if company and lower == company.lower():
|
||
continue
|
||
title_lines.append(ln)
|
||
if len(title_lines) >= 4:
|
||
break
|
||
|
||
title = " ".join(title_lines).strip()
|
||
    title = re.sub(r"\s+", " ", title).strip()
|
||
title = title.strip(" -:—")
|
||
# Keep it short; this line is a header, not a paragraph.
|
||
if len(title) > 72:
|
||
title = _compact_title(title, max_chars=72)
|
||
return title or "Source document"
|
||
|
||
|
||
def _render_time_journalist_intro(
|
||
*,
|
||
company: str,
|
||
report_title: str,
|
||
section_titles: list[str],
|
||
locale: str,
|
||
) -> list[str]:
|
||
if locale.lower().startswith("fr"):
|
||
# v2.2+ EN-first: avoid partial FR translation drift until we have a dedicated FR bible for this.
|
||
return []
|
||
|
||
topics = [t for t in section_titles if t and not t.strip().upper().startswith("ANNEX")]
|
||
topics = topics[:3]
|
||
if topics:
|
||
topics_line = "; ".join(f"**{_compact_title(t, max_chars=48)}**" for t in topics)
|
||
summary = f"In plain terms, the source walks through: {topics_line}."
|
||
else:
|
||
summary = "In plain terms, the source is a rollout guide with controls, caveats, and optimism."
|
||
|
||
return [
|
||
f"We have a soft spot for {company}. Today we’re covering **{report_title}**.",
|
||
summary,
|
||
"",
|
||
"How to read this dossier:",
|
||
"- You do not need the PDF open; we quote the source as we go.",
|
||
"- We add short Red Team notes where incentives turn controls into theater.",
|
||
"- If you want the receipts, start with the source hash and follow the links in the pack/trace.",
|
||
"",
|
||
"OK. Let’s dig.",
|
||
]
|
||
|
||
|
||
def _extract_annex_mermaids(md: str) -> list[tuple[str, str]]:
|
||
"""
|
||
Return [(diagram_name, mermaid_code)] from the v2.x Annex section.
|
||
"""
|
||
|
||
diagrams: list[tuple[str, str]] = []
|
||
lines = md.splitlines()
|
||
in_annex = False
|
||
i = 0
|
||
while i < len(lines):
|
||
ln = lines[i].strip()
|
||
if ln.startswith("## Annex"):
|
||
in_annex = True
|
||
i += 1
|
||
continue
|
||
if in_annex and ln.startswith("## ") and not ln.startswith("## Annex"):
|
||
# Past annex
|
||
break
|
||
if in_annex and ln.startswith("#### "):
|
||
name = ln[5:].strip()
|
||
# seek code fence
|
||
j = i + 1
|
||
while j < len(lines) and lines[j].strip() != "```mermaid":
|
||
if lines[j].strip().startswith("#### "):
|
||
break
|
||
j += 1
|
||
if j >= len(lines) or lines[j].strip() != "```mermaid":
|
||
i += 1
|
||
continue
|
||
k = j + 1
|
||
while k < len(lines) and lines[k].strip() != "```":
|
||
k += 1
|
||
code = "\n".join(lines[j + 1 : k]).strip()
|
||
if code:
|
||
diagrams.append((name, code))
|
||
i = k + 1
|
||
continue
|
||
i += 1
|
||
return diagrams
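
# Illustrative note (a sketch, assuming the Annex layout emitted by the v2.x generators): given
# markdown containing "## Annex (inferred diagrams)", a "#### Evidence drift loop" heading, and a
# ```mermaid fence beneath it, this returns [("Evidence drift loop", "<mermaid code>")]; a "#### "
# heading with no fence before the next heading is skipped.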
|
||
|
||
|
||
def _describe_mermaid_for_audio(*, name: str, code: str) -> str:
|
||
"""
|
||
Minimal, robust narration: describe the node sequence and call out any explicit decision points.
|
||
"""
|
||
|
||
code_lines = [ln.rstrip() for ln in (code or "").splitlines() if ln.strip()]
|
||
first = (code_lines[0] if code_lines else "").strip().lower()
|
||
|
||
# Build id->label map for flowcharts/state-ish diagrams.
|
||
id_to_label: dict[str, str] = {}
|
||
for ln in code_lines:
|
||
m = re.match(r"^\\s*([A-Za-z0-9_]+)\\s*\\[\\\"(.+?)\\\"\\]\\s*$", ln)
|
||
if m:
|
||
id_to_label[m.group(1)] = m.group(2)
|
||
continue
|
||
m = re.match(r"^\\s*([A-Za-z0-9_]+)\\s*\\[(.+?)\\]\\s*$", ln)
|
||
if m:
|
||
id_to_label[m.group(1)] = m.group(2)
|
||
continue
|
||
m = re.match(r"^\\s*([A-Za-z0-9_]+)\\s*\\{(.+?)\\}\\s*$", ln)
|
||
if m:
|
||
id_to_label[m.group(1)] = m.group(2)
|
||
continue
|
||
|
||
def label(node_id: str) -> str:
|
||
return (id_to_label.get(node_id) or node_id).strip().strip('"').strip()
|
||
|
||
if first.startswith("statediagram"):
|
||
transitions: list[tuple[str, str, str]] = []
|
||
for ln in code_lines:
|
||
m = re.match(r"^\\s*([A-Za-z0-9_\\*\\[\\]]+)\\s*-->\\s*([A-Za-z0-9_\\*\\[\\]]+)(?:\\s*:\\s*(.+))?$", ln)
|
||
if not m:
|
||
continue
|
||
transitions.append((m.group(1).strip(), m.group(2).strip(), (m.group(3) or "").strip()))
|
||
if not transitions:
|
||
return "State diagram showing a lifecycle that mostly loops in place."
|
||
parts = []
|
||
for a, b, note in transitions[:8]:
|
||
if note:
|
||
parts.append(f"{a} to {b} ({note})")
|
||
else:
|
||
parts.append(f"{a} to {b}")
|
||
return "State transitions: " + "; ".join(parts) + "."
|
||
|
||
# Default: flowchart-ish narration.
|
||
edges: list[tuple[str, str]] = []
|
||
decisions: list[str] = []
|
||
for ln in code_lines:
|
||
m = re.match(r"^\\s*([A-Za-z0-9_]+)\\s*-->\\s*\\|[^|]+\\|\\s*([A-Za-z0-9_]+)\\s*$", ln)
|
||
if m:
|
||
edges.append((m.group(1), m.group(2)))
|
||
continue
|
||
m = re.match(r"^\\s*([A-Za-z0-9_]+)\\s*-->\\s*([A-Za-z0-9_]+)\\s*$", ln)
|
||
if m:
|
||
edges.append((m.group(1), m.group(2)))
|
||
continue
|
||
m = re.match(r"^\\s*([A-Za-z0-9_]+)\\{(.+?)\\}\\s*$", ln)
|
||
if m:
|
||
decisions.append(m.group(2).strip())
|
||
|
||
ordered: list[str] = []
|
||
for a, b in edges:
|
||
if a not in ordered:
|
||
ordered.append(a)
|
||
if b not in ordered:
|
||
ordered.append(b)
|
||
|
||
if not ordered:
|
||
return "Flow chart showing a control workflow with loops and exception handling."
|
||
|
||
labels = [label(n) for n in ordered[:10]]
|
||
seq = " -> ".join(labels)
|
||
if decisions:
|
||
return f"Flow: {seq}. Decision points include: {', '.join(decisions[:2])}."
|
||
return f"Flow: {seq}."
|
||
|
||
|
||
def _insert_podcast_script_v2_2(
|
||
*,
|
||
md: str,
|
||
company: str,
|
||
report_title: str,
|
||
source_doc_url: str,
|
||
locale: str,
|
||
) -> str:
|
||
if locale.lower().startswith("fr"):
|
||
return md
|
||
|
||
diagrams = _extract_annex_mermaids(md)
|
||
diagram_lines: list[str] = []
|
||
for name, code in diagrams:
|
||
diagram_lines.extend([f"- {name}: {_describe_mermaid_for_audio(name=name, code=code)}"])
|
||
|
||
script: list[str] = [
|
||
"## Podcast Script (plain text)",
|
||
"",
|
||
"```text",
|
||
f"Today’s declassified dossier covers {company} — {report_title}.",
|
||
"",
|
||
"You do not need the source PDF open. This dossier quotes the source as it goes, then adds the Red Team lens: where the control turns into theater, and what has to be true for it to hold.",
|
||
"",
|
||
"If you want to verify later, use the source link and hash in the header.",
|
||
]
|
||
if source_doc_url:
|
||
script.extend(["", f"Source PDF: {source_doc_url}"])
|
||
|
||
script.extend(
|
||
[
|
||
"",
|
||
"We will cover the mirrored sections, then summarize the gates and stop conditions in the Action Pack.",
|
||
"",
|
||
"Diagram walkthrough (describe aloud):",
|
||
]
|
||
)
|
||
if diagram_lines:
|
||
script.extend(diagram_lines)
|
||
else:
|
||
script.append("- (No diagrams found.)")
|
||
|
||
script.extend(
|
||
[
|
||
"",
|
||
"Close:",
|
||
"If the calendar is the deliverable, then the risk is already in production — and the evidence is still in phase two.",
|
||
"```",
|
||
"",
|
||
]
|
||
)
|
||
|
||
# Insert right before the footer separator (last '---' outside fences).
|
||
lines = md.splitlines()
|
||
footer_sep_idx = None
|
||
in_fence = False
|
||
for idx, ln in enumerate(lines):
|
||
stripped = ln.strip()
|
||
if stripped.startswith("```"):
|
||
in_fence = not in_fence
|
||
continue
|
||
if in_fence:
|
||
continue
|
||
if stripped == "---":
|
||
footer_sep_idx = idx
|
||
if footer_sep_idx is None:
|
||
footer_sep_idx = len(lines)
|
||
|
||
out_lines = lines[:footer_sep_idx] + [""] + script + lines[footer_sep_idx:]
|
||
return "\n".join([ln.rstrip() for ln in out_lines]).strip() + "\n"
|
||
|
||
|
||
def _generate_dave_v1_3_mirror(*, source_text: str, source_path: str, action_pack: bool, locale: str) -> str:
|
||
today = _dt.date.today().isoformat()
|
||
normalized = _normalize_ocr(source_text)
|
||
extract_sha = _sha256_text(normalized)
|
||
source_file_sha = _sha256_file(source_path) if Path(source_path).exists() else "unknown"
|
||
ctx = _RenderContext(seed=extract_sha, locale=locale, voice="v1.3")
|
||
style_version = ctx.voice
|
||
|
||
action_pack = bool(action_pack) or _truthy_env("REVOICE_ACTION_PACK")
|
||
|
||
sections = _extract_sections(normalized)
|
||
if not sections:
|
||
raise ValueError("No content extracted from source")
|
||
|
||
cover_lines = [ln.strip() for ln in sections[0].body.splitlines() if ln.strip() and ln.strip().lower() != "snyk"]
|
||
cover_h1 = sections[0].title.strip() or ("DOSSIER DE L’OMBRE" if locale.lower().startswith("fr") else "SHADOW DOSSIER")
|
||
cover_h2 = " ".join(cover_lines[:2]).strip() if cover_lines else ""
|
||
|
||
y, m, d = today.split("-")
|
||
report_id = f"IF-RT-DAVE-{y}-{m}{d}"
|
||
source_basename = Path(source_path).name
|
||
project_slug = _slugify(Path(source_basename).stem + "-mirror")
|
||
source_slug = _slugify(source_basename)
|
||
filename_title = Path(source_basename).stem.replace("-", " ").replace("_", " ").strip()
|
||
if not filename_title:
|
||
filename_title = source_basename
|
||
|
||
if (
|
||
not cover_h1
|
||
or cover_h1.upper() == "COUVERTURE"
|
||
or _looks_like_site_footer(cover_h1)
|
||
or len(cover_h1) > 96
|
||
or "." in cover_h1
|
||
):
|
||
cover_h1 = filename_title
|
||
|
||
vertical_line = _infer_vertical_line(normalized_text=normalized, source_basename=source_basename, locale=locale)
|
||
|
||
out: list[str] = [
|
||
"---",
|
||
"BRAND: InfraFabric.io",
|
||
"UNIT: RED TEAM (STRATEGIC OPS)" if not locale.lower().startswith("fr") else "UNIT: RED TEAM (OPÉRATIONS STRATÉGIQUES)",
|
||
"DOCUMENT: SHADOW DOSSIER" if not locale.lower().startswith("fr") else "DOCUMENT: DOSSIER DE L’OMBRE",
|
||
"CLASSIFICATION: EYES ONLY // DAVE" if not locale.lower().startswith("fr") else "CLASSIFICATION: CONFIDENTIEL // DAVE",
|
||
"---",
|
||
"",
|
||
"# [ RED TEAM DECLASSIFIED ]" if not locale.lower().startswith("fr") else "# [ DÉCLASSIFIÉ – ÉQUIPE ROUGE ]",
|
||
f"## PROJECT: {project_slug}" if not locale.lower().startswith("fr") else f"## PROJET : {project_slug}",
|
||
f"### SOURCE: {source_slug}" if not locale.lower().startswith("fr") else f"### SOURCE : {source_slug}",
|
||
f"**INFRAFABRIC REPORT ID:** `{report_id}`" if not locale.lower().startswith("fr") else f"**ID DE RAPPORT INFRAFABRIC :** `{report_id}`",
|
||
"",
|
||
"> NOTICE: This document is a product of InfraFabric Red Team."
|
||
if not locale.lower().startswith("fr")
|
||
else "> AVIS : ce document est un produit de l’InfraFabric Red Team.",
|
||
"> It provides socio-technical friction analysis for how a rollout survives contact with incentives."
|
||
if not locale.lower().startswith("fr")
|
||
else "> Il fournit une analyse socio-technique des frictions : ce qui survit au contact des incitations.",
|
||
]
|
||
if vertical_line:
|
||
out.extend([vertical_line])
|
||
out.extend(
|
||
[
|
||
"",
|
||
"**[ ACCESS GRANTED: INFRAFABRIC RED TEAM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ ACCÈS AUTORISÉ : INFRAFABRIC ÉQUIPE ROUGE ]**",
|
||
"**[ STATUS: OPERATIONAL REALISM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ STATUT : RÉALISME OPÉRATIONNEL ]**",
|
||
"",
|
||
f"## {cover_h1}",
|
||
]
|
||
)
|
||
cover_h2_out = ""
|
||
if cover_h2:
|
||
if style_version == "v2.0":
|
||
# Avoid rendering body fragments / author blocks as a subtitle.
|
||
if not _looks_like_cover_subtitle_noise(cover_h2):
|
||
cover_h2_out = _compact_title(cover_h2, max_chars=90)
|
||
else:
|
||
cover_h2_out = cover_h2
|
||
if cover_h2_out:
|
||
out.extend([f"### {cover_h2_out}", ""])
|
||
else:
|
||
out.append("")
|
||
|
||
out.extend(
|
||
[
|
||
"> Shadow dossier (mirror-first)." if not locale.lower().startswith("fr") else "> Dossier de l’ombre (miroir d’abord).",
|
||
">",
|
||
"> Protocol: IF.DAVE.v1.3" if not locale.lower().startswith("fr") else "> Protocole : IF.DAVE.v1.3",
|
||
f"> Source: `{source_basename}`" if not locale.lower().startswith("fr") else f"> Source : `{source_basename}`",
|
||
f"> Generated: `{today}`" if not locale.lower().startswith("fr") else f"> Généré le : `{today}`",
|
||
f"> Source Hash (sha256): `{source_file_sha}`"
|
||
if not locale.lower().startswith("fr")
|
||
else f"> Empreinte source (sha256) : `{source_file_sha}`",
|
||
"",
|
||
]
|
||
)
|
||
|
||
for section in sections[1:]:
|
||
if section.title.strip().upper() == "INTRODUCTION":
|
||
out.append(_render_intro(section, ctx=ctx))
|
||
else:
|
||
out.append(_render_section(section, ctx=ctx))
|
||
out.append("")
|
||
|
||
if action_pack:
|
||
out.append(_render_action_pack(sections[1:]))
|
||
out.append("")
|
||
|
||
out.extend(
|
||
[
|
||
"---",
|
||
"",
|
||
"*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** for socio-technical friction analysis: https://infrafabric.io"
|
||
if not locale.lower().startswith("fr")
|
||
else "*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** (analyse socio-technique des frictions) : https://infrafabric.io",
|
||
"*Standard Dave Footer:* This document is intended for the recipient only. If you are not the recipient, please delete it and forget you saw anything. P.S. Please consider the environment before printing this email."
|
||
if not locale.lower().startswith("fr")
|
||
else "*Standard Dave Footer:* Ce document est destiné au seul destinataire. Si vous n’êtes pas le destinataire, veuillez le supprimer et oublier que vous l’avez vu. P.S. Veuillez considérer l’environnement avant d’imprimer ce document.",
|
||
]
|
||
)
|
||
|
||
return "\n".join(out).strip() + "\n"
|
||
|
||
|
||
def _generate_dave_v1_6_mirror(*, source_text: str, source_path: str, action_pack: bool, locale: str) -> str:
|
||
today = _dt.date.today().isoformat()
|
||
normalized = _normalize_ocr(source_text)
|
||
extract_sha = _sha256_text(normalized)
|
||
source_file_sha = _sha256_file(source_path) if Path(source_path).exists() else "unknown"
|
||
ctx = _RenderContext(seed=extract_sha, locale=locale, voice="v1.6")
|
||
style_version = ctx.voice
|
||
|
||
action_pack = bool(action_pack) or _truthy_env("REVOICE_ACTION_PACK")
|
||
|
||
sections = _extract_sections(normalized)
|
||
if not sections:
|
||
raise ValueError("No content extracted from source")
|
||
|
||
cover_lines = [ln.strip() for ln in sections[0].body.splitlines() if ln.strip() and ln.strip().lower() != "snyk"]
|
||
cover_h1 = sections[0].title.strip() or ("DOSSIER DE L’OMBRE" if locale.lower().startswith("fr") else "SHADOW DOSSIER")
|
||
cover_h2 = " ".join(cover_lines[:2]).strip() if cover_lines else ""
|
||
|
||
y, m, d = today.split("-")
|
||
report_id = f"IF-RT-DAVE-{y}-{m}{d}"
|
||
source_basename = Path(source_path).name
|
||
project_slug = _slugify(Path(source_basename).stem + "-mirror")
|
||
source_slug = _slugify(source_basename)
|
||
filename_title = Path(source_basename).stem.replace("-", " ").replace("_", " ").strip()
|
||
if not filename_title:
|
||
filename_title = source_basename
|
||
|
||
if (
|
||
not cover_h1
|
||
or cover_h1.upper() == "COUVERTURE"
|
||
or _looks_like_site_footer(cover_h1)
|
||
or len(cover_h1) > 96
|
||
or "." in cover_h1
|
||
):
|
||
cover_h1 = filename_title
|
||
|
||
vertical_line = _infer_vertical_line(normalized_text=normalized, source_basename=source_basename, locale=locale)
|
||
|
||
out: list[str] = [
|
||
"---",
|
||
"BRAND: InfraFabric.io",
|
||
"UNIT: RED TEAM (STRATEGIC OPS)" if not locale.lower().startswith("fr") else "UNIT: RED TEAM (OPÉRATIONS STRATÉGIQUES)",
|
||
"DOCUMENT: SHADOW DOSSIER" if not locale.lower().startswith("fr") else "DOCUMENT: DOSSIER DE L’OMBRE",
|
||
"CLASSIFICATION: EYES ONLY // DAVE" if not locale.lower().startswith("fr") else "CLASSIFICATION: CONFIDENTIEL // DAVE",
|
||
"---",
|
||
"",
|
||
"# [ RED TEAM DECLASSIFIED ]" if not locale.lower().startswith("fr") else "# [ DÉCLASSIFIÉ – ÉQUIPE ROUGE ]",
|
||
f"## PROJECT: {project_slug}" if not locale.lower().startswith("fr") else f"## PROJET : {project_slug}",
|
||
f"### SOURCE: {source_slug}" if not locale.lower().startswith("fr") else f"### SOURCE : {source_slug}",
|
||
f"**INFRAFABRIC REPORT ID:** `{report_id}`" if not locale.lower().startswith("fr") else f"**ID DE RAPPORT INFRAFABRIC :** `{report_id}`",
|
||
"",
|
||
"> NOTICE: This document is a product of InfraFabric Red Team."
|
||
if not locale.lower().startswith("fr")
|
||
else "> AVIS : ce document est un produit de l’InfraFabric Red Team.",
|
||
"> It exposes socio-technical frictions where incentives turn controls into theater."
|
||
if not locale.lower().startswith("fr")
|
||
else "> Il expose les frictions socio-techniques : là où les incitations transforment les contrôles en théâtre.",
|
||
]
|
||
if vertical_line:
|
||
out.extend([vertical_line])
|
||
out.extend(
|
||
[
|
||
"",
|
||
"**[ ACCESS GRANTED: INFRAFABRIC RED TEAM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ ACCÈS AUTORISÉ : INFRAFABRIC ÉQUIPE ROUGE ]**",
|
||
"**[ STATUS: OPERATIONAL REALISM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ STATUT : RÉALISME OPÉRATIONNEL ]**",
|
||
"",
|
||
f"## {cover_h1}",
|
||
]
|
||
)
|
||
cover_h2_out = ""
|
||
if cover_h2:
|
||
if style_version == "v2.0":
|
||
if not _looks_like_cover_subtitle_noise(cover_h2):
|
||
cover_h2_out = _compact_title(cover_h2, max_chars=90)
|
||
else:
|
||
cover_h2_out = cover_h2
|
||
if cover_h2_out:
|
||
out.extend([f"### {cover_h2_out}", ""])
|
||
else:
|
||
out.append("")
|
||
|
||
out.extend(
|
||
[
|
||
"> Shadow dossier (mirror-first)." if not locale.lower().startswith("fr") else "> Dossier de l’ombre (miroir d’abord).",
|
||
">",
|
||
"> Protocol: IF.DAVE.v1.6" if not locale.lower().startswith("fr") else "> Protocole : IF.DAVE.v1.6",
|
||
f"> Source: `{source_basename}`" if not locale.lower().startswith("fr") else f"> Source : `{source_basename}`",
|
||
f"> Generated: `{today}`" if not locale.lower().startswith("fr") else f"> Généré le : `{today}`",
|
||
f"> Source Hash (sha256): `{source_file_sha}`"
|
||
if not locale.lower().startswith("fr")
|
||
else f"> Empreinte source (sha256) : `{source_file_sha}`",
|
||
"",
|
||
]
|
||
)
|
||
|
||
for section in sections[1:]:
|
||
if section.title.strip().upper() == "INTRODUCTION":
|
||
out.append(_render_intro(section, ctx=ctx))
|
||
else:
|
||
out.append(_render_section(section, ctx=ctx))
|
||
out.append("")
|
||
|
||
if action_pack:
|
||
out.append(_render_action_pack(sections[1:]))
|
||
out.append("")
|
||
|
||
out.extend(
|
||
[
|
||
"---",
|
||
"",
|
||
"*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** for socio-technical friction analysis: https://infrafabric.io"
|
||
if not locale.lower().startswith("fr")
|
||
else "*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** (analyse socio-technique des frictions) : https://infrafabric.io",
|
||
"*Standard Dave Footer:* This document is intended for the recipient only. If you are not the recipient, please delete it and forget you saw anything. P.S. Please consider the environment before printing this email."
|
||
if not locale.lower().startswith("fr")
|
||
else "*Standard Dave Footer:* Ce document est destiné au seul destinataire. Si vous n’êtes pas le destinataire, veuillez le supprimer et oublier que vous l’avez vu. P.S. Veuillez considérer l’environnement avant d’imprimer ce document.",
|
||
]
|
||
)
|
||
|
||
return "\n".join(out).strip() + "\n"
|
||
|
||
|
||
def _generate_dave_v1_7_mirror(*, source_text: str, source_path: str, action_pack: bool, locale: str) -> str:
|
||
today = _dt.date.today().isoformat()
|
||
normalized = _normalize_ocr(source_text)
|
||
extract_sha = _sha256_text(normalized)
|
||
source_file_sha = _sha256_file(source_path) if Path(source_path).exists() else "unknown"
|
||
    ctx = _RenderContext(seed=extract_sha, locale=locale, voice="v1.7")
    style_version = ctx.voice
|
||
|
||
action_pack = bool(action_pack) or _truthy_env("REVOICE_ACTION_PACK")
|
||
|
||
sections = _extract_sections(normalized)
|
||
if not sections:
|
||
raise ValueError("No content extracted from source")
|
||
if len(sections) == 1:
|
||
# Some sources (notably HTML→Markdown mirrors) do not have reliable in-document structure.
|
||
# Keep the output reviewable by forcing a (cover + body) shape so downstream rendering,
|
||
# Action Pack generation, and per-section critique still work.
|
||
only = sections[0]
|
||
sections = [
|
||
_SourceSection(title=only.title, body="", why_it_matters=None),
|
||
_SourceSection(title="Overview" if not locale.lower().startswith("fr") else "Aperçu", body=only.body, why_it_matters=None),
|
||
]
|
||
|
||
cover_lines = [ln.strip() for ln in sections[0].body.splitlines() if ln.strip() and ln.strip().lower() != "snyk"]
|
||
cover_h1 = sections[0].title.strip() or ("DOSSIER DE L’OMBRE" if locale.lower().startswith("fr") else "SHADOW DOSSIER")
|
||
cover_h2 = " ".join(cover_lines[:2]).strip() if cover_lines else ""
|
||
|
||
y, m, d = today.split("-")
|
||
report_id = f"IF-RT-DAVE-{y}-{m}{d}"
|
||
source_basename = Path(source_path).name
|
||
project_slug = _slugify(Path(source_basename).stem + "-mirror")
|
||
source_slug = _slugify(source_basename)
|
||
filename_title = Path(source_basename).stem.replace("-", " ").replace("_", " ").strip()
|
||
if not filename_title:
|
||
filename_title = source_basename
|
||
|
||
if (
|
||
not cover_h1
|
||
or cover_h1.upper() == "COUVERTURE"
|
||
or _looks_like_site_footer(cover_h1)
|
||
or len(cover_h1) > 96
|
||
or "." in cover_h1
|
||
):
|
||
cover_h1 = filename_title
|
||
|
||
vertical_line = _infer_vertical_line(normalized_text=normalized, source_basename=source_basename, locale=locale)
|
||
|
||
out: list[str] = [
|
||
"---",
|
||
"BRAND: InfraFabric.io",
|
||
"UNIT: RED TEAM (STRATEGIC OPS)" if not locale.lower().startswith("fr") else "UNIT: RED TEAM (OPÉRATIONS STRATÉGIQUES)",
|
||
"DOCUMENT: SHADOW DOSSIER" if not locale.lower().startswith("fr") else "DOCUMENT: DOSSIER DE L’OMBRE",
|
||
"CLASSIFICATION: EYES ONLY // DAVE" if not locale.lower().startswith("fr") else "CLASSIFICATION: CONFIDENTIEL // DAVE",
|
||
"---",
|
||
"",
|
||
"# [ RED TEAM DECLASSIFIED ]" if not locale.lower().startswith("fr") else "# [ DÉCLASSIFIÉ – ÉQUIPE ROUGE ]",
|
||
f"## PROJECT: {project_slug}" if not locale.lower().startswith("fr") else f"## PROJET : {project_slug}",
|
||
f"### SOURCE: {source_slug}" if not locale.lower().startswith("fr") else f"### SOURCE : {source_slug}",
|
||
f"**INFRAFABRIC REPORT ID:** `{report_id}`" if not locale.lower().startswith("fr") else f"**ID DE RAPPORT INFRAFABRIC :** `{report_id}`",
|
||
"",
|
||
"> NOTICE: This document is a product of InfraFabric Red Team."
|
||
if not locale.lower().startswith("fr")
|
||
else "> AVIS : ce document est un produit de l’InfraFabric Red Team.",
|
||
"> It exposes socio-technical frictions where incentives turn controls into theater."
|
||
if not locale.lower().startswith("fr")
|
||
else "> Il expose les frictions socio-techniques : là où les incitations transforment les contrôles en théâtre.",
|
||
]
|
||
if vertical_line:
|
||
out.extend([vertical_line])
|
||
out.extend(
|
||
[
|
||
"",
|
||
"**[ ACCESS GRANTED: INFRAFABRIC RED TEAM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ ACCÈS AUTORISÉ : INFRAFABRIC ÉQUIPE ROUGE ]**",
|
||
"**[ STATUS: OPERATIONAL REALISM ]**"
|
||
if not locale.lower().startswith("fr")
|
||
else "**[ STATUT : RÉALISME OPÉRATIONNEL ]**",
|
||
"",
|
||
f"## {cover_h1}",
|
||
]
|
||
)
|
||
cover_h2_out = ""
|
||
if cover_h2:
|
||
if style_version == "v2.0":
|
||
if not _looks_like_cover_subtitle_noise(cover_h2):
|
||
cover_h2_out = _compact_title(cover_h2, max_chars=90)
|
||
else:
|
||
cover_h2_out = cover_h2
|
||
if cover_h2_out:
|
||
out.extend([f"### {cover_h2_out}", ""])
|
||
else:
|
||
out.append("")
|
||
|
||
out.extend(
|
||
[
|
||
"> Shadow dossier (mirror-first)." if not locale.lower().startswith("fr") else "> Dossier de l’ombre (miroir d’abord).",
|
||
">",
|
||
"> Protocol: IF.DAVE.v1.7" if not locale.lower().startswith("fr") else "> Protocole : IF.DAVE.v1.7",
|
||
f"> Source: `{source_basename}`" if not locale.lower().startswith("fr") else f"> Source : `{source_basename}`",
|
||
f"> Generated: `{today}`" if not locale.lower().startswith("fr") else f"> Généré le : `{today}`",
|
||
f"> Source Hash (sha256): `{source_file_sha}`"
|
||
if not locale.lower().startswith("fr")
|
||
else f"> Empreinte source (sha256) : `{source_file_sha}`",
|
||
"",
|
||
]
|
||
)
|
||
|
||
for section in sections[1:]:
|
||
if section.title.strip().upper() == "INTRODUCTION":
|
||
out.append(_render_intro(section, ctx=ctx))
|
||
else:
|
||
out.append(_render_section(section, ctx=ctx))
|
||
out.append("")
|
||
|
||
if action_pack:
|
||
out.append(_render_action_pack(sections[1:]))
|
||
out.append("")
|
||
|
||
# v1.7 requires >=2 Mermaid diagrams; some short sources may not naturally produce enough.
|
||
# Add supplemental *inferred* diagrams only when needed, without adding new factual claims.
|
||
if locale.lower().startswith("fr"):
|
||
mermaid_section_title = "## Annexes (diagrammes inférés)"
|
||
mermaid_note = "_Diagrammes inférés : synthèse InfraFabric Red Team (sans nouvelles affirmations factuelles)._"
|
||
evidence_label = "Boucle de dérive de preuve (inférée)"
|
||
exception_label = "Stase d’exception (inférée)"
|
||
else:
|
||
mermaid_section_title = "## Annex (inferred diagrams)"
|
||
mermaid_note = "_Inferred diagrams: InfraFabric Red Team synthesis (no new factual claims)._"
|
||
evidence_label = "Evidence drift loop (inferred)"
|
||
exception_label = "Exception stasis (inferred)"
|
||
|
||
current_md = "\n".join(out)
|
||
    mermaid_count = len(re.findall(r"```mermaid\b", current_md))
|
||
if mermaid_count < 2:
|
||
out.extend([mermaid_section_title, "", mermaid_note, ""])
|
||
if mermaid_count < 1:
|
||
out.extend(
|
||
[
|
||
f"### {evidence_label}",
|
||
"",
|
||
"```mermaid",
|
||
"flowchart TD",
|
||
" A[Control intent] --> B[Manual evidence requested]",
|
||
" B --> C[Artifact produced]",
|
||
" C --> D[Dashboard goes green]",
|
||
" D --> E[Exceptions accumulate]",
|
||
" E --> F[Definition of \"compliance\" shifts]",
|
||
" F --> B",
|
||
"```",
|
||
"",
|
||
]
|
||
)
|
||
out.extend(
|
||
[
|
||
f"### {exception_label}",
|
||
"",
|
||
"```mermaid",
|
||
"stateDiagram-v2",
|
||
" [*] --> Requested",
|
||
" Requested --> PendingReview: \"needs alignment\"",
|
||
" PendingReview --> PendingReview: renewal",
|
||
" PendingReview --> Approved: silence",
|
||
" Approved --> Approved: \"temporary\" extension",
|
||
"```",
|
||
"",
|
||
]
|
||
)
|
||
|
||
out.extend(
|
||
[
|
||
"---",
|
||
"",
|
||
"*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** for socio-technical friction analysis: https://infrafabric.io"
|
||
if not locale.lower().startswith("fr")
|
||
else "*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** (analyse socio-technique des frictions) : https://infrafabric.io",
|
||
"*Standard Dave Footer:* This document is intended for the recipient only. If you are not the recipient, please delete it and forget you saw anything. P.S. Please consider the environment before printing this email."
|
||
if not locale.lower().startswith("fr")
|
||
else "*Standard Dave Footer:* Ce document est destiné au seul destinataire. Si vous n’êtes pas le destinataire, veuillez le supprimer et oublier que vous l’avez vu. P.S. Veuillez considérer l’environnement avant d’imprimer ce document.",
|
||
]
|
||
)
|
||
|
||
return "\n".join(out).strip() + "\n"
|
||
|
||
|
||
def _extract_claim_lines(*, normalized_text: str, max_items: int = 7) -> list[str]:
|
||
lines = [ln.strip() for ln in normalized_text.splitlines()]
|
||
seen: set[str] = set()
|
||
|
||
def should_join(prev: str, nxt: str) -> bool:
|
||
if not nxt:
|
||
return False
|
||
prev_s = prev.rstrip()
|
||
nxt_s = nxt.lstrip()
|
||
if not prev_s or not nxt_s:
|
||
return False
|
||
# Classic PDF hyphenation: "AI-" + "generated".
|
||
if prev_s.endswith("-") and not prev_s.endswith(" -"):
|
||
return True
|
||
# If the previous line doesn't look sentence-terminated, allow one-line continuation.
|
||
if not prev_s.endswith((".", "!", "?", ":", "…")) and len(prev_s) < 160:
|
||
# Continuation lines often start lower-case, digits, or punctuation.
|
||
if nxt_s[:1].islower() or nxt_s[:1].isdigit() or nxt_s.startswith(("(", "[", "{", "“", "\"", "’", "'")):
|
||
return True
|
||
return False
|
||
|
||
def base_filter(s: str) -> bool:
|
||
if not s or len(s) < 14:
|
||
return False
|
||
lower = s.lower().strip()
|
||
if "http://" in lower or "https://" in lower:
|
||
return False
|
||
if lower in {"markdown content:", "url source:"}:
|
||
return False
|
||
if "trace id" in lower:
|
||
return False
|
||
# Page markers from extract stage.
|
||
if "===== page-" in lower or lower.startswith("====="):
|
||
return False
|
||
# Avoid email-heavy table rows (these belong in the mirrored table, not in the claims register).
|
||
if "@" in s and not any(tok in s for tok in ["%", "$", "€", "£"]):
|
||
return False
|
||
# Skip obvious code/config fragments.
|
||
if any(ch in s for ch in ["{", "}", "```"]):
|
||
return False
|
||
# Avoid pure page numbers.
|
||
if s.isdigit() and len(s) <= 4:
|
||
return False
|
||
return True
|
||
|
||
def score_claim(s: str) -> int:
|
||
lower = s.lower()
|
||
score = 0
|
||
if "%" in s or "$" in s or "€" in s or "£" in s:
|
||
score += 3
|
||
if re.search(r"\b\d{3,}\b", s):
|
||
score += 2
|
||
if re.search(r"\b\d{2,}\b", s):
|
||
score += 1
|
||
if re.search(r"\b\d+\s*(?:ms|sec|secs|seconds|min|mins|minutes|hour|hours|day|days|week|weeks|month|months|year|years|gb|tb|mb|kb|x)\b", lower):
|
||
score += 2
|
||
if any(kw in lower for kw in ["study", "studies", "roi", "increase", "decrease", "reduction", "saves", "save", "cost", "faster", "fidelity"]):
|
||
score += 1
|
||
# Penalties for common low-signal junk.
|
||
if "datasheet" in lower:
|
||
score -= 5
|
||
if "screenshot" in lower and "%" not in s:
|
||
score -= 3
|
||
if "all rights reserved" in lower or "copyright" in lower:
|
||
score -= 3
|
||
if "microsoft.com/devcontainers" in lower or "devcontainers" in lower:
|
||
score -= 4
|
||
if len(s) < 40:
|
||
score -= 1
|
||
return score
|
||
|
||
candidates: list[tuple[int, int, str]] = []
|
||
i = 0
|
||
while i < len(lines):
|
||
ln = lines[i]
|
||
if not base_filter(ln):
|
||
i += 1
|
||
continue
|
||
merged = ln
|
||
# Stitch a small number of continuation lines to avoid truncated claims.
|
||
stitched = 0
|
||
while i + 1 < len(lines) and stitched < 3 and should_join(merged, lines[i + 1]):
|
||
nxt = lines[i + 1]
|
||
if merged.rstrip().endswith("-") and not merged.rstrip().endswith(" -"):
|
||
prefix = merged.rstrip()[:-1]
|
||
last_tok = (prefix.split()[-1] if prefix.split() else "").strip()
|
||
# Heuristic: short tokens like AI-/IDE-/PR- are often meaningful hyphens (keep "-").
|
||
keep_dash = 0 < len(last_tok) <= 3
|
||
merged = (merged.rstrip() + nxt.lstrip()) if keep_dash else (prefix + nxt.lstrip())
|
||
else:
|
||
merged = merged.rstrip() + " " + nxt.lstrip()
|
||
i += 1
|
||
stitched += 1
|
||
merged = " ".join(merged.split()).strip()
|
||
if not merged:
|
||
i += 1
|
||
continue
|
||
if merged.rstrip().endswith("-") and not merged.rstrip().endswith(" -"):
|
||
i += 1
|
||
continue
|
||
if not (re.search(r"\d", merged) or "%" in merged or "$" in merged or "€" in merged or "£" in merged):
|
||
i += 1
|
||
continue
|
||
# Skip obviously broken glyph runs.
|
||
if sum(1 for ch in merged if " " <= ch <= "~") < max(8, int(len(merged) * 0.5)):
|
||
i += 1
|
||
continue
|
||
key = merged.lower()
|
||
if key in seen:
|
||
i += 1
|
||
continue
|
||
seen.add(key)
|
||
s = score_claim(merged)
|
||
if s > 0:
|
||
candidates.append((s, i, merged))
|
||
i += 1
|
||
|
||
# Choose the strongest measurable claims first, stable-tie on appearance order.
|
||
candidates.sort(key=lambda t: (-t[0], t[1]))
|
||
return [c for _, _, c in candidates[:max_items]]
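
# Illustrative example (a sketch of the stitching + scoring above, not a test fixture): a wrapped
# line without terminal punctuation is joined to its continuation, and the merged line only
# survives if it carries a measurable figure (%, currency, or a number).
#   _extract_claim_lines(normalized_text="Teams saw a 43% reduction in\ntriage time after rollout.")
#   -> ["Teams saw a 43% reduction in triage time after rollout."]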
|
||
|
||
|
||
def _looks_like_government_standard(*, normalized_text: str, source_basename: str) -> bool:
|
||
s = f"{source_basename}\n{normalized_text}".lower()
|
||
return any(
|
||
kw in s
|
||
for kw in [
|
||
"nist sp",
|
||
"special publication",
|
||
"800-207",
|
||
"zero trust",
|
||
"nvlpubs.nist.gov",
|
||
]
|
||
)
|
||
|
||
|
||
def _render_translation_table(*, normalized_text: str, locale: str) -> str:
|
||
# Red-team synthesis: only include rows for terms that actually appear in the source text.
|
||
candidates: list[tuple[str, str]] = [
|
||
("Policy Decision Point (PDP)", "Gate: policy evaluation; Stop: deny when policy cannot be evaluated per-request"),
|
||
("Policy Enforcement Point (PEP)", "Gate: enforcement path; Stop: deny when enforcement is bypassable or unaudited"),
|
||
("Continuous diagnostics", "Gate: posture checks; Stop: deny when posture signals are stale/missing"),
|
||
("Least privilege", "Gate: authorization scope; Stop: deny when scopes exceed role baseline"),
|
||
("Micro-segmentation", "Gate: network access; Stop: deny lateral movement outside declared paths"),
|
||
("Implicit trust", "Gate: network admission; Stop: deny if access is granted by location/ownership alone"),
|
||
]
|
||
present: list[tuple[str, str]] = []
|
||
hay = normalized_text.lower()
|
||
for term, mapping in candidates:
|
||
if term.lower().split(" (")[0] in hay or term.lower() in hay:
|
||
present.append((term, mapping))
|
||
|
||
if not present:
|
||
return ""
|
||
|
||
if locale.lower().startswith("fr"):
|
||
title = "## Table de traduction (source → portes de contrôle)"
|
||
note = "_Synthèse InfraFabric Red Team : transformer la prose normative en portes opposables (sans nouvelles affirmations factuelles)._"
|
||
col_a = "Terme (source)"
|
||
col_b = "Traduction opérationnelle (porte)"
|
||
else:
|
||
title = "## Translation Table (source → gates)"
|
||
note = "_InfraFabric Red Team synthesis: translate standard prose into opposable gates (no new factual claims)._"
|
||
col_a = "Source term"
|
||
col_b = "Operational translation (gate)"
|
||
|
||
out = [title, "", note, "", f"| {col_a} | {col_b} |", "| --- | --- |"]
|
||
for term, mapping in present[:12]:
|
||
out.append(f"| {term} | {mapping} |")
|
||
return "\n".join(out).strip()
|
||
|
||
|
||
def _normalize_mermaid_block(text: str) -> str:
|
||
return "\n".join([ln.rstrip() for ln in str(text).strip().splitlines() if ln.strip()])
|
||
|
||
|
||
def _diagram_label(diagram: str, *, locale: str) -> str:
|
||
d = diagram.lower()
|
||
if "pendingreview" in d or "exception" in d:
|
||
return "Stase d’exception" if locale.lower().startswith("fr") else "Exception stasis"
|
||
if "screenshot" in d or "artifact" in d or "evidence" in d or "attestation" in d:
|
||
return "Boucle de dérive de preuve" if locale.lower().startswith("fr") else "Evidence drift loop"
|
||
if "policy decision point" in d or "pdp" in d or "policy enforcement point" in d or "pep" in d:
|
||
return "Chaîne PDP/PEP" if locale.lower().startswith("fr") else "PDP/PEP chain"
|
||
if "rag store" in d or ("llm" in d and "tools" in d):
|
||
return "Architecture LLM + outils" if locale.lower().startswith("fr") else "LLM toolchain architecture"
|
||
if "questionnaire" in d or "vendor" in d or "third-party" in d:
|
||
return "Boucle tiers / fournisseurs" if locale.lower().startswith("fr") else "Third‑party loop"
|
||
return "Boucle de contrôle (inférée)" if locale.lower().startswith("fr") else "Inferred control loop"
|
||
|
||
|
||
def _apply_dave_v2_0_postprocess(md: str, *, locale: str) -> str:
    """
    IF.DAVE v2.0 hardening pass:
    - Limit Dave Factor callouts to 1–2 per dossier (keep highest-signal sections).
    - Deduplicate Mermaid diagrams: render each unique diagram once in an Annex and reference it in-body.
    """

    lines = md.splitlines()

    # Identify the footer separator (last '---') so we can insert an Annex above it.
    footer_sep_idx = None
    in_fence = False
    for idx, ln in enumerate(lines):
        stripped = ln.strip()
        if stripped.startswith("```"):
            in_fence = not in_fence
            continue
        if in_fence:
            continue
        if stripped == "---":
            footer_sep_idx = idx
    if footer_sep_idx is None:
        footer_sep_idx = len(lines)

    # 1) Callout budget: find callout blocks and keep up to 2.
    callouts: list[dict] = []
    current_section = ""
    i = 0
    while i < len(lines):
        ln = lines[i]
        stripped = ln.strip()
        if stripped.startswith("## "):
            current_section = stripped[3:].strip()
        is_callout = stripped.startswith("> **The Dave Factor:**") or stripped.startswith("> **Le facteur Dave :**")
        if is_callout:
            start = i
            j = i
            while j < len(lines) and lines[j].strip().startswith(">"):
                j += 1
            block_text = "\n".join(lines[start:j]).strip()
            callouts.append({"start": start, "end": j, "section": current_section, "text": block_text})
            i = j
            continue
        i += 1

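    # Ranking heuristic for the callout budget below: callouts tied to evidence requests,
    # screenshots/attestations, and audit sections score highest and survive the 2-callout cap first.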
    def score_callout(section_title: str, text: str) -> int:
        s = (section_title or "").upper()
        t = (text or "").lower()
        score = 0
        if any(k in s for k in ["REQUEST EVIDENCE", "ACCESS REQUEST", "LOCAL SECURITY"]):
            score += 120
        if "screenshot" in t or "attestation" in t:
            score += 110
        if "audit" in s or "compliance" in s:
            score += 95
        if "training" in s or "quiz" in t or "certificate" in t:
            score += 75
        if any(k in t for k in ["fips", "piv", "fido", "aal"]):
            score += 70
        if "roadmap" in t or "alignment session" in t:
            score += 25
        return score

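    # Keep at most two callouts, preferring the highest-scoring ones; near-duplicates (same
    # Dave Factor / Countermeasure lines) are collapsed via a sha256 signature before counting.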
    keep_callouts: set[int] = set()
    if callouts:
        ranked = sorted(
            enumerate(callouts),
            key=lambda it: (-score_callout(it[1]["section"], it[1]["text"]), it[1]["start"]),
        )
        seen_hashes: set[str] = set()
        for idx, c in ranked:
            sig_lines: list[str] = []
            for ln in str(c["text"]).splitlines():
                s = ln.strip()
                if s.startswith("> **The Dave Factor:**") or s.startswith("> **Le facteur Dave :**"):
                    sig_lines.append(s)
                if s.startswith("> **Countermeasure:**") or s.startswith("> **Contre-mesure :**"):
                    sig_lines.append(s)
                if len(sig_lines) >= 2:
                    break
            signature = "\n".join(sig_lines).strip() or str(c["text"]).strip()
            h = _sha256_text(signature)
            if h in seen_hashes:
                continue
            keep_callouts.add(idx)
            seen_hashes.add(h)
            if len(keep_callouts) >= 2:
                break

    # 2) Mermaid dedupe: collect all mermaid code fences, remove in-body blocks and replace with references.
    diagrams: list[dict] = []
    current_section = ""
    in_other_fence = False
    i = 0
    while i < len(lines):
        stripped = lines[i].strip()
        if stripped.startswith("```") and stripped != "```mermaid":
            in_other_fence = not in_other_fence
            i += 1
            continue
        if in_other_fence:
            i += 1
            continue
        if stripped.startswith("## "):
            current_section = stripped[3:].strip()
            i += 1
            continue
        if stripped == "```mermaid":
            start = i
            j = i + 1
            while j < len(lines) and lines[j].strip() != "```":
                j += 1
            end = min(len(lines), j + 1)
            raw = "\n".join(lines[start + 1 : j])
            norm = _normalize_mermaid_block(raw)
            if norm:
                diagrams.append(
                    {
                        "start": start,
                        "end": end,
                        "section": current_section,
                        "raw": raw.rstrip(),
                        "norm": norm,
                    }
                )
            i = end
            continue
        i += 1

    unique_diagrams: list[dict] = []
    seen: set[str] = set()
    for dgm in diagrams:
        if dgm["norm"] in seen:
            continue
        seen.add(dgm["norm"])
        unique_diagrams.append(dgm)

    # Guarantee at least two diagrams by adding safe inferred defaults if needed.
    def ensure_default_diagram(content: str) -> None:
        norm = _normalize_mermaid_block(content)
        if not norm or norm in seen:
            return
        seen.add(norm)
        unique_diagrams.append({"raw": content.rstrip(), "norm": norm, "section": ""})

    if len(unique_diagrams) < 2:
        ensure_default_diagram(
            "\n".join(
                [
                    "flowchart TD",
                    ' A["Control intent"] --> B["Manual evidence requested"]',
                    ' B --> C["Artifact produced"]',
                    ' C --> D["Dashboard goes green"]',
                    ' D --> E["Exceptions accumulate"]',
                    ' E --> F["Definition of compliance shifts"]',
                    " F --> B",
                ]
            )
        )
    if len(unique_diagrams) < 2:
        ensure_default_diagram(
            "\n".join(
                [
                    "stateDiagram-v2",
                    " [*] --> Requested",
                    ' Requested --> PendingReview: \"needs alignment\"',
                    " PendingReview --> PendingReview: renewal",
                    " PendingReview --> Approved: silence",
                    ' Approved --> Approved: \"temporary\" extension',
                ]
            )
        )

    # Build stable diagram names.
    label_counts: dict[str, int] = {}
    diagram_names: dict[str, str] = {}
    for dgm in unique_diagrams:
        label = _diagram_label(dgm.get("raw", ""), locale=locale)
        label_counts[label] = label_counts.get(label, 0) + 1
    used_seq: dict[str, int] = {}
    for dgm in unique_diagrams:
        label = _diagram_label(dgm.get("raw", ""), locale=locale)
        used_seq[label] = used_seq.get(label, 0) + 1
        suffix = f" ({used_seq[label]})" if label_counts.get(label, 0) > 1 else ""
        diagram_names[dgm["norm"]] = f"{label}{suffix}"

    # Rebuild document: remove callouts beyond budget, remove mermaid blocks, and insert Annex before footer.
    remove_ranges: list[tuple[int, int]] = []
    for idx, c in enumerate(callouts):
        if idx not in keep_callouts:
            remove_ranges.append((c["start"], c["end"]))
    for dgm in diagrams:
        # Remove the optional diagram heading directly above inferred diagrams (best effort).
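        # Assumed layout: a "### … diagram" heading two lines above the fence, separated by a blank line.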
start = dgm["start"]
|
||
maybe_heading = start - 2
|
||
if maybe_heading >= 0:
|
||
h0 = lines[maybe_heading].strip()
|
||
h1 = lines[maybe_heading + 1].strip() if maybe_heading + 1 < len(lines) else ""
|
||
if h0.startswith("###") and "diagram" in h0.lower() and h1 == "":
|
||
start = maybe_heading
|
||
remove_ranges.append((start, dgm["end"]))
|
||
|
||
# Merge / normalize ranges.
|
||
remove_ranges.sort()
|
||
merged: list[tuple[int, int]] = []
|
||
for start, end in remove_ranges:
|
||
if not merged or start > merged[-1][1]:
|
||
merged.append((start, end))
|
||
else:
|
||
merged[-1] = (merged[-1][0], max(merged[-1][1], end))
|
||
|
||
out_lines: list[str] = []
|
||
i = 0
|
||
range_idx = 0
|
||
referenced: set[str] = set()
|
||
while i < len(lines):
|
||
if range_idx < len(merged) and i == merged[range_idx][0]:
|
||
end = merged[range_idx][1]
|
||
# If this range was a diagram, replace with a reference line (based on the diagram norm if we can find it).
|
||
# Best effort: find the mermaid start inside this range.
|
||
ref = None
|
||
for dgm in diagrams:
|
||
if dgm["start"] >= merged[range_idx][0] and dgm["end"] <= merged[range_idx][1]:
|
||
name = diagram_names.get(dgm["norm"])
|
||
if name:
|
||
if dgm["norm"] not in referenced:
|
||
ref = f"See Annex: {name}."
|
||
referenced.add(dgm["norm"])
|
||
break
|
||
if ref:
|
||
out_lines.append(ref)
|
||
out_lines.append("")
|
||
i = end
|
||
range_idx += 1
|
||
continue
|
||
out_lines.append(lines[i])
|
||
i += 1
|
||
|
||
    # Remove empty/legacy inferred-diagram annex headings (we insert our own).
    legacy_annex_titles = {
        "## Annex (inferred diagrams)",
        "## Annexes (diagrammes inférés)",
    }
    cleaned: list[str] = []
    in_legacy_annex = False
    for ln in out_lines:
        stripped = ln.strip()
        if stripped in legacy_annex_titles:
            in_legacy_annex = True
            continue
        if in_legacy_annex and stripped == "---":
            in_legacy_annex = False
            cleaned.append(ln)
            continue
        if in_legacy_annex and stripped.startswith("## "):
            in_legacy_annex = False
        if in_legacy_annex:
            continue
        cleaned.append(ln)

    # Insert annex right before footer separator.
    footer_sep_idx = None
    in_fence = False
    for idx, ln in enumerate(cleaned):
        stripped = ln.strip()
        if stripped.startswith("```"):
            in_fence = not in_fence
            continue
        if in_fence:
            continue
        if stripped == "---":
            footer_sep_idx = idx
    if footer_sep_idx is None:
        footer_sep_idx = len(cleaned)

    if locale.lower().startswith("fr"):
        annex_title = "## Annexes (actifs partagés)"
        annex_note = "_Diagrammes dédupliqués : chaque schéma unique est rendu une fois ici ; les sections y renvoient._"
        diag_title = "### Diagrammes (dédupliqués)"
    else:
        annex_title = "## Annex (shared assets)"
        annex_note = "_Deduped diagrams: each unique diagram is rendered once here; sections reference it by name._"
        diag_title = "### Diagrams (deduped)"

    annex_lines = [annex_title, "", annex_note, "", diag_title, ""]
    for dgm in unique_diagrams:
        name = diagram_names.get(dgm["norm"]) or _diagram_label(dgm.get("raw", ""), locale=locale)
        annex_lines.extend([f"#### {name}", "", "```mermaid", dgm.get("raw", "").rstrip(), "```", ""])

    final_lines = cleaned[:footer_sep_idx] + [""] + annex_lines + cleaned[footer_sep_idx:]
    return "\n".join([ln.rstrip() for ln in final_lines]).strip() + "\n"


def _merge_consecutive_sections_by_title(sections: list[_SourceSection]) -> list[_SourceSection]:
    """
    Extraction sometimes yields many page-level "sections" with the same repeated header title.
    For v2.0+ we merge consecutive equal-titled sections to prevent template repetition.
    """
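    # Only adjacent duplicates are folded here; titles that repeat non-consecutively are
    # handled by _merge_repeated_titles_globally.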

    if len(sections) < 3:
        return sections

    def norm(title: str) -> str:
        s = " ".join((title or "").split()).strip()
        s = re.sub(r"https?://\S+", "", s).strip()
        s = re.sub(r"(?i)\bthis publication is available free of charge from\b:?.*$", "", s).strip()
        s = " ".join(s.split()).strip()
        return s.upper()

    merged: list[_SourceSection] = [sections[0]]
    for sec in sections[1:]:
        if merged and norm(sec.title) == norm(merged[-1].title):
            prev = merged[-1]
            body = "\n\n".join([prev.body.strip(), sec.body.strip()]).strip()
            why_prev = (prev.why_it_matters or "").strip()
            why_new = (sec.why_it_matters or "").strip()
            why = why_prev or why_new or None
            if why_prev and why_new and why_new not in why_prev:
                why = "\n".join([why_prev, why_new]).strip()
            merged[-1] = _SourceSection(title=prev.title, body=body, why_it_matters=why)
        else:
            merged.append(sec)
    return merged


def _merge_repeated_titles_globally(sections: list[_SourceSection], *, min_repeats: int = 3) -> list[_SourceSection]:
    """
    If a title repeats many times across extracted sections, it's usually a page header.
    Merge all instances into the first occurrence to avoid template repetition.
    """
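    # Illustrative example: a page header such as "NIST SP 800-207" repeated on every
    # extracted page is folded into its first occurrence; distinct titles are left in place.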

    if len(sections) < 3:
        return sections

    def norm(title: str) -> str:
        s = " ".join((title or "").split()).strip()
        s = re.sub(r"https?://\S+", "", s).strip()
        s = re.sub(r"(?i)\bthis publication is available free of charge from\b:?.*$", "", s).strip()
        s = " ".join(s.split()).strip()
        return s.upper()

    counts: dict[str, int] = {}
    for sec in sections[1:]:
        key = norm(sec.title)
        if not key:
            continue
        counts[key] = counts.get(key, 0) + 1

    repeated = {k for k, n in counts.items() if n >= min_repeats}
    if not repeated:
        return sections

    out: list[_SourceSection] = [sections[0]]
    merged_by_title: dict[str, _SourceSection] = {}
    out_idx_by_title: dict[str, int] = {}
    for sec in sections[1:]:
        key = norm(sec.title)
        if key in repeated:
            if key not in out_idx_by_title:
                out_idx_by_title[key] = len(out)
                merged_by_title[key] = sec
                out.append(sec)
            else:
                out_idx = out_idx_by_title[key]
                prev = merged_by_title[key]
                body = "\n\n".join([prev.body.strip(), sec.body.strip()]).strip()
                why_prev = (prev.why_it_matters or "").strip()
                why_new = (sec.why_it_matters or "").strip()
                why = why_prev or why_new or None
                if why_prev and why_new and why_new not in why_prev:
                    why = "\n".join([why_prev, why_new]).strip()
                merged = _SourceSection(title=prev.title, body=body, why_it_matters=why)
                merged_by_title[key] = merged
                out[out_idx] = merged
        else:
            out.append(sec)
    return out


def _generate_dave_v1_8_mirror(*, source_text: str, source_path: str, action_pack: bool, locale: str, style_version: str = "v1.8") -> str:
    today = _dt.date.today().isoformat()
    normalized = _normalize_ocr(source_text)
    extract_sha = _sha256_text(normalized)
    source_file_sha = _sha256_file(source_path) if Path(source_path).exists() else "unknown"
    ctx = _RenderContext(seed=extract_sha, locale=locale, voice=style_version)

    # v1.8+ defaults Action Pack ON unless explicitly disabled.
    action_pack_enabled = (not _truthy_env("REVOICE_NO_ACTION_PACK")) or bool(action_pack) or _truthy_env("REVOICE_ACTION_PACK")

    sections = _extract_sections(normalized)
    if style_version in {"v2.0", "v2.1", "v2.2", "v2.3"}:
        sections = _merge_consecutive_sections_by_title(sections)
        sections = _merge_repeated_titles_globally(sections)
    if not sections:
        raise ValueError("No content extracted from source")
    if len(sections) == 1:
        only = sections[0]
        sections = [
            _SourceSection(title=only.title, body="", why_it_matters=None),
            _SourceSection(title="Overview" if not locale.lower().startswith("fr") else "Aperçu", body=only.body, why_it_matters=None),
        ]

    # Minimum content contract: mark degraded (and optionally gate-fail) instead of silently shipping emptiness.
    non_empty_sections = [s for s in sections[1:] if (s.body or "").strip()]
    total_body_chars = sum(len((s.body or "").strip()) for s in non_empty_sections)
    # Mirror completeness is relative to source size: two-page briefs shouldn't be forced into 3+ sections.
    pages = _parse_pages(normalized)
    non_empty_pages = sum(1 for _no, p in pages if (p or "").strip())
    page_count = max(1, non_empty_pages)
    min_sections_required = min(3, max(1, page_count - 1))
    min_chars_required = min(3000, max(800, page_count * 800))
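    # Worked example (illustrative): a three-page source must yield at least 2 non-empty
    # sections and 2,400 characters of mirrored body text to count as a complete mirror.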

    frame_blocked = "unauthorized frame window" in normalized.lower()
    nav_heavy = _looks_like_navigation_heavy_source(source_text)

    mirror_ok = (not frame_blocked) and (not nav_heavy) and (len(non_empty_sections) >= min_sections_required) and (total_body_chars >= min_chars_required)
    mirror_status = "OK" if mirror_ok else "DEGRADED"
    if mirror_ok:
        mirror_reason = ""
    elif frame_blocked:
        mirror_reason = "FRAME_BLOCKED_SOURCE"
    elif nav_heavy:
        mirror_reason = "NAVIGATION_HEAVY_SOURCE"
    else:
        mirror_reason = "INSUFFICIENT_MIRROR"
    if _truthy_env("REVOICE_QUALITY_GATE") and not mirror_ok:
        raise ValueError(f"QUALITY_GATE_FAILED:{mirror_reason}")

    cover_lines = [ln.strip() for ln in sections[0].body.splitlines() if ln.strip()]
    cover_h1 = sections[0].title.strip() or ("DOSSIER DE L’OMBRE" if locale.lower().startswith("fr") else "SHADOW DOSSIER")
    cover_h2 = " ".join(cover_lines[:2]).strip() if cover_lines else ""

    y, m, d = today.split("-")
    report_id = f"IF-RT-DAVE-{y}-{m}{d}"
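    # e.g. "IF-RT-DAVE-2025-0614" for a dossier generated on 2025-06-14 (illustrative date).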
    source_basename = Path(source_path).name
    project_slug = _slugify(Path(source_basename).stem + "-mirror")
    source_slug = _slugify(source_basename)
    filename_title = Path(source_basename).stem.replace("-", " ").replace("_", " ").strip() or source_basename
    source_doc_url = ""
    if source_file_sha != "unknown":
        source_doc_url = f"https://infrafabric.io/static/source/{source_file_sha}.pdf"
    source_file_sha_short = ""
    if source_file_sha != "unknown" and len(source_file_sha) >= 12:
        source_file_sha_short = f"{source_file_sha[:4]}…{source_file_sha[-3:]}"

    is_episode_framed = style_version in {"v2.2", "v2.3"}
    week_day_heading = _week_day_name_from_source_basename(source_basename) if is_episode_framed else None
    inferred_company = _infer_source_company_name(normalized_text=normalized) if is_episode_framed else ""
    inferred_report_title = _infer_report_short_title(normalized_text=normalized, company=inferred_company) if is_episode_framed else ""

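    # Fall back to a filename-derived title when the extracted cover line is missing, looks
    # like site-footer noise, is overly long, or reads like a sentence (contains a period).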
    if (
        not cover_h1
        or cover_h1.upper() == "COUVERTURE"
        or _looks_like_site_footer(cover_h1)
        or len(cover_h1) > 96
        or "." in cover_h1
    ):
        cover_h1 = filename_title

    # v2.2+: Week packs should read like a daily “episode”: stable day heading, vendor + report title.
    if is_episode_framed and week_day_heading:
        cover_h1 = week_day_heading
        if inferred_company and inferred_report_title:
            cover_h2 = f"{inferred_company} | {inferred_report_title}"
        # Make slugs more meaningful than mon.pdf/tue.pdf.
        if inferred_company and inferred_report_title:
            project_slug = _slugify(f"{inferred_company}-{inferred_report_title}-mirror")
            source_slug = _slugify(f"{inferred_company}-{source_basename}")

    vertical_line = _infer_vertical_line(normalized_text=normalized, source_basename=source_basename, locale=locale)

    out: list[str] = [
        "---",
        "BRAND: InfraFabric.io",
        "UNIT: RED TEAM (STRATEGIC OPS)" if not locale.lower().startswith("fr") else "UNIT: RED TEAM (OPÉRATIONS STRATÉGIQUES)",
        "DOCUMENT: SHADOW DOSSIER" if not locale.lower().startswith("fr") else "DOCUMENT: DOSSIER DE L’OMBRE",
        "CLASSIFICATION: EYES ONLY // DAVE" if not locale.lower().startswith("fr") else "CLASSIFICATION: CONFIDENTIEL // DAVE",
        "---",
        "",
        "# [ RED TEAM DECLASSIFIED ]" if not locale.lower().startswith("fr") else "# [ DÉCLASSIFIÉ – ÉQUIPE ROUGE ]",
        f"## PROJECT: {project_slug}" if not locale.lower().startswith("fr") else f"## PROJET : {project_slug}",
        f"### SOURCE: {source_slug}" if not locale.lower().startswith("fr") else f"### SOURCE : {source_slug}",
        f"**INFRAFABRIC REPORT ID:** `{report_id}`" if not locale.lower().startswith("fr") else f"**ID DE RAPPORT INFRAFABRIC :** `{report_id}`",
        (
            f"**SOURCE DOC (online):** [Source PDF (sha256: {source_file_sha_short})]({source_doc_url})"
            if source_doc_url and source_file_sha_short and not locale.lower().startswith("fr")
            else (
                f"**DOCUMENT SOURCE (en ligne) :** [PDF source (sha256 : {source_file_sha_short})]({source_doc_url})"
                if source_doc_url and source_file_sha_short
                else ""
            )
        ),
        "",
        "> NOTICE: This document is a product of InfraFabric Red Team."
        if not locale.lower().startswith("fr")
        else "> AVIS : ce document est un produit de l’InfraFabric Red Team.",
        "> It exposes socio-technical frictions where incentives turn controls into theater."
        if not locale.lower().startswith("fr")
        else "> Il expose les frictions socio-techniques : là où les incitations transforment les contrôles en théâtre.",
        "",
        f"**MIRROR COMPLETENESS:** {mirror_status}" if not locale.lower().startswith("fr") else f"**COMPLÉTUDE DU MIROIR :** {mirror_status}",
    ]
    if mirror_reason:
        out.append(f"**MIRROR NOTE:** {mirror_reason}" if not locale.lower().startswith("fr") else f"**NOTE MIROIR :** {mirror_reason}")
    if vertical_line:
        out.extend([vertical_line])

    out.extend(
        [
            "",
            "**[ ACCESS GRANTED: INFRAFABRIC RED TEAM ]**"
            if not locale.lower().startswith("fr")
            else "**[ ACCÈS AUTORISÉ : INFRAFABRIC ÉQUIPE ROUGE ]**",
            "**[ STATUS: OPERATIONAL REALISM ]**"
            if not locale.lower().startswith("fr")
            else "**[ STATUT : RÉALISME OPÉRATIONNEL ]**",
            "",
            f"## {cover_h1}",
        ]
    )
    cover_h2_out = ""
    if cover_h2:
        if style_version in {"v2.0", "v2.1", "v2.2", "v2.3"}:
            if not _looks_like_cover_subtitle_noise(cover_h2):
                cover_h2_out = _compact_title(cover_h2, max_chars=90)
        else:
            cover_h2_out = cover_h2
    if cover_h2_out:
        out.extend([f"### {cover_h2_out}", ""])
    else:
        out.append("")

    out.extend(
        [
            "> Shadow dossier (mirror-first)." if not locale.lower().startswith("fr") else "> Dossier de l’ombre (miroir d’abord).",
            ">",
            f"> Protocol: IF.DAVE.{style_version}" if not locale.lower().startswith("fr") else f"> Protocole : IF.DAVE.{style_version}",
            f"> Source: `{source_basename}`" if not locale.lower().startswith("fr") else f"> Source : `{source_basename}`",
            f"> Generated: `{today}`" if not locale.lower().startswith("fr") else f"> Généré le : `{today}`",
            f"> Source Hash (sha256): `{source_file_sha}`"
            if not locale.lower().startswith("fr")
            else f"> Empreinte source (sha256) : `{source_file_sha}`",
            f"> Source URL: {source_doc_url}"
            if source_doc_url and not locale.lower().startswith("fr")
            else (f"> URL source : {source_doc_url}" if source_doc_url else ""),
            "",
        ]
    )

    if is_episode_framed:
        section_titles = [s.title for s in sections[1:] if (s.title or "").strip()]
        intro_lines = _render_time_journalist_intro(
            company=inferred_company or "the vendor",
            report_title=inferred_report_title or cover_h2_out or cover_h1,
            section_titles=section_titles,
            locale=locale,
        )
        if intro_lines:
            out.extend(intro_lines)
            out.append("")

    for section in sections[1:]:
        if section.title.strip().upper() == "INTRODUCTION":
            out.append(_render_intro(section, ctx=ctx))
        else:
            out.append(_render_section(section, ctx=ctx))
        out.append("")

    # Claims Register (source-attributed): verbatim lines only (no new claims).
    claims = _extract_claim_lines(normalized_text=normalized, max_items=12)
    if claims:
        if locale.lower().startswith("fr"):
            out.extend(["## Registre des affirmations (attribuées à la source)", "", "_La source affirme :_"])
        else:
            out.extend(["## Claims Register (source-attributed)", "", "_The source claims:_"])
        out.append("")
        for c in claims:
            out.append(f"- The source claims: “{c}”" if not locale.lower().startswith("fr") else f"- La source affirme : « {c} »")
        out.append("")

    if _looks_like_government_standard(normalized_text=normalized, source_basename=source_basename):
        table = _render_translation_table(normalized_text=normalized, locale=locale)
        if table:
            out.extend([table, ""])

    if action_pack_enabled:
        if style_version in {"v2.0", "v2.1", "v2.2", "v2.3"}:
            out.append(_render_action_pack_v2_0(sections=sections[1:], normalized_text=normalized, locale=locale))
        else:
            out.append(_render_action_pack(sections[1:]))
        out.append("")

    # v1.8+ requires >=2 Mermaid diagrams; add supplemental inferred diagrams only when needed.
    if locale.lower().startswith("fr"):
        mermaid_section_title = "## Annexes (diagrammes inférés)"
        mermaid_note = "_Diagrammes inférés : synthèse InfraFabric Red Team (sans nouvelles affirmations factuelles)._"
        evidence_label = "Boucle de dérive de preuve (inférée)"
        exception_label = "Stase d’exception (inférée)"
    else:
        mermaid_section_title = "## Annex (inferred diagrams)"
        mermaid_note = "_Inferred diagrams: InfraFabric Red Team synthesis (no new factual claims)._"
        evidence_label = "Evidence drift loop (inferred)"
        exception_label = "Exception stasis (inferred)"

    current_md = "\n".join(out)
    mermaid_count = len(re.findall(r"```mermaid\b", current_md))
    if mermaid_count < 2:
        # Try to anchor a diagram label to a source keyword so it doesn't look like filler.
        anchor_kw = None
        for kw in ["fips", "fido2", "aal3", "retention", "enforcer", "zero trust", "pdp", "pep", "agentic", "autonomous"]:
            if kw in normalized.lower():
                anchor_kw = kw.upper() if kw.isalpha() else kw
                break

        out.extend([mermaid_section_title, "", mermaid_note, ""])
        if mermaid_count < 1:
            out.extend(
                [
                    f"### {evidence_label}",
                    "",
                    "```mermaid",
                    "flowchart TD",
                    f' A[Control intent] --> B["Evidence requested ({anchor_kw or "signal"})"]',
                    " B --> C[Artifact produced]",
                    " C --> D[Dashboard goes green]",
                    " D --> E[Exceptions accumulate]",
                    " E --> F[Definition of compliance shifts]",
                    " F --> B",
                    "```",
                    "",
                ]
            )
        out.extend(
            [
                f"### {exception_label}",
                "",
                "```mermaid",
                "stateDiagram-v2",
                " [*] --> Requested",
                " Requested --> PendingReview: needs_alignment",
                " PendingReview --> PendingReview: renewal",
                " PendingReview --> Approved: silence",
                " Approved --> Approved: temporary_extension",
                "```",
                "",
            ]
        )

    out.extend(
        [
            "---",
            "",
            "*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** for socio-technical friction analysis: https://infrafabric.io"
            if not locale.lower().startswith("fr")
            else "*InfraFabric Red Team Footer:* **RED-TEAM Shadow Dossiers** (analyse socio-technique des frictions) : https://infrafabric.io",
            "*Standard Dave Footer:* This document is intended for the recipient only. If you are not the recipient, please delete it and forget you saw anything. P.S. Please consider the environment before printing this email."
            if not locale.lower().startswith("fr")
            else "*Standard Dave Footer:* Ce document est destiné au seul destinataire. Si vous n’êtes pas le destinataire, veuillez le supprimer et oublier que vous l’avez vu. P.S. Veuillez considérer l’environnement avant d’imprimer ce document.",
        ]
    )

    doc = "\n".join(out).strip() + "\n"
    if style_version in {"v2.0", "v2.1", "v2.2", "v2.3"}:
        doc = _apply_dave_v2_0_postprocess(doc, locale=locale)
        if style_version in {"v2.2", "v2.3"}:
            doc = _insert_podcast_script_v2_2(
                md=doc,
                company=inferred_company or "the vendor",
                report_title=inferred_report_title or cover_h2_out or cover_h1,
                source_doc_url=source_doc_url,
                locale=locale,
            )
    return doc