revoice: fix numbered sections and remove OCR labels

2025-12-27 03:00:02 +00:00 · 2025-12-27 03:00:02 +00:00 · bbac8d998b
commit bbac8d998b
parent ad1ba2c167
1 changed file with 8 additions and 2 deletions
--- a/src/revoice/generate.py
+++ b/src/revoice/generate.py
@@ -107,7 +107,7 @@ _NUMERIC_ANCHOR_RE = re.compile(
    re.IGNORECASE,
 )
 _OWASP_REFERENCE_ITEM_RE = re.compile(r"^\s*(?P<num>\d+)\.\s*(?P<text>\S.*)$")
-_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2})\s*$")
+_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2,3})\s*$")
 _ALPHA_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")

 _OWASP_LLM_SUBHEADINGS = [
@@ -588,6 +588,11 @@ def _parse_numbered_section_page(page_text: str) -> tuple[str, str, str] | None:
        return None

    num = m.group("num")
+    # Some PDFs/OCR runs concatenate the section label with a nearby page marker, e.g. `035` for section `03`.
+    if len(num) == 3 and num.startswith("0"):
+        num = num[:2]
+    if len(num) != 2:
+        return None

    j = i + 1
    while j < len(lines) and not lines[j].strip():
@@ -2343,7 +2348,8 @@ def _render_section(section: _SourceSection, *, ctx: _RenderContext) -> str:
        if is_valid_json:
            out.extend(["", f"```{lang}", normalized_code.strip(), "```"])
        else:
-            out.extend(["", "### Source snippet (OCR, unverified)", "", f"```{lang}", normalized_code.strip(), "```"])
+            # OpSec: do not label pipeline/extraction artifacts (no "OCR", "unverified", etc.).
+            out.extend(["", "### Source snippet", "", f"```{lang}", normalized_code.strip(), "```"])

    report = _extract_access_report(section.body)
    if report: