diff --git a/src/revoice/generate.py b/src/revoice/generate.py index 8e7593c..216c5f1 100644 --- a/src/revoice/generate.py +++ b/src/revoice/generate.py @@ -107,7 +107,7 @@ _NUMERIC_ANCHOR_RE = re.compile( re.IGNORECASE, ) _OWASP_REFERENCE_ITEM_RE = re.compile(r"^\s*(?P\d+)\.\s*(?P\S.*)$") -_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2})\s*$") +_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2,3})\s*$") _ALPHA_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]") _OWASP_LLM_SUBHEADINGS = [ @@ -588,6 +588,11 @@ def _parse_numbered_section_page(page_text: str) -> tuple[str, str, str] | None: return None num = m.group("num") + # Some PDFs/OCR runs concatenate the section label with a nearby page marker, e.g. `035` for section `03`. + if len(num) == 3 and num.startswith("0"): + num = num[:2] + if len(num) != 2: + return None j = i + 1 while j < len(lines) and not lines[j].strip(): @@ -2343,7 +2348,8 @@ def _render_section(section: _SourceSection, *, ctx: _RenderContext) -> str: if is_valid_json: out.extend(["", f"```{lang}", normalized_code.strip(), "```"]) else: - out.extend(["", "### Source snippet (OCR, unverified)", "", f"```{lang}", normalized_code.strip(), "```"]) + # OpSec: do not label pipeline/extraction artifacts (no "OCR", "unverified", etc.). + out.extend(["", "### Source snippet", "", f"```{lang}", normalized_code.strip(), "```"]) report = _extract_access_report(section.body) if report: