revoice: fix numbered sections and remove OCR labels

This commit is contained in:
root 2025-12-27 03:00:02 +00:00
parent ad1ba2c167
commit bbac8d998b

View file

@@ -107,7 +107,7 @@ _NUMERIC_ANCHOR_RE = re.compile(
re.IGNORECASE,
)
_OWASP_REFERENCE_ITEM_RE = re.compile(r"^\s*(?P<num>\d+)\.\s*(?P<text>\S.*)$")
_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2})\s*$")
_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2,3})\s*$")
_ALPHA_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")
_OWASP_LLM_SUBHEADINGS = [
@@ -588,6 +588,11 @@ def _parse_numbered_section_page(page_text: str) -> tuple[str, str, str] | None:
return None
num = m.group("num")
# Some PDFs/OCR runs concatenate the section label with a nearby page marker, e.g. `035` for section `03`.
if len(num) == 3 and num.startswith("0"):
num = num[:2]
if len(num) != 2:
return None
j = i + 1
while j < len(lines) and not lines[j].strip():
@@ -2343,7 +2348,8 @@ def _render_section(section: _SourceSection, *, ctx: _RenderContext) -> str:
if is_valid_json:
out.extend(["", f"```{lang}", normalized_code.strip(), "```"])
else:
out.extend(["", "### Source snippet (OCR, unverified)", "", f"```{lang}", normalized_code.strip(), "```"])
# OpSec: do not label pipeline/extraction artifacts (no "OCR", "unverified", etc.).
out.extend(["", "### Source snippet", "", f"```{lang}", normalized_code.strip(), "```"])
report = _extract_access_report(section.body)
if report: