revoice: fix numbered sections and remove OCR labels
This commit is contained in:
parent
ad1ba2c167
commit
bbac8d998b
1 changed file with 8 additions and 2 deletions
|
|
@@ -107,7 +107,7 @@ _NUMERIC_ANCHOR_RE = re.compile(
|
|||
re.IGNORECASE,
|
||||
)
|
||||
_OWASP_REFERENCE_ITEM_RE = re.compile(r"^\s*(?P<num>\d+)\.\s*(?P<text>\S.*)$")
|
||||
_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2})\s*$")
|
||||
_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2,3})\s*$")
|
||||
_ALPHA_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")
|
||||
|
||||
_OWASP_LLM_SUBHEADINGS = [
|
||||
|
|
@@ -588,6 +588,11 @@ def _parse_numbered_section_page(page_text: str) -> tuple[str, str, str] | None:
|
|||
return None
|
||||
|
||||
num = m.group("num")
|
||||
# Some PDFs/OCR runs concatenate the section label with a nearby page marker, e.g. `035` for section `03`.
|
||||
if len(num) == 3 and num.startswith("0"):
|
||||
num = num[:2]
|
||||
if len(num) != 2:
|
||||
return None
|
||||
|
||||
j = i + 1
|
||||
while j < len(lines) and not lines[j].strip():
|
||||
|
|
@@ -2343,7 +2348,8 @@ def _render_section(section: _SourceSection, *, ctx: _RenderContext) -> str:
|
|||
if is_valid_json:
|
||||
out.extend(["", f"```{lang}", normalized_code.strip(), "```"])
|
||||
else:
|
||||
out.extend(["", "### Source snippet (OCR, unverified)", "", f"```{lang}", normalized_code.strip(), "```"])
|
||||
# OpSec: do not label pipeline/extraction artifacts (no "OCR", "unverified", etc.).
|
||||
out.extend(["", "### Source snippet", "", f"```{lang}", normalized_code.strip(), "```"])
|
||||
|
||||
report = _extract_access_report(section.body)
|
||||
if report:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue