revoice: fix numbered sections and remove OCR labels
This commit is contained in:
parent
ad1ba2c167
commit
bbac8d998b
1 changed file with 8 additions and 2 deletions
|
|
@@ -107,7 +107,7 @@ _NUMERIC_ANCHOR_RE = re.compile(
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
_OWASP_REFERENCE_ITEM_RE = re.compile(r"^\s*(?P<num>\d+)\.\s*(?P<text>\S.*)$")
|
_OWASP_REFERENCE_ITEM_RE = re.compile(r"^\s*(?P<num>\d+)\.\s*(?P<text>\S.*)$")
|
||||||
_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2})\s*$")
|
_NUMBERED_SECTION_START_RE = re.compile(r"^\s*(?P<num>\d{2,3})\s*$")
|
||||||
_ALPHA_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")
|
_ALPHA_RE = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ]")
|
||||||
|
|
||||||
_OWASP_LLM_SUBHEADINGS = [
|
_OWASP_LLM_SUBHEADINGS = [
|
||||||
|
|
@@ -588,6 +588,11 @@ def _parse_numbered_section_page(page_text: str) -> tuple[str, str, str] | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
num = m.group("num")
|
num = m.group("num")
|
||||||
|
# Some PDFs/OCR runs concatenate the section label with a nearby page marker, e.g. `035` for section `03`.
|
||||||
|
if len(num) == 3 and num.startswith("0"):
|
||||||
|
num = num[:2]
|
||||||
|
if len(num) != 2:
|
||||||
|
return None
|
||||||
|
|
||||||
j = i + 1
|
j = i + 1
|
||||||
while j < len(lines) and not lines[j].strip():
|
while j < len(lines) and not lines[j].strip():
|
||||||
|
|
@@ -2343,7 +2348,8 @@ def _render_section(section: _SourceSection, *, ctx: _RenderContext) -> str:
|
||||||
if is_valid_json:
|
if is_valid_json:
|
||||||
out.extend(["", f"```{lang}", normalized_code.strip(), "```"])
|
out.extend(["", f"```{lang}", normalized_code.strip(), "```"])
|
||||||
else:
|
else:
|
||||||
out.extend(["", "### Source snippet (OCR, unverified)", "", f"```{lang}", normalized_code.strip(), "```"])
|
# OpSec: do not label pipeline/extraction artifacts (no "OCR", "unverified", etc.).
|
||||||
|
out.extend(["", "### Source snippet", "", f"```{lang}", normalized_code.strip(), "```"])
|
||||||
|
|
||||||
report = _extract_access_report(section.body)
|
report = _extract_access_report(section.body)
|
||||||
if report:
|
if report:
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue