112 lines
3.3 KiB
Python
112 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
import subprocess
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
|
|
@dataclass(frozen=True)
class PageInfo:
    """Text of a single PDF page together with its two page numbers.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Position of the page within the PDF (iter_pages numbers pages from 1).
    pdf_page: int
    # Page number as printed on the page itself, or None when none was found.
    printed_page: str | None
    # Extracted text of the page.
    text: str
|
|
|
|
|
|
def _run_pdftotext(pdf_path: Path) -> str:
    """Extract the text of *pdf_path* by running the ``pdftotext`` command.

    The tool is asked to write to stdout (the ``-`` output argument); the
    captured bytes are decoded as UTF-8, with undecodable bytes replaced.

    Args:
        pdf_path: Path of the PDF file to extract.

    Returns:
        The complete extracted text as one string.

    Raises:
        FileNotFoundError: if the ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero
            (its captured stderr is available on the exception).
    """
    proc = subprocess.run(
        # Request UTF-8 explicitly so the output always matches the decode
        # below, regardless of the installed tool's default text encoding.
        ["pdftotext", "-enc", "UTF-8", str(pdf_path), "-"],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return proc.stdout.decode("utf-8", errors="replace")
|
|
|
|
|
|
# A line made up solely of one to four digits; used to spot printed page numbers.
_PRINTED_PAGE_RE = re.compile(r"^\d{1,4}$")
# A dotted section number with two to four numeric parts (e.g. "2.1" or
# "2.1.3.4"), whitespace, then a title. Group 1 is the number, group 2 the
# title without surrounding whitespace.
_SECTION_HEADING_RE = re.compile(r"^\s*(\d+(?:\.\d+){1,3})\s+(.+?)\s*$")
|
|
|
|
|
|
def _guess_printed_page(page_text: str) -> str | None:
    """Guess the page number printed on a page from the text near its bottom.

    Only the last eight non-blank lines (whitespace-stripped) are considered.
    They are scanned from the final line upwards, and the first one matching
    ``_PRINTED_PAGE_RE`` is returned. Returns ``None`` when none matches.
    """
    non_blank = [s for s in map(str.strip, page_text.splitlines()) if s]
    tail = non_blank[-8:]  # bottom-ish
    return next(
        (candidate for candidate in reversed(tail) if _PRINTED_PAGE_RE.match(candidate)),
        None,
    )
|
|
|
|
|
|
def _collect_section_headings(page_text: str) -> list[tuple[str, str]]:
    """Return ``(section_number, title)`` pairs for heading-like lines.

    Every line of *page_text* is tested against ``_SECTION_HEADING_RE``;
    matching lines contribute one pair each, in page order.
    """
    matches = map(_SECTION_HEADING_RE.match, page_text.splitlines())
    return [(m.group(1), m.group(2)) for m in matches if m]
|
|
|
|
|
|
def iter_pages(pdf_path: Path) -> list[PageInfo]:
    """Extract *pdf_path* and return one ``PageInfo`` per non-blank page.

    Pages are numbered from 1 in extraction order. Whitespace-only pages are
    left out of the result but still counted, so ``pdf_page`` keeps each
    page's real position in the document.
    """
    # pdftotext uses form-feed between pages
    chunks = _run_pdftotext(pdf_path).split("\f")
    return [
        PageInfo(pdf_page=number, printed_page=_guess_printed_page(chunk), text=chunk)
        for number, chunk in enumerate(chunks, start=1)
        if chunk.strip()
    ]
|
|
|
|
|
|
def _snippet(page_text: str, keyword_re: re.Pattern[str], *, max_lines: int = 4) -> str:
    """Return a short excerpt around the first line matching *keyword_re*.

    The excerpt is the first matching line plus the line before and the line
    after it (where they exist). Blank lines are dropped, trailing whitespace
    is removed from each kept line, and at most *max_lines* lines are kept.

    Args:
        page_text: Text of one page.
        keyword_re: Compiled pattern, searched line by line.
        max_lines: Upper bound on the number of lines in the excerpt.

    Returns:
        The excerpt, or ``""`` when no single line matches the pattern.
    """
    lines = page_text.splitlines()
    # Only the first hit is ever used, so stop scanning as soon as it is found.
    first = next((i for i, ln in enumerate(lines) if keyword_re.search(ln)), None)
    if first is None:
        return ""
    # Slicing clamps the upper bound to len(lines) on its own.
    window = lines[max(0, first - 1) : first + 2]
    kept = [ln.rstrip() for ln in window if ln.strip()]
    return "\n".join(kept[:max_lines]).strip()
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: list PDF pages whose text matches a keyword.

    For each matching page this prints a header with the hit number, the PDF
    page number and the guessed printed page number ("?" when unknown), then
    a short snippet around the first matching line. With ``--show-headings``
    it also prints up to 12 section headings found on that page. Output stops
    after ``--limit`` hits; a limit of zero or less prints nothing.

    Raises:
        SystemExit: on invalid arguments (including an invalid regex), when
            the PDF path is not an existing file, or when ``pdftotext`` is
            unavailable or fails.
    """
    ap = argparse.ArgumentParser(description="Ephemeral locator for Bringhurst PDF (no output files).")
    ap.add_argument("pdf", type=Path, help="Path to Bringhurst PDF")
    ap.add_argument("keyword", help="Case-insensitive keyword/regex to search for")
    ap.add_argument("--limit", type=int, default=50, help="Max hits to print")
    ap.add_argument("--show-headings", action="store_true", help="Print section-number headings found on each hit page")
    args = ap.parse_args()

    # Compile before the (slow) extraction so a bad pattern fails fast with a
    # usage error instead of a traceback.
    try:
        kw = re.compile(args.keyword, re.IGNORECASE)
    except re.error as exc:
        ap.error(f"invalid keyword regex {args.keyword!r}: {exc}")

    pdf_path: Path = args.pdf
    # is_file() also rejects directories, which exists() would let through.
    if not pdf_path.is_file():
        raise SystemExit(f"Missing PDF: {pdf_path}")

    try:
        pages = iter_pages(pdf_path)
    except FileNotFoundError:
        raise SystemExit("pdftotext executable not found; is it installed and on PATH?") from None
    except subprocess.CalledProcessError as exc:
        stderr = exc.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode("utf-8", errors="replace")
        detail = (stderr or "").strip()
        raise SystemExit(f"pdftotext failed (exit status {exc.returncode}): {detail}") from None

    printed = 0
    for p in pages:
        # Check the limit before printing so --limit 0 really prints nothing.
        if printed >= args.limit:
            break
        if not kw.search(p.text):
            continue
        printed += 1
        pp = p.printed_page or "?"
        print(f"\n=== hit {printed} — pdf_page={p.pdf_page} printed_p={pp} ===")
        # The snippet is line-based; it is empty when the pattern only
        # matches across a line break.
        sn = _snippet(p.text, kw)
        if sn:
            print(sn)
        if args.show_headings:
            heads = _collect_section_headings(p.text)
            if heads:
                print("\nSection headings on page:")
                for sec, title in heads[:12]:
                    print(f"- {sec} {title}")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|