iftypeset/tools/bringhurst_locate.py
2026-01-03 20:29:35 +00:00

112 lines
3.3 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import re
import subprocess
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class PageInfo:
pdf_page: int
printed_page: str | None
text: str
def _run_pdftotext(pdf_path: Path) -> str:
    """Extract the text of *pdf_path* by running the ``pdftotext`` CLI.

    The text is written to stdout (``-`` as the output file) and returned as
    a single string.

    Raises:
        FileNotFoundError: if the ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: if ``pdftotext`` exits non-zero; the
            captured stderr is available on the exception.
    """
    proc = subprocess.run(
        # Request UTF-8 explicitly: the output is decoded as UTF-8 below, and
        # not every pdftotext build defaults to it (Xpdf defaults to Latin1).
        ["pdftotext", "-enc", "UTF-8", str(pdf_path), "-"],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Be lenient about stray bytes: this is a search aid, not an archival dump.
    return proc.stdout.decode("utf-8", errors="replace")
# A line made up solely of one to four digits; used to recognise a printed
# page number standing alone on its own line.
_PRINTED_PAGE_RE = re.compile(r"^\d{1,4}$")
# A line that begins (after optional whitespace) with a dotted section number
# of two to four components, e.g. "2.1" or "2.1.3.4", followed by whitespace
# and a title. Group 1 is the section number, group 2 the title with
# surrounding whitespace trimmed.
_SECTION_HEADING_RE = re.compile(r"^\s*(\d+(?:\.\d+){1,3})\s+(.+?)\s*$")
def _guess_printed_page(page_text: str) -> str | None:
    """Guess the printed page number from the bottom of a page's text.

    Only the last eight non-blank lines are considered, scanned from the
    bottom up; the first one that matches ``_PRINTED_PAGE_RE`` is returned
    (whitespace-stripped). Returns ``None`` when no such line exists.
    """
    stripped = (raw.strip() for raw in page_text.splitlines())
    non_blank = [candidate for candidate in stripped if candidate]
    tail = non_blank[-8:]  # bottom-ish
    bottom_up = reversed(tail)
    return next(
        (candidate for candidate in bottom_up if _PRINTED_PAGE_RE.match(candidate)),
        None,
    )
def _collect_section_headings(page_text: str) -> list[tuple[str, str]]:
    """Return ``(section_number, title)`` pairs for heading-like lines.

    Every line of *page_text* is tested against ``_SECTION_HEADING_RE``;
    matching lines contribute one pair each, in page order.
    """
    candidates = map(_SECTION_HEADING_RE.match, page_text.splitlines())
    return [found.group(1, 2) for found in candidates if found]
def iter_pages(pdf_path: Path) -> list[PageInfo]:
    """Extract *pdf_path* and return one ``PageInfo`` per non-blank page.

    Pages that contain only whitespace are dropped, but ``pdf_page`` still
    reflects each page's 1-based position in the full extraction, so the
    numbering is not compacted.
    """
    # pdftotext uses form-feed between pages
    chunks = _run_pdftotext(pdf_path).split("\f")
    return [
        PageInfo(
            pdf_page=number,
            printed_page=_guess_printed_page(chunk),
            text=chunk,
        )
        for number, chunk in enumerate(chunks, start=1)
        if chunk.strip()
    ]
def _snippet(page_text: str, keyword_re: re.Pattern[str], *, max_lines: int = 4) -> str:
    """Return a short excerpt around the first line that matches *keyword_re*.

    The excerpt is the matching line plus the line directly before and after
    it. Blank lines in that window are dropped, trailing whitespace is removed
    from each line, at most *max_lines* lines are kept, and the joined result
    is stripped.

    Returns an empty string when no single line matches (for example when the
    pattern only matches across a line break).
    """
    lines = page_text.splitlines()
    # Only the first hit is ever used, so stop scanning as soon as it is found.
    first_hit = next(
        (idx for idx, ln in enumerate(lines) if keyword_re.search(ln)),
        None,
    )
    if first_hit is None:
        return ""
    # Slicing clamps at the end of the list; only the start needs a guard.
    window = lines[max(0, first_hit - 1) : first_hit + 2]
    kept = [ln.rstrip() for ln in window if ln.strip()]
    return "\n".join(kept[:max_lines]).strip()
def main() -> None:
    """Command-line entry point: search a PDF's text and print matching pages.

    Parses the command line, extracts the PDF text page by page and prints,
    for each page whose text matches the keyword regex (case-insensitive), a
    header with the PDF page and guessed printed page, a short snippet, and
    optionally the section headings found on that page. Nothing is written to
    disk.

    Raises:
        SystemExit: if the keyword is not a valid regex, the PDF path does not
            exist, or ``pdftotext`` cannot be run or fails.
    """
    ap = argparse.ArgumentParser(description="Ephemeral locator for Bringhurst PDF (no output files).")
    ap.add_argument("pdf", type=Path, help="Path to Bringhurst PDF")
    ap.add_argument("keyword", help="Case-insensitive keyword/regex to search for")
    ap.add_argument("--limit", type=int, default=50, help="Max hits to print")
    ap.add_argument("--show-headings", action="store_true", help="Print section-number headings found on each hit page")
    args = ap.parse_args()

    # Validate the pattern before the (slow) text extraction so a typo in the
    # regex fails immediately and with a readable message.
    try:
        kw = re.compile(args.keyword, re.IGNORECASE)
    except re.error as exc:
        raise SystemExit(f"Invalid keyword regex {args.keyword!r}: {exc}") from exc

    pdf_path: Path = args.pdf
    if not pdf_path.exists():
        raise SystemExit(f"Missing PDF: {pdf_path}")

    try:
        pages = iter_pages(pdf_path)
    except FileNotFoundError as exc:
        # The PDF itself was checked above, so this is the executable lookup.
        raise SystemExit("Could not run pdftotext (is it installed and on PATH?)") from exc
    except subprocess.CalledProcessError as exc:
        raw_err = exc.stderr or b""
        detail = raw_err.decode("utf-8", errors="replace") if isinstance(raw_err, bytes) else str(raw_err)
        raise SystemExit(f"pdftotext failed (exit {exc.returncode}): {detail.strip()}") from exc

    printed = 0
    for p in pages:
        # Check the cap before printing so --limit 0 (or less) prints nothing.
        if printed >= args.limit:
            break
        if not kw.search(p.text):
            continue
        printed += 1
        pp = p.printed_page or "?"
        print(f"\n=== hit {printed} — pdf_page={p.pdf_page} printed_p={pp} ===")
        # The snippet is line-based and may be empty when the pattern only
        # matches across a line break; the header is still printed then.
        sn = _snippet(p.text, kw)
        if sn:
            print(sn)
        if args.show_headings:
            heads = _collect_section_headings(p.text)
            if heads:
                print("\nSection headings on page:")
                # Cap the list so a contents-style page does not flood output.
                for sec, title in heads[:12]:
                    print(f"- {sec} {title}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()