#!/usr/bin/env python3 from __future__ import annotations import argparse import re import subprocess from dataclasses import dataclass from pathlib import Path @dataclass(frozen=True) class PageInfo: pdf_page: int printed_page: str | None text: str def _run_pdftotext(pdf_path: Path) -> str: proc = subprocess.run( ["pdftotext", str(pdf_path), "-"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) return proc.stdout.decode("utf-8", errors="replace") _PRINTED_PAGE_RE = re.compile(r"^\d{1,4}$") _SECTION_HEADING_RE = re.compile(r"^\s*(\d+(?:\.\d+){1,3})\s+(.+?)\s*$") def _guess_printed_page(page_text: str) -> str | None: lines = [ln.strip() for ln in page_text.splitlines() if ln.strip()] for ln in reversed(lines[-8:]): # bottom-ish if _PRINTED_PAGE_RE.match(ln): return ln return None def _collect_section_headings(page_text: str) -> list[tuple[str, str]]: out: list[tuple[str, str]] = [] for ln in page_text.splitlines(): m = _SECTION_HEADING_RE.match(ln) if m: out.append((m.group(1), m.group(2))) return out def iter_pages(pdf_path: Path) -> list[PageInfo]: text = _run_pdftotext(pdf_path) # pdftotext uses form-feed between pages raw_pages = text.split("\f") pages: list[PageInfo] = [] for i, p in enumerate(raw_pages, start=1): if not p.strip(): continue pages.append(PageInfo(pdf_page=i, printed_page=_guess_printed_page(p), text=p)) return pages def _snippet(page_text: str, keyword_re: re.Pattern[str], *, max_lines: int = 4) -> str: lines = page_text.splitlines() hits: list[int] = [] for i, ln in enumerate(lines): if keyword_re.search(ln): hits.append(i) if not hits: return "" start = max(0, hits[0] - 1) end = min(len(lines), hits[0] + 2) snippet_lines = [ln.rstrip() for ln in lines[start:end] if ln.strip()] return "\n".join(snippet_lines[:max_lines]).strip() def main() -> None: ap = argparse.ArgumentParser(description="Ephemeral locator for Bringhurst PDF (no output files).") ap.add_argument("pdf", type=Path, help="Path to Bringhurst PDF") ap.add_argument("keyword", help="Case-insensitive keyword/regex to search for") ap.add_argument("--limit", type=int, default=50, help="Max hits to print") ap.add_argument("--show-headings", action="store_true", help="Print section-number headings found on each hit page") args = ap.parse_args() pdf_path: Path = args.pdf if not pdf_path.exists(): raise SystemExit(f"Missing PDF: {pdf_path}") pages = iter_pages(pdf_path) kw = re.compile(args.keyword, re.IGNORECASE) printed = 0 for p in pages: if not kw.search(p.text): continue printed += 1 pp = p.printed_page or "?" print(f"\n=== hit {printed} — pdf_page={p.pdf_page} printed_p={pp} ===") sn = _snippet(p.text, kw) if sn: print(sn) if args.show_headings: heads = _collect_section_headings(p.text) if heads: print("\nSection headings on page:") for sec, title in heads[:12]: print(f"- {sec} {title}") if printed >= args.limit: break if __name__ == "__main__": main()