# iftypeset/tools/bringhurst_locate.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import re
import subprocess
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class PageInfo:
    """Immutable record holding the extracted text of a single PDF page."""

    # 1-based position of the page in the extracted text (see iter_pages).
    pdf_page: int
    # Page number as printed on the page, as guessed by _guess_printed_page;
    # None when no candidate line was found.
    printed_page: str | None
    # Raw text of the page as produced by pdftotext.
    text: str


def _run_pdftotext(pdf_path: Path) -> str:
    """Return the full text of ``pdf_path`` as extracted by ``pdftotext``.

    The external ``pdftotext`` program is invoked with ``-`` as the output
    file so the text is written to stdout and no file is created.

    Args:
        pdf_path: Path of the PDF to extract.

    Returns:
        The extracted text; bytes that are not valid UTF-8 are replaced.

    Raises:
        FileNotFoundError: If the ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: If ``pdftotext`` exits non-zero; the
            captured stderr is available on the exception.
    """
    proc = subprocess.run(
        # Request UTF-8 explicitly: the output is decoded as UTF-8 below, and
        # not every pdftotext build uses UTF-8 as its default text encoding.
        ["pdftotext", "-enc", "UTF-8", str(pdf_path), "-"],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return proc.stdout.decode("utf-8", errors="replace")


# A line made up of nothing but one to four digits; used to spot a printed
# page number among the lines of a page.
_PRINTED_PAGE_RE = re.compile(r"^\d{1,4}$")
# A line that starts (after optional whitespace) with a dotted number of two
# to four components, e.g. "2.1" or "2.1.3.4", followed by whitespace and text.
# Group 1 is the dotted number, group 2 the text without trailing whitespace.
_SECTION_HEADING_RE = re.compile(r"^\s*(\d+(?:\.\d+){1,3})\s+(.+?)\s*$")


def _guess_printed_page(page_text: str) -> str | None:
    """Guess the page number printed on a page from its extracted text.

    Only the last eight non-blank lines are considered. They are scanned
    from the bottom up, and the first one that consists solely of a short
    run of digits (per ``_PRINTED_PAGE_RE``) is returned, stripped.

    Returns:
        The matching line, or None when none of those lines qualifies.
    """
    stripped = (raw.strip() for raw in page_text.splitlines())
    non_blank = [entry for entry in stripped if entry]
    tail = non_blank[-8:]  # bottom-ish
    return next(
        (candidate for candidate in reversed(tail) if _PRINTED_PAGE_RE.match(candidate)),
        None,
    )


def _collect_section_headings(page_text: str) -> list[tuple[str, str]]:
    """Collect the numbered section headings that appear in a page's text.

    Every line is tested against ``_SECTION_HEADING_RE``; matching lines are
    reported in the order they occur.

    Returns:
        A list of ``(section_number, title)`` pairs, empty when no line
        matches.
    """
    candidates = map(_SECTION_HEADING_RE.match, page_text.splitlines())
    return [(found.group(1), found.group(2)) for found in candidates if found]


def iter_pages(pdf_path: Path) -> list[PageInfo]:
    """Extract the text of ``pdf_path`` and split it into per-page records.

    Pages whose text is empty or whitespace-only are left out, but the
    remaining pages keep their original 1-based position as ``pdf_page``.

    Returns:
        One ``PageInfo`` per non-blank page, in document order.
    """
    # pdftotext uses form-feed between pages
    chunks = _run_pdftotext(pdf_path).split("\f")
    return [
        PageInfo(pdf_page=number, printed_page=_guess_printed_page(chunk), text=chunk)
        for number, chunk in enumerate(chunks, start=1)
        if chunk.strip()
    ]


def _snippet(page_text: str, keyword_re: re.Pattern[str], *, max_lines: int = 4) -> str:
    lines = page_text.splitlines()
    hits: list[int] = []
    for i, ln in enumerate(lines):
        if keyword_re.search(ln):
            hits.append(i)
    if not hits:
        return ""
    start = max(0, hits[0] - 1)
    end = min(len(lines), hits[0] + 2)
    snippet_lines = [ln.rstrip() for ln in lines[start:end] if ln.strip()]
    return "\n".join(snippet_lines[:max_lines]).strip()


def main() -> None:
    """Command-line entry point: print the pages of a PDF that match a keyword.

    Parses the command line, extracts the PDF's text page by page, and for
    each page matching the keyword prints a header line with the PDF page
    index and the guessed printed page number, then a short snippet and,
    with ``--show-headings``, up to twelve section headings found on that
    page. Output stops after ``--limit`` matching pages; a limit of zero or
    less prints nothing.

    Raises:
        SystemExit: If the PDF is missing, the keyword is not a valid regular
            expression, or ``pdftotext`` cannot be run or fails.
    """
    ap = argparse.ArgumentParser(description="Ephemeral locator for Bringhurst PDF (no output files).")
    ap.add_argument("pdf", type=Path, help="Path to Bringhurst PDF")
    ap.add_argument("keyword", help="Case-insensitive keyword/regex to search for")
    ap.add_argument("--limit", type=int, default=50, help="Max hits to print")
    ap.add_argument("--show-headings", action="store_true", help="Print section-number headings found on each hit page")
    args = ap.parse_args()

    pdf_path: Path = args.pdf
    if not pdf_path.exists():
        raise SystemExit(f"Missing PDF: {pdf_path}")

    # Compile the pattern before the slow text extraction so that a typo in
    # the regex is reported immediately and as a usage error, not a traceback.
    try:
        kw = re.compile(args.keyword, re.IGNORECASE)
    except re.error as exc:
        ap.error(f"invalid keyword regex {args.keyword!r}: {exc}")

    # The loop below checks the limit only after printing a hit, so a
    # non-positive limit has to be handled up front.
    if args.limit <= 0:
        return

    try:
        pages = iter_pages(pdf_path)
    except FileNotFoundError as exc:
        raise SystemExit(f"Could not run pdftotext: {exc}") from exc
    except subprocess.CalledProcessError as exc:
        detail = exc.stderr or b""
        if isinstance(detail, bytes):
            detail = detail.decode("utf-8", errors="replace")
        raise SystemExit(f"pdftotext failed (exit {exc.returncode}): {detail.strip()}") from exc

    printed = 0
    for p in pages:
        if not kw.search(p.text):
            continue
        printed += 1
        pp = p.printed_page or "?"
        print(f"\n=== hit {printed} — pdf_page={p.pdf_page} printed_p={pp} ===")
        # The page-level match may span several lines, in which case no
        # single line matches and the snippet is empty.
        sn = _snippet(p.text, kw)
        if sn:
            print(sn)
        if args.show_headings:
            heads = _collect_section_headings(p.text)
            if heads:
                print("\nSection headings on page:")
                for sec, title in heads[:12]:
                    print(f"- {sec} {title}")
        if printed >= args.limit:
            break


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()