# iftypeset/tools/chicago_ocr.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import os
import re
import subprocess
import tempfile
from pathlib import Path


def _render_page(pdf_path: Path, *, page: int, dpi: int) -> Path:
    """Rasterize a single PDF page to a PNG inside a fresh temporary directory.

    Runs ``pdftoppm`` limited to ``page`` (passed as both ``-f`` and ``-l``)
    at ``dpi`` with ``-png -singlefile``, so the output lands at
    ``<tmpdir>/page.png``.

    Args:
        pdf_path: PDF file to render.
        page: Page number handed to ``pdftoppm`` as first and last page.
        dpi: Resolution handed to ``pdftoppm -r``.

    Returns:
        Path to the rendered PNG. On success the caller owns the image's
        parent directory and is responsible for deleting it.

    Raises:
        OSError: If ``pdftoppm`` cannot be launched (e.g. not installed).
        subprocess.CalledProcessError: If ``pdftoppm`` exits non-zero.
        RuntimeError: If ``pdftoppm`` succeeds but the PNG is missing.
    """
    import shutil  # local import: only used on the failure path below

    tmpdir = Path(tempfile.mkdtemp(prefix="iftypeset_chi_"))
    try:
        subprocess.run(
            [
                "pdftoppm",
                "-f",
                str(page),
                "-l",
                str(page),
                "-r",
                str(dpi),
                "-png",
                "-singlefile",
                str(pdf_path),
                str(tmpdir / "page"),
            ],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        img = tmpdir / "page.png"
        if not img.exists():
            raise RuntimeError("pdftoppm did not produce expected image")
    except BaseException:
        # Nobody else knows about tmpdir yet, so a failure here would leak it
        # (possibly with a partially rendered page inside). Remove and re-raise.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    return img


def _ocr_image(img: Path, *, psm: int = 6, dpi: int = 200) -> str:
    """Run tesseract on ``img`` and return the recognized text.

    Tesseract is invoked in stdout mode (``tesseract <img> stdout``), so no
    output file is written.

    Args:
        img: Image file to recognize.
        psm: Value passed to tesseract's ``--psm`` option.
        dpi: Value passed to tesseract's ``--dpi`` option; it should match the
            resolution the image was rendered at.

    Returns:
        Tesseract's stdout decoded as UTF-8, with undecodable bytes replaced.

    Raises:
        OSError: If ``tesseract`` cannot be launched (e.g. not installed).
        subprocess.CalledProcessError: If ``tesseract`` exits non-zero.
    """
    proc = subprocess.run(
        [
            "tesseract",
            str(img),
            "stdout",
            "--dpi",
            str(dpi),
            "--psm",
            str(psm),
        ],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    return proc.stdout.decode("utf-8", errors="replace")


def _cleanup(path: Path) -> None:
    """Best-effort removal of a file or a whole directory tree.

    Never raises for filesystem errors: a missing path or an entry that cannot
    be deleted is silently ignored, because this runs from ``finally`` blocks
    where a cleanup failure must not mask the real result or exception.

    Args:
        path: File or directory to delete.
    """
    import shutil  # local import keeps the module's import block untouched

    try:
        if path.is_file():
            path.unlink(missing_ok=True)
        else:
            # rmtree also removes symlinks and special files that a plain
            # "unlink regular files, then rmdir" walk would leave behind.
            shutil.rmtree(path, ignore_errors=True)
    except OSError:
        pass


def ocr_page(pdf_path: Path, *, page: int, dpi: int) -> str:
    """OCR one page of a PDF and return the text, leaving no files behind.

    Args:
        pdf_path: PDF file to read.
        page: Page number to render and recognize.
        dpi: Resolution used for rendering and reported to tesseract.

    Returns:
        The recognized text of the page.
    """
    img = _render_page(pdf_path, page=page, dpi=dpi)
    try:
        # Tell tesseract the resolution the page was actually rendered at.
        return _ocr_image(img, dpi=dpi)
    finally:
        _cleanup(img.parent)


def main() -> None:
    """Command-line entry point: OCR one PDF page and print selected lines.

    Output is restricted by default: without ``--grep`` the command refuses to
    run unless ``--unsafe-print-all`` is given. With ``--grep``, only lines
    matching the (case-insensitive) regex are printed, at most ``--max-lines``
    of them.

    Raises:
        SystemExit: On invalid arguments, a missing PDF, or a refused
            full-output request.
    """
    ap = argparse.ArgumentParser(description="Ephemeral OCR helper for Chicago scan PDF (prints to stdout; no files).")
    ap.add_argument("pdf", type=Path, help="Path to Chicago scan PDF")
    ap.add_argument("--page", type=int, required=True, help="PDF page number to OCR (1-based)")
    ap.add_argument("--dpi", type=int, default=200)
    ap.add_argument("--grep", default="", help="Optional regex filter; print only matching lines")
    ap.add_argument(
        "--max-lines",
        type=int,
        default=40,
        help="Max number of lines to print (safety guard; applies after grep).",
    )
    ap.add_argument(
        "--unsafe-print-all",
        action="store_true",
        help="DANGEROUS: print full OCR output (avoid; may capture large copyrighted text).",
    )
    args = ap.parse_args()

    if args.page < 1:
        ap.error("--page must be >= 1")
    if args.dpi < 1:
        ap.error("--dpi must be a positive integer")

    if not args.pdf.exists():
        raise SystemExit(f"Missing PDF: {args.pdf}")

    if not args.grep and not args.unsafe_print_all:
        raise SystemExit("Refusing to print full OCR output without --grep (use --unsafe-print-all to override).")

    # Compile the filter before running OCR so a bad pattern fails fast with a
    # usage error instead of a traceback after the expensive render/OCR step.
    pattern = None
    if args.grep:
        try:
            pattern = re.compile(args.grep, re.IGNORECASE)
        except re.error as exc:
            ap.error(f"invalid --grep pattern: {exc}")

    text = ocr_page(args.pdf, page=args.page, dpi=args.dpi)
    if pattern is None:
        print(text)
        return

    printed = 0
    for ln in text.splitlines():
        # Check the limit before printing so --max-lines 0 prints nothing.
        if printed >= args.max_lines:
            break
        if pattern.search(ln):
            print(ln)
            printed += 1


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()