#!/usr/bin/env python3 from __future__ import annotations import argparse import os import re import subprocess import tempfile from pathlib import Path def _render_page(pdf_path: Path, *, page: int, dpi: int) -> Path: tmpdir = Path(tempfile.mkdtemp(prefix="iftypeset_chi_")) out_prefix = tmpdir / "page" subprocess.run( [ "pdftoppm", "-f", str(page), "-l", str(page), "-r", str(dpi), "-png", "-singlefile", str(pdf_path), str(out_prefix), ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) img = tmpdir / "page.png" if not img.exists(): raise RuntimeError("pdftoppm did not produce expected image") return img def _ocr_image(img: Path, *, psm: int = 6) -> str: # stdout mode: "tesseract stdout" proc = subprocess.run( [ "tesseract", str(img), "stdout", "--dpi", "200", "--psm", str(psm), ], check=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, ) return proc.stdout.decode("utf-8", errors="replace") def _cleanup(path: Path) -> None: try: if path.is_file(): path.unlink(missing_ok=True) else: for p in path.rglob("*"): if p.is_file(): p.unlink(missing_ok=True) for p in sorted(path.rglob("*"), reverse=True): if p.is_dir(): p.rmdir() path.rmdir() except Exception: pass def ocr_page(pdf_path: Path, *, page: int, dpi: int) -> str: img = _render_page(pdf_path, page=page, dpi=dpi) try: return _ocr_image(img) finally: _cleanup(img.parent) def main() -> None: ap = argparse.ArgumentParser(description="Ephemeral OCR helper for Chicago scan PDF (prints to stdout; no files).") ap.add_argument("pdf", type=Path, help="Path to Chicago scan PDF") ap.add_argument("--page", type=int, required=True, help="PDF page number to OCR (1-based)") ap.add_argument("--dpi", type=int, default=200) ap.add_argument("--grep", default="", help="Optional regex filter; print only matching lines") ap.add_argument( "--max-lines", type=int, default=40, help="Max number of lines to print (safety guard; applies after grep).", ) ap.add_argument( "--unsafe-print-all", action="store_true", help="DANGEROUS: print full OCR output (avoid; may capture large copyrighted text).", ) args = ap.parse_args() if not args.pdf.exists(): raise SystemExit(f"Missing PDF: {args.pdf}") if not args.grep and not args.unsafe_print_all: raise SystemExit("Refusing to print full OCR output without --grep (use --unsafe-print-all to override).") text = ocr_page(args.pdf, page=args.page, dpi=args.dpi) if args.grep: r = re.compile(args.grep, re.IGNORECASE) printed = 0 for ln in text.splitlines(): if not r.search(ln): continue print(ln) printed += 1 if printed >= args.max_lines: break else: print(text) if __name__ == "__main__": main()