iftypeset/tools/chicago_ocr.py
2026-01-03 20:29:35 +00:00

123 lines
3.3 KiB
Python

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
import re
import subprocess
import tempfile
from pathlib import Path
def _render_page(pdf_path: Path, *, page: int, dpi: int) -> Path:
    """Rasterise a single PDF page to a PNG in a fresh temporary directory.

    Runs ``pdftoppm`` restricted to ``page`` (passed as both ``-f`` and
    ``-l``) with ``-png -singlefile``, writing ``page.png`` into a directory
    created by ``tempfile.mkdtemp``.

    Args:
        pdf_path: PDF file to render.
        page: Page number handed to ``pdftoppm`` as first and last page.
        dpi: Resolution handed to ``pdftoppm -r``.

    Returns:
        Path to the rendered ``page.png``. On success the caller owns the
        image's parent directory and is responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: ``pdftoppm`` exited with a non-zero
            status.
        OSError: ``pdftoppm`` could not be launched (e.g. not installed).
        RuntimeError: ``pdftoppm`` succeeded but ``page.png`` is missing.
    """
    import shutil  # local import so this block does not rely on file-level changes

    tmpdir = Path(tempfile.mkdtemp(prefix="iftypeset_chi_"))
    out_prefix = tmpdir / "page"
    try:
        subprocess.run(
            [
                "pdftoppm",
                "-f",
                str(page),
                "-l",
                str(page),
                "-r",
                str(dpi),
                "-png",
                "-singlefile",
                str(pdf_path),
                str(out_prefix),
            ],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        img = tmpdir / "page.png"
        if not img.exists():
            raise RuntimeError("pdftoppm did not produce expected image")
    except BaseException:
        # No path is returned on failure, so the caller can never clean up:
        # remove the directory here (also on Ctrl-C) and let the error propagate.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    return img
def _ocr_image(img: Path, *, psm: int = 6) -> str:
# stdout mode: "tesseract <img> stdout"
proc = subprocess.run(
[
"tesseract",
str(img),
"stdout",
"--dpi",
"200",
"--psm",
str(psm),
],
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
)
return proc.stdout.decode("utf-8", errors="replace")
def _cleanup(path: Path) -> None:
try:
if path.is_file():
path.unlink(missing_ok=True)
else:
for p in path.rglob("*"):
if p.is_file():
p.unlink(missing_ok=True)
for p in sorted(path.rglob("*"), reverse=True):
if p.is_dir():
p.rmdir()
path.rmdir()
except Exception:
pass
def ocr_page(pdf_path: Path, *, page: int, dpi: int) -> str:
    """OCR one page of a PDF and return the text, leaving no files behind.

    The page is rendered to a temporary image, recognised, and the image's
    directory is removed again whether or not recognition succeeded.

    Note: ``dpi`` is used for rendering only; ``_ocr_image`` is called with
    its default settings.

    Args:
        pdf_path: PDF file to read.
        page: Page number forwarded to ``_render_page``.
        dpi: Rendering resolution forwarded to ``_render_page``.

    Returns:
        The text returned by ``_ocr_image`` for the rendered page.
    """
    image_path = _render_page(pdf_path, page=page, dpi=dpi)
    scratch_dir = image_path.parent
    try:
        recognised = _ocr_image(image_path)
    finally:
        # Runs on success and on failure so the rendered scan never lingers.
        _cleanup(scratch_dir)
    return recognised
def main() -> None:
ap = argparse.ArgumentParser(description="Ephemeral OCR helper for Chicago scan PDF (prints to stdout; no files).")
ap.add_argument("pdf", type=Path, help="Path to Chicago scan PDF")
ap.add_argument("--page", type=int, required=True, help="PDF page number to OCR (1-based)")
ap.add_argument("--dpi", type=int, default=200)
ap.add_argument("--grep", default="", help="Optional regex filter; print only matching lines")
ap.add_argument(
"--max-lines",
type=int,
default=40,
help="Max number of lines to print (safety guard; applies after grep).",
)
ap.add_argument(
"--unsafe-print-all",
action="store_true",
help="DANGEROUS: print full OCR output (avoid; may capture large copyrighted text).",
)
args = ap.parse_args()
if not args.pdf.exists():
raise SystemExit(f"Missing PDF: {args.pdf}")
if not args.grep and not args.unsafe_print_all:
raise SystemExit("Refusing to print full OCR output without --grep (use --unsafe-print-all to override).")
text = ocr_page(args.pdf, page=args.page, dpi=args.dpi)
if args.grep:
r = re.compile(args.grep, re.IGNORECASE)
printed = 0
for ln in text.splitlines():
if not r.search(ln):
continue
print(ln)
printed += 1
if printed >= args.max_lines:
break
else:
print(text)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    raise SystemExit(main())