123 lines
3.3 KiB
Python
123 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
|
|
def _render_page(pdf_path: Path, *, page: int, dpi: int) -> Path:
    """Rasterize one page of a PDF to a PNG inside a fresh temp directory.

    Runs ``pdftoppm`` restricted to the single page ``page`` (passed as both
    ``-f`` and ``-l``) at ``dpi`` resolution.  The image is expected at
    ``<tmpdir>/page.png``.

    Args:
        pdf_path: PDF file to render.
        page: Page number handed to ``pdftoppm`` as first and last page.
        dpi: Rendering resolution handed to ``pdftoppm -r``.

    Returns:
        Path of the rendered PNG.  Its parent directory is a private temp
        directory that the caller is responsible for removing.

    Raises:
        FileNotFoundError: ``pdftoppm`` is not installed / not on ``PATH``.
        subprocess.CalledProcessError: ``pdftoppm`` exited non-zero.
        RuntimeError: ``pdftoppm`` succeeded but the PNG is missing.
    """
    import shutil  # local import: only needed on the failure path

    tmpdir = Path(tempfile.mkdtemp(prefix="iftypeset_chi_"))
    out_prefix = tmpdir / "page"
    try:
        subprocess.run(
            [
                "pdftoppm",
                "-f",
                str(page),
                "-l",
                str(page),
                "-r",
                str(dpi),
                "-png",
                "-singlefile",
                str(pdf_path),
                str(out_prefix),
            ],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        img = tmpdir / "page.png"
        if not img.exists():
            raise RuntimeError("pdftoppm did not produce expected image")
    except BaseException:
        # On failure the caller never learns the temp directory's location,
        # so it must be removed here or it leaks (possibly with a partial
        # render of the scanned page inside).
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    return img
|
|
|
|
|
|
def _ocr_image(img: Path, *, psm: int = 6, dpi: int = 200) -> str:
    """Run ``tesseract`` on an image and return the recognized text.

    The literal output base ``stdout`` makes tesseract write the text to its
    standard output, so no result file is created.

    Args:
        img: Image file to OCR.
        psm: Value passed to tesseract's ``--psm`` option.
        dpi: Value passed to tesseract's ``--dpi`` option.  Pass the
            resolution the image was rendered at; defaults to 200, the value
            that used to be hard-coded.

    Returns:
        Tesseract's stdout decoded as UTF-8; undecodable bytes are replaced
        rather than raising.

    Raises:
        FileNotFoundError: ``tesseract`` is not installed / not on ``PATH``.
        subprocess.CalledProcessError: ``tesseract`` exited non-zero.
    """
    # stdout mode: "tesseract <img> stdout"
    proc = subprocess.run(
        [
            "tesseract",
            str(img),
            "stdout",
            "--dpi",
            str(dpi),
            "--psm",
            str(psm),
        ],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )
    return proc.stdout.decode("utf-8", errors="replace")
|
|
|
|
|
|
def _cleanup(path: Path) -> None:
    """Best-effort removal of a file or a whole directory tree.

    Never raises for filesystem errors: cleanup failures must not mask the
    OCR result or the original exception of the caller.

    Args:
        path: File or directory to delete.  A missing path is a no-op.
            Symlinks are unlinked, not followed.
    """
    import shutil  # local import keeps the block self-contained

    try:
        if path.is_dir() and not path.is_symlink():
            # rmtree also removes dangling symlinks and other non-regular
            # entries that a manual "unlink files, then rmdir" walk leaves
            # behind.
            shutil.rmtree(path, ignore_errors=True)
        else:
            path.unlink(missing_ok=True)
    except OSError:
        # Deliberate best-effort: an undeletable temp file is not fatal.
        pass
|
|
|
|
|
|
def ocr_page(pdf_path: Path, *, page: int, dpi: int) -> str:
    """Render one PDF page to an image, OCR it, and return the text.

    The temporary directory holding the rendered image is removed whether
    or not the OCR step succeeds.

    Args:
        pdf_path: PDF file to read.
        page: Page number forwarded to the renderer.
        dpi: Rendering resolution forwarded to the renderer.

    Returns:
        The text recognized on the page.
    """
    rendered = _render_page(pdf_path, page=page, dpi=dpi)
    scratch_dir = rendered.parent
    try:
        recognized = _ocr_image(rendered)
    finally:
        _cleanup(scratch_dir)
    return recognized
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: OCR one PDF page and print (filtered) text.

    All inputs are validated *before* the OCR run, so a bad page number or
    an invalid ``--grep`` pattern fails fast with a short message instead of
    a traceback after the work is done.

    With ``--grep`` only lines matching the case-insensitive regex are
    printed, at most ``--max-lines`` of them (zero or a negative value
    prints nothing).  Without ``--grep`` the full text is printed, which
    requires ``--unsafe-print-all``.

    Raises:
        SystemExit: On invalid arguments, a missing PDF, or a failing
            external tool.
    """
    ap = argparse.ArgumentParser(description="Ephemeral OCR helper for Chicago scan PDF (prints to stdout; no files).")
    ap.add_argument("pdf", type=Path, help="Path to Chicago scan PDF")
    ap.add_argument("--page", type=int, required=True, help="PDF page number to OCR (1-based)")
    ap.add_argument("--dpi", type=int, default=200)
    ap.add_argument("--grep", default="", help="Optional regex filter; print only matching lines")
    ap.add_argument(
        "--max-lines",
        type=int,
        default=40,
        help="Max number of lines to print (safety guard; applies after grep).",
    )
    ap.add_argument(
        "--unsafe-print-all",
        action="store_true",
        help="DANGEROUS: print full OCR output (avoid; may capture large copyrighted text).",
    )
    args = ap.parse_args()

    if not args.pdf.exists():
        raise SystemExit(f"Missing PDF: {args.pdf}")

    if not args.grep and not args.unsafe_print_all:
        raise SystemExit("Refusing to print full OCR output without --grep (use --unsafe-print-all to override).")

    if args.page < 1:
        raise SystemExit(f"--page must be >= 1 (got {args.page})")

    if args.dpi < 1:
        raise SystemExit(f"--dpi must be >= 1 (got {args.dpi})")

    # Compile the filter up front: a typo in the pattern should not cost a
    # full render + OCR run and then end in a traceback.
    pattern = None
    if args.grep:
        try:
            pattern = re.compile(args.grep, re.IGNORECASE)
        except re.error as exc:
            raise SystemExit(f"Invalid --grep pattern {args.grep!r}: {exc}") from exc

    try:
        text = ocr_page(args.pdf, page=args.page, dpi=args.dpi)
    except FileNotFoundError as exc:
        # Raised by subprocess when pdftoppm/tesseract is not on PATH.
        raise SystemExit(f"Required tool not found: {exc.filename or exc}") from exc
    except subprocess.CalledProcessError as exc:
        raise SystemExit(f"{exc.cmd[0]} failed with exit code {exc.returncode}") from exc
    except RuntimeError as exc:
        raise SystemExit(str(exc)) from exc

    if pattern is None:
        print(text)
        return

    matches = (ln for ln in text.splitlines() if pattern.search(ln))
    for shown, ln in enumerate(matches):
        # Check the limit before printing so that --max-lines 0 prints
        # nothing rather than one line.
        if shown >= args.max_lines:
            break
        print(ln)
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|