"""Unit tests for the iftypeset QA helpers and the ``qa`` CLI command.

NOTE(review): several HTML/XML fixtures below contain only plain text and
blank lines; their markup appears to have been stripped at some point.
The visible text is preserved as-is (with whitespace spelled out as
escapes) -- confirm against version control and restore the tags if so.
"""

import argparse
import json
import subprocess
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

# Make the in-repo ``src`` tree importable before pulling in iftypeset.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
sys.path.insert(0, str(SRC))

from iftypeset import cli as cli_mod  # noqa: E402
from iftypeset.qa import (
    _detect_caption_separation_pdf,
    _detect_heading_proximity_pdf,
    _detect_list_breaks_pdf,
    _detect_pdf_bbox_overflow,
    _detect_pdf_overfull_lines,
    _detect_runt_final_page,
    _detect_stranded_headings_pdf,
    _detect_widows_orphans,
    analyze_html,
    evaluate_gates,
)  # noqa: E402
from iftypeset.spec_loader import load_spec  # noqa: E402


class QATests(unittest.TestCase):
    """Exercise the HTML analysis, the PDF detectors, gates, and ``_cmd_qa``."""

    def test_analyze_html_metrics(self) -> None:
        """``analyze_html`` reports the link-wrap metric with at least one hit."""
        spec = load_spec(ROOT / "spec")
        profile = spec.profiles["web_pdf"]
        # NOTE(review): fixture looks like HTML with its tags stripped.
        html_text = (
            "\n\n"
            "1 Intro\n\n"
            "1.2 Skip\n\n"
            "Paragraph with a very long URL https://example.com/this/is/a/very/long/url/that/should/trigger/and/keep/going/without/breaks\n\n"
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n"
        )
        report = analyze_html(html_text, profile)
        self.assertIn("max_link_wrap_incidents", report.metrics)
        self.assertGreaterEqual(report.metrics["max_link_wrap_incidents"], 1)

    def test_analyze_html_finds_bare_links_overfull_and_tables(self) -> None:
        """``analyze_html`` flags link-wrap, overfull, code and table overflow."""
        spec = load_spec(ROOT / "spec")
        profile = spec.profiles["web_pdf"]
        # NOTE(review): fixture looks like HTML with its tags stripped; the
        # tab-separated row presumably used to be a table -- confirm.
        html_text = (
            "\n\n"
            "Long bare URL https://example.com/this/is/an/extraordinarily/long/url/with/no/breaks/that/should/trigger/qa\n\n"
            "Overfull token aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n\n"
            "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb\n\n"
            "A\tB\tC\tD\tE\tF\tG\tH\tI\n\n"
        )
        report = analyze_html(html_text, profile)
        self.assertGreaterEqual(report.metrics.get("max_link_wrap_incidents", 0), 1)
        self.assertGreaterEqual(report.metrics.get("max_overfull_lines", 0), 1)
        self.assertGreaterEqual(report.metrics.get("max_code_overflow_incidents", 0), 1)
        self.assertGreaterEqual(report.metrics.get("max_table_overflow_incidents", 0), 1)
        kinds = {i.kind for i in report.incidents}
        self.assertIn("link_wrap", kinds)
        self.assertIn("overfull_token", kinds)
        self.assertIn("code_overflow", kinds)
        self.assertIn("table_overflow", kinds)

    def test_gate_evaluation(self) -> None:
        """A metric above its gate value makes ``evaluate_gates`` report not ok."""
        metrics = {"max_link_wrap_incidents": 3}
        gates = {"max_link_wrap_incidents": 1}
        result = evaluate_gates(metrics, gates, profile_id="web_pdf", strict=False)
        self.assertFalse(result["ok"])

    def test_detect_widows_orphans_from_pages(self) -> None:
        """An orphan on page index 0 and a widow on page index 1 are counted."""
        pages = [
            "Intro line\n\nOrphan line",
            "Widow line\n\nNext paragraph line 1\nNext paragraph line 2",
        ]
        widows, orphans, incidents = _detect_widows_orphans(pages)
        self.assertEqual(orphans[0], 1)
        self.assertEqual(widows[1], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("orphan_pdf", kinds)
        self.assertIn("widow_pdf", kinds)

    def test_detect_stranded_headings_pdf(self) -> None:
        """A heading that ends the first page is reported as stranded."""
        pages = [
            "Body text line\n\nHeading Near End\n",
            "Next page line 1\nNext page line 2\n",
        ]
        counts, incidents = _detect_stranded_headings_pdf(
            pages, keep_lines=2, heading_max_chars=40
        )
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("stranded_heading_pdf", kinds)

    def test_detect_pdf_overfull_lines(self) -> None:
        """An 80-character line is reported when ``max_chars`` is 50."""
        pages = [
            "Short line\n" + ("A" * 80),
            "Another page\n",
        ]
        counts, incidents = _detect_pdf_overfull_lines(pages, max_chars=50)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("overfull_line_pdf", kinds)

    def test_detect_pdf_bbox_overflow(self) -> None:
        """A text box reaching past the page width is counted on its page only."""
        pages = [
            {
                "number": 1,
                "width": 100.0,
                "height": 100.0,
                "texts": [{"left": 10, "top": 10, "width": 20, "height": 10, "text": "OK"}],
            },
            {
                "number": 2,
                "width": 100.0,
                "height": 100.0,
                # left (95) + width (10) exceeds the 100-wide page.
                "texts": [{"left": 95, "top": 10, "width": 10, "height": 10, "text": "Overflow"}],
            },
        ]
        counts, incidents = _detect_pdf_bbox_overflow(pages, tolerance=1.0)
        self.assertEqual(counts[0], 0)
        self.assertEqual(counts[1], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("overfull_bbox_pdf", kinds)

    def test_detect_caption_separation_pdf(self) -> None:
        """A figure caption as the first line of a page is reported."""
        pages = [
            "Figure 1. Caption at top\nMore text line\n",
            "Body text\n",
        ]
        counts, incidents = _detect_caption_separation_pdf(pages)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("caption_separation_pdf", kinds)

    def test_detect_list_breaks_pdf(self) -> None:
        """Separated list intros and single-item lists are both reported."""
        # Case 1: the intro line ends one page and the list starts the next.
        pages = [
            "We consider the following:\n",
            "1. First item\n2. Second item\n",
        ]
        counts, incidents = _detect_list_breaks_pdf(pages)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("list_intro_separated_pdf", kinds)

        # Case 2: a page whose list consists of one item.
        pages = [
            "Body text\n\n- Only item\n",
        ]
        counts, incidents = _detect_list_breaks_pdf(pages)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("list_lonely_item_pdf", kinds)

    def test_detect_heading_proximity_pdf(self) -> None:
        """Two headings with no body text between them are reported."""
        pages = [
            "Heading One\n\nHeading Two\n\nBody text\n",
            "Next page\n",
        ]
        counts, incidents = _detect_heading_proximity_pdf(pages, heading_max_chars=40)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("heading_proximity_pdf", kinds)

    def test_detect_runt_final_page_pdf(self) -> None:
        """A two-line last page after a fifteen-line page is reported as a runt."""
        pages = [
            "\n".join([f"Line {i}" for i in range(1, 16)]),
            "Last line\nSecond line\n",
        ]
        count, incidents = _detect_runt_final_page(pages, min_ratio=0.33, min_lines=4)
        self.assertEqual(count, 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("runt_final_page_pdf", kinds)

    def test_qa_merges_pdf_metrics_when_present(self) -> None:
        """``_cmd_qa`` folds PDF findings into the layout report.

        ``subprocess.run`` inside ``iftypeset.qa`` is patched so that the
        ``pdfinfo``/``pdftotext``/``pdftohtml`` calls return canned output.
        """
        spec = load_spec(ROOT / "spec")
        profile = spec.profiles["web_pdf"]
        # The form feed (\f) separates the two pages of canned text output.
        pdf_text = "Intro line\n\nOrphan line.\fWidow line\n\nNext paragraph line"

        def fake_run(cmd, check, stdout, stderr):  # type: ignore[no-untyped-def]
            """Return canned results for the three PDF tools; reject anything else."""
            if cmd[0] == "pdfinfo":
                return subprocess.CompletedProcess(cmd, 0, stdout=b"Pages: 2\n", stderr=b"")
            if cmd[0] == "pdftotext":
                return subprocess.CompletedProcess(
                    cmd, 0, stdout=pdf_text.encode("utf-8"), stderr=b""
                )
            if cmd[0] == "pdftohtml":
                # NOTE(review): the empty fragments presumably once held XML
                # tags that were stripped; as written this yields "OKOK".
                # Restore the original markup from version control.
                xml = (
                    ""
                    ""
                    "OK"
                    ""
                    ""
                    "OK"
                    ""
                    ""
                )
                return subprocess.CompletedProcess(
                    cmd, 0, stdout=xml.encode("utf-8"), stderr=b""
                )
            raise AssertionError(f"Unexpected command: {cmd}")

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            html_path = out_dir / "render.html"
            pdf_path = out_dir / "render.pdf"
            # NOTE(review): HTML fixture appears stripped down to blank lines.
            html_path.write_text("\n\n", encoding="utf-8")
            pdf_path.write_bytes(b"%PDF-1.4\n%stub\n")
            args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(out_dir),
                html=str(html_path),
                pdf=str(pdf_path),
                profile=profile.get("profile_id"),
                strict=False,
                format="json",
            )
            with patch("iftypeset.qa.subprocess.run", side_effect=fake_run):
                rc = cli_mod._cmd_qa(args)
            self.assertEqual(rc, 0)

            # Read the report while the temporary directory still exists.
            layout = json.loads((out_dir / "layout-report.json").read_text(encoding="utf-8"))
            self.assertEqual(layout.get("analysis_mode"), "html+pdf")
            self.assertGreaterEqual(layout["metrics"].get("max_widows_per_10_pages", 0), 1)
            self.assertGreaterEqual(layout["metrics"].get("max_orphans_per_10_pages", 0), 1)
            self.assertIn("max_runt_final_page", layout.get("metrics", {}))
            kinds = {i.get("kind") for i in layout.get("incidents", [])}
            self.assertIn("widow_pdf", kinds)
            self.assertIn("orphan_pdf", kinds)

    def test_qa_sarif_output(self) -> None:
        """``_cmd_qa`` with ``format="sarif"`` writes a SARIF file with ``runs``."""
        spec = load_spec(ROOT / "spec")
        profile = spec.profiles["web_pdf"]
        # NOTE(review): fixture looks like HTML with its tags stripped.
        html_text = (
            "\n\n"
            "Link https://example.com/this/is/a/very/long/url/that/should/trigger/qa\n\n"
        )
        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            html_path = out_dir / "render.html"
            html_path.write_text(html_text, encoding="utf-8")
            args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(out_dir),
                html=str(html_path),
                # This PDF path is never created by the test.
                pdf=str(out_dir / "render.pdf"),
                profile=profile.get("profile_id"),
                strict=False,
                format="sarif",
            )
            rc = cli_mod._cmd_qa(args)
            # Either exit code is accepted; only the SARIF artefact is checked.
            self.assertIn(rc, (0, 1))
            sarif_path = out_dir / "qa-report.sarif"
            self.assertTrue(sarif_path.exists())
            payload = json.loads(sarif_path.read_text(encoding="utf-8"))
            self.assertIn("runs", payload)


if __name__ == "__main__":
    unittest.main()