iftypeset/tests/test_qa.py
codex e92f1c3b93
Some checks are pending
ci / ci (push) Waiting to run
iftypeset: document CI pipeline + Playwright + font contract
2026-01-08 18:10:41 +00:00

261 lines
10 KiB
Python

import argparse
import json
import subprocess
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
# Parent of the directory that contains this test file; the tests below load
# the spec from ROOT / "spec".
ROOT = Path(__file__).resolve().parents[1]
# Source directory under ROOT (expected to hold the `iftypeset` package).
SRC = ROOT / "src"
# Put SRC at the front of the import search path before the `iftypeset`
# imports that follow (presumably so the tests run against this checkout
# without the package being installed -- confirm against the CI setup).
sys.path.insert(0, str(SRC))
from iftypeset import cli as cli_mod # noqa: E402
from iftypeset.qa import (
_detect_caption_separation_pdf,
_detect_heading_proximity_pdf,
_detect_list_breaks_pdf,
_detect_pdf_bbox_overflow,
_detect_pdf_overfull_lines,
_detect_runt_final_page,
_detect_stranded_headings_pdf,
_detect_widows_orphans,
analyze_html,
evaluate_gates,
) # noqa: E402
from iftypeset.spec_loader import load_spec # noqa: E402
class QATests(unittest.TestCase):
    """Tests for the ``iftypeset.qa`` detectors, gate evaluation and the ``qa`` CLI command.

    The HTML tests feed small documents to ``analyze_html``; the ``_detect_*``
    tests feed hand-written page text (or page/bbox dicts) straight to the
    detector helpers; the last two tests drive ``cli_mod._cmd_qa`` end to end
    inside a temporary directory.
    """

    @staticmethod
    def _web_pdf_profile():
        """Load the spec from ``ROOT / "spec"`` and return its ``web_pdf`` profile."""
        return load_spec(ROOT / "spec").profiles["web_pdf"]

    @staticmethod
    def _kinds(incidents):
        """Return the set of ``kind`` attributes found on ``incidents``."""
        return {incident.kind for incident in incidents}

    def test_analyze_html_metrics(self) -> None:
        """The report exposes ``max_link_wrap_incidents`` and it is >= 1 for a long linked URL."""
        profile = self._web_pdf_profile()
        html_text = (
            "\n"
            "<!doctype html>\n"
            "<html><body>\n"
            "<h1>1 Intro</h1>\n"
            "<h2>1.2 Skip</h2>\n"
            '<p>Paragraph with a very long URL <a href="https://example.com/this/is/a/very/long/url/that/should/trigger/and/keep/going/without/breaks">https://example.com/this/is/a/very/long/url/that/should/trigger/and/keep/going/without/breaks</a></p>\n'
            "<pre><code>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</code></pre>\n"
            "</body></html>\n"
        )
        metrics = analyze_html(html_text, profile).metrics
        self.assertIn("max_link_wrap_incidents", metrics)
        self.assertGreaterEqual(metrics["max_link_wrap_incidents"], 1)

    def test_analyze_html_finds_bare_links_overfull_and_tables(self) -> None:
        """A bare URL, a long token, a long code line and a 9-column table each yield a metric and an incident."""
        profile = self._web_pdf_profile()
        html_text = (
            "\n"
            "<!doctype html>\n"
            "<html><body>\n"
            "<p>Long bare URL https://example.com/this/is/an/extraordinarily/long/url/with/no/breaks/that/should/trigger/qa</p>\n"
            "<p>Overfull token aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</p>\n"
            "<pre><code>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb</code></pre>\n"
            "<table><tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th><th>H</th><th>I</th></tr></table>\n"
            "</body></html>\n"
        )
        report = analyze_html(html_text, profile)

        # Each metric defaults to 0 when absent, so a missing key fails the same
        # way as a zero count.
        expected_metrics = (
            "max_link_wrap_incidents",
            "max_overfull_lines",
            "max_code_overflow_incidents",
            "max_table_overflow_incidents",
        )
        for metric_name in expected_metrics:
            self.assertGreaterEqual(report.metrics.get(metric_name, 0), 1)

        found_kinds = self._kinds(report.incidents)
        for expected_kind in ("link_wrap", "overfull_token", "code_overflow", "table_overflow"):
            self.assertIn(expected_kind, found_kinds)

    def test_gate_evaluation(self) -> None:
        """A metric of 3 against a gate of 1 makes the result's ``ok`` falsy."""
        outcome = evaluate_gates(
            {"max_link_wrap_incidents": 3},
            {"max_link_wrap_incidents": 1},
            profile_id="web_pdf",
            strict=False,
        )
        self.assertFalse(outcome["ok"])

    def test_detect_widows_orphans_from_pages(self) -> None:
        """One orphan is reported at index 0 and one widow at index 1 of the two pages."""
        page_texts = [
            "Intro line\n\nOrphan line",
            "Widow line\n\nNext paragraph line 1\nNext paragraph line 2",
        ]
        widows, orphans, incidents = _detect_widows_orphans(page_texts)
        self.assertEqual(orphans[0], 1)
        self.assertEqual(widows[1], 1)
        found_kinds = self._kinds(incidents)
        self.assertIn("orphan_pdf", found_kinds)
        self.assertIn("widow_pdf", found_kinds)

    def test_detect_stranded_headings_pdf(self) -> None:
        """A short line at the end of the first page is counted as a stranded heading."""
        page_texts = [
            "Body text line\n\nHeading Near End\n",
            "Next page line 1\nNext page line 2\n",
        ]
        counts, incidents = _detect_stranded_headings_pdf(
            page_texts, keep_lines=2, heading_max_chars=40
        )
        self.assertEqual(counts[0], 1)
        self.assertIn("stranded_heading_pdf", self._kinds(incidents))

    def test_detect_pdf_overfull_lines(self) -> None:
        """An 80-character line is counted once when ``max_chars`` is 50."""
        page_texts = [
            "Short line\n" + ("A" * 80),
            "Another page\n",
        ]
        counts, incidents = _detect_pdf_overfull_lines(page_texts, max_chars=50)
        self.assertEqual(counts[0], 1)
        self.assertIn("overfull_line_pdf", self._kinds(incidents))

    def test_detect_pdf_bbox_overflow(self) -> None:
        """A text box spanning x=95..105 on a 100-wide page is counted; an inside box is not."""
        inside_box = {"left": 10, "top": 10, "width": 20, "height": 10, "text": "OK"}
        overflowing_box = {"left": 95, "top": 10, "width": 10, "height": 10, "text": "Overflow"}
        page_boxes = [
            {"number": 1, "width": 100.0, "height": 100.0, "texts": [inside_box]},
            {"number": 2, "width": 100.0, "height": 100.0, "texts": [overflowing_box]},
        ]
        counts, incidents = _detect_pdf_bbox_overflow(page_boxes, tolerance=1.0)
        self.assertEqual(counts[0], 0)
        self.assertEqual(counts[1], 1)
        self.assertIn("overfull_bbox_pdf", self._kinds(incidents))

    def test_detect_caption_separation_pdf(self) -> None:
        """A "Figure 1." caption as the first line of a page is counted once."""
        page_texts = [
            "Figure 1. Caption at top\nMore text line\n",
            "Body text\n",
        ]
        counts, incidents = _detect_caption_separation_pdf(page_texts)
        self.assertEqual(counts[0], 1)
        self.assertIn("caption_separation_pdf", self._kinds(incidents))

    def test_detect_list_breaks_pdf(self) -> None:
        """Covers a list intro split from its items and a single-item list."""
        # Scenario 1: the introducing sentence ends page one, the items start page two.
        split_intro_pages = [
            "We consider the following:\n",
            "1. First item\n2. Second item\n",
        ]
        counts, incidents = _detect_list_breaks_pdf(split_intro_pages)
        self.assertEqual(counts[0], 1)
        self.assertIn("list_intro_separated_pdf", self._kinds(incidents))

        # Scenario 2: a page whose list consists of a single bullet.
        lonely_item_pages = [
            "Body text\n\n- Only item\n",
        ]
        counts, incidents = _detect_list_breaks_pdf(lonely_item_pages)
        self.assertEqual(counts[0], 1)
        self.assertIn("list_lonely_item_pdf", self._kinds(incidents))

    def test_detect_heading_proximity_pdf(self) -> None:
        """Two short lines separated only by a blank line are counted once."""
        page_texts = [
            "Heading One\n\nHeading Two\n\nBody text\n",
            "Next page\n",
        ]
        counts, incidents = _detect_heading_proximity_pdf(page_texts, heading_max_chars=40)
        self.assertEqual(counts[0], 1)
        self.assertIn("heading_proximity_pdf", self._kinds(incidents))

    def test_detect_runt_final_page_pdf(self) -> None:
        """A 2-line final page after a 15-line page is reported as a runt."""
        page_texts = [
            "\n".join(f"Line {number}" for number in range(1, 16)),
            "Last line\nSecond line\n",
        ]
        count, incidents = _detect_runt_final_page(page_texts, min_ratio=0.33, min_lines=4)
        self.assertEqual(count, 1)
        self.assertIn("runt_final_page_pdf", self._kinds(incidents))

    def test_qa_merges_pdf_metrics_when_present(self) -> None:
        """``_cmd_qa`` merges PDF-derived metrics into ``layout-report.json``.

        ``iftypeset.qa.subprocess.run`` is patched so that ``pdfinfo``,
        ``pdftotext`` and ``pdftohtml`` return canned output; any other command
        raises ``AssertionError``.
        """
        profile = self._web_pdf_profile()
        # Two pages separated by a form feed.
        pdf_text = "Intro line\n\nOrphan line.\fWidow line\n\nNext paragraph line"
        page_xml = (
            "<doc>"
            '<page number="1" width="100" height="100">'
            '<text top="10" left="10" width="20" height="10">OK</text>'
            "</page>"
            '<page number="2" width="100" height="100">'
            '<text top="10" left="10" width="20" height="10">OK</text>'
            "</page>"
            "</doc>"
        )
        canned_stdout = {
            "pdfinfo": b"Pages: 2\n",
            "pdftotext": pdf_text.encode("utf-8"),
            "pdftohtml": page_xml.encode("utf-8"),
        }

        def fake_run(cmd, check, stdout, stderr):  # type: ignore[no-untyped-def]
            tool = cmd[0]
            if tool not in canned_stdout:
                raise AssertionError(f"Unexpected command: {cmd}")
            return subprocess.CompletedProcess(cmd, 0, stdout=canned_stdout[tool], stderr=b"")

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            html_path = out_dir / "render.html"
            pdf_path = out_dir / "render.pdf"
            html_path.write_text(
                "<!doctype html><html><body><p>Ok</p></body></html>", encoding="utf-8"
            )
            # Stub bytes only; the patched subprocess.run supplies the tool output.
            pdf_path.write_bytes(b"%PDF-1.4\n%stub\n")
            args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(out_dir),
                html=str(html_path),
                pdf=str(pdf_path),
                profile=profile.get("profile_id"),
                strict=False,
                format="json",
            )
            with patch("iftypeset.qa.subprocess.run", side_effect=fake_run):
                rc = cli_mod._cmd_qa(args)
            self.assertEqual(rc, 0)

            # The report must be read before the temporary directory is removed.
            report_path = out_dir / "layout-report.json"
            layout = json.loads(report_path.read_text(encoding="utf-8"))
            self.assertEqual(layout.get("analysis_mode"), "html+pdf")
            self.assertGreaterEqual(layout["metrics"].get("max_widows_per_10_pages", 0), 1)
            self.assertGreaterEqual(layout["metrics"].get("max_orphans_per_10_pages", 0), 1)
            self.assertIn("max_runt_final_page", layout.get("metrics", {}))
            found_kinds = {entry.get("kind") for entry in layout.get("incidents", [])}
            self.assertIn("widow_pdf", found_kinds)
            self.assertIn("orphan_pdf", found_kinds)

    def test_qa_sarif_output(self) -> None:
        """With ``format="sarif"``, ``_cmd_qa`` writes ``qa-report.sarif`` containing ``runs``."""
        profile = self._web_pdf_profile()
        html_text = "<!doctype html><html><body><p>Link https://example.com/this/is/a/very/long/url/that/should/trigger/qa</p></body></html>"
        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            html_path = out_dir / "render.html"
            html_path.write_text(html_text, encoding="utf-8")
            args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(out_dir),
                html=str(html_path),
                # This file is never created in the test.
                pdf=str(out_dir / "render.pdf"),
                profile=profile.get("profile_id"),
                strict=False,
                format="sarif",
            )
            rc = cli_mod._cmd_qa(args)
            # Either exit code is accepted; only the SARIF artefact is checked.
            self.assertIn(rc, (0, 1))
            sarif_path = out_dir / "qa-report.sarif"
            self.assertTrue(sarif_path.exists())
            payload = json.loads(sarif_path.read_text(encoding="utf-8"))
            self.assertIn("runs", payload)
if __name__ == "__main__":
    # Run the tests through unittest's command-line entry point when this
    # file is executed directly rather than imported.
    unittest.main()