# File-viewer header (extraction residue): 261 lines, 10 KiB, Python
import argparse
import json
import subprocess
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

# ROOT is the parent of the directory that contains this file (parents[1]).
# Its "src" subdirectory is pushed to the front of sys.path so that the
# `iftypeset` imports below resolve; that is why they must come after this
# statement and carry `noqa: E402`.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
sys.path.insert(0, str(SRC))

from iftypeset import cli as cli_mod  # noqa: E402
from iftypeset.qa import (
    _detect_caption_separation_pdf,
    _detect_heading_proximity_pdf,
    _detect_list_breaks_pdf,
    _detect_pdf_bbox_overflow,
    _detect_pdf_overfull_lines,
    _detect_runt_final_page,
    _detect_stranded_headings_pdf,
    _detect_widows_orphans,
    analyze_html,
    evaluate_gates,
)  # noqa: E402
from iftypeset.spec_loader import load_spec  # noqa: E402


class QATests(unittest.TestCase):
    """Tests for the HTML/PDF layout checks in ``iftypeset.qa`` and the ``qa`` CLI command.

    The detector tests feed small hand-written inputs (page texts or page
    dictionaries) to the private ``_detect_*`` helpers and assert on the
    returned counts and incident kinds.  The two CLI tests drive
    ``cli_mod._cmd_qa`` against a temporary output directory.
    """

    @staticmethod
    def _web_pdf_profile():  # type: ignore[no-untyped-def]
        """Load the spec found under ``ROOT / "spec"`` and return its ``web_pdf`` profile."""
        return load_spec(ROOT / "spec").profiles["web_pdf"]

    def test_analyze_html_metrics(self) -> None:
        """``analyze_html`` reports ``max_link_wrap_incidents`` >= 1 for a long linked URL."""
        profile = self._web_pdf_profile()
        # Fixture text is kept flush-left so the string holds exactly the
        # markup lines and nothing else.
        html_text = """
<!doctype html>
<html><body>
<h1>1 Intro</h1>
<h2>1.2 Skip</h2>
<p>Paragraph with a very long URL <a href="https://example.com/this/is/a/very/long/url/that/should/trigger/and/keep/going/without/breaks">https://example.com/this/is/a/very/long/url/that/should/trigger/and/keep/going/without/breaks</a></p>
<pre><code>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</code></pre>
</body></html>
"""
        report = analyze_html(html_text, profile)
        self.assertIn("max_link_wrap_incidents", report.metrics)
        self.assertGreaterEqual(report.metrics["max_link_wrap_incidents"], 1)

    def test_analyze_html_finds_bare_links_overfull_and_tables(self) -> None:
        """One document triggers link-wrap, overfull-token, code and table overflow findings."""
        profile = self._web_pdf_profile()
        # Fixture text is kept flush-left so the string holds exactly the
        # markup lines and nothing else.
        html_text = """
<!doctype html>
<html><body>
<p>Long bare URL https://example.com/this/is/an/extraordinarily/long/url/with/no/breaks/that/should/trigger/qa</p>
<p>Overfull token aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</p>
<pre><code>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb</code></pre>
<table><tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th><th>H</th><th>I</th></tr></table>
</body></html>
"""
        report = analyze_html(html_text, profile)

        # Each metric must be present and count at least one incident.
        self.assertGreaterEqual(report.metrics.get("max_link_wrap_incidents", 0), 1)
        self.assertGreaterEqual(report.metrics.get("max_overfull_lines", 0), 1)
        self.assertGreaterEqual(report.metrics.get("max_code_overflow_incidents", 0), 1)
        self.assertGreaterEqual(report.metrics.get("max_table_overflow_incidents", 0), 1)

        # ...and each one must be backed by an incident of the matching kind.
        kinds = {i.kind for i in report.incidents}
        self.assertIn("link_wrap", kinds)
        self.assertIn("overfull_token", kinds)
        self.assertIn("code_overflow", kinds)
        self.assertIn("table_overflow", kinds)

    def test_gate_evaluation(self) -> None:
        """A metric value of 3 against a gate of 1 yields a falsy ``result["ok"]``."""
        metrics = {"max_link_wrap_incidents": 3}
        gates = {"max_link_wrap_incidents": 1}
        result = evaluate_gates(metrics, gates, profile_id="web_pdf", strict=False)
        self.assertFalse(result["ok"])

    def test_detect_widows_orphans_from_pages(self) -> None:
        """Expects one orphan on page index 0 and one widow on page index 1."""
        pages = [
            "Intro line\n\nOrphan line",
            "Widow line\n\nNext paragraph line 1\nNext paragraph line 2",
        ]
        widows, orphans, incidents = _detect_widows_orphans(pages)
        self.assertEqual(orphans[0], 1)
        self.assertEqual(widows[1], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("orphan_pdf", kinds)
        self.assertIn("widow_pdf", kinds)

    def test_detect_stranded_headings_pdf(self) -> None:
        """A short line ending the first page is counted as a stranded heading."""
        pages = [
            "Body text line\n\nHeading Near End\n",
            "Next page line 1\nNext page line 2\n",
        ]
        counts, incidents = _detect_stranded_headings_pdf(pages, keep_lines=2, heading_max_chars=40)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("stranded_heading_pdf", kinds)

    def test_detect_pdf_overfull_lines(self) -> None:
        """An 80-character line is counted once when ``max_chars`` is 50."""
        pages = [
            "Short line\n" + ("A" * 80),
            "Another page\n",
        ]
        counts, incidents = _detect_pdf_overfull_lines(pages, max_chars=50)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("overfull_line_pdf", kinds)

    def test_detect_pdf_bbox_overflow(self) -> None:
        """Only the page whose text box extends past the page width is flagged."""
        pages = [
            {
                "number": 1,
                "width": 100.0,
                "height": 100.0,
                "texts": [{"left": 10, "top": 10, "width": 20, "height": 10, "text": "OK"}],
            },
            {
                "number": 2,
                "width": 100.0,
                "height": 100.0,
                # left + width = 105, beyond the 100-wide page even with tolerance 1.0.
                "texts": [{"left": 95, "top": 10, "width": 10, "height": 10, "text": "Overflow"}],
            },
        ]
        counts, incidents = _detect_pdf_bbox_overflow(pages, tolerance=1.0)
        self.assertEqual(counts[0], 0)
        self.assertEqual(counts[1], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("overfull_bbox_pdf", kinds)

    def test_detect_caption_separation_pdf(self) -> None:
        """A "Figure 1." caption as the first line of a page is counted once."""
        pages = [
            "Figure 1. Caption at top\nMore text line\n",
            "Body text\n",
        ]
        counts, incidents = _detect_caption_separation_pdf(pages)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("caption_separation_pdf", kinds)

    def test_detect_list_breaks_pdf(self) -> None:
        """Covers two list findings: intro split from its items, and a lone list item."""
        with self.subTest("list intro separated from its items"):
            pages = [
                "We consider the following:\n",
                "1. First item\n2. Second item\n",
            ]
            counts, incidents = _detect_list_breaks_pdf(pages)
            self.assertEqual(counts[0], 1)
            kinds = {i.kind for i in incidents}
            self.assertIn("list_intro_separated_pdf", kinds)

        with self.subTest("single list item on a page"):
            pages = [
                "Body text\n\n- Only item\n",
            ]
            counts, incidents = _detect_list_breaks_pdf(pages)
            self.assertEqual(counts[0], 1)
            kinds = {i.kind for i in incidents}
            self.assertIn("list_lonely_item_pdf", kinds)

    def test_detect_heading_proximity_pdf(self) -> None:
        """Two short lines in a row, with no body between them, are counted once."""
        pages = [
            "Heading One\n\nHeading Two\n\nBody text\n",
            "Next page\n",
        ]
        counts, incidents = _detect_heading_proximity_pdf(pages, heading_max_chars=40)
        self.assertEqual(counts[0], 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("heading_proximity_pdf", kinds)

    def test_detect_runt_final_page_pdf(self) -> None:
        """A 2-line final page after a 15-line page is reported as a runt."""
        pages = [
            "\n".join([f"Line {i}" for i in range(1, 16)]),
            "Last line\nSecond line\n",
        ]
        count, incidents = _detect_runt_final_page(pages, min_ratio=0.33, min_lines=4)
        self.assertEqual(count, 1)
        kinds = {i.kind for i in incidents}
        self.assertIn("runt_final_page_pdf", kinds)

    def test_qa_merges_pdf_metrics_when_present(self) -> None:
        """With a PDF present, ``_cmd_qa`` writes an ``html+pdf`` layout report with PDF metrics.

        ``iftypeset.qa.subprocess.run`` is patched, so no external PDF tool is
        executed; the stub PDF file only needs to exist on disk.
        """
        profile = self._web_pdf_profile()
        # Canned pdftotext output; the form feed (\f) presumably marks the page
        # boundary, matching the "Pages: 2" reported by the fake pdfinfo.
        pdf_text = "Intro line\n\nOrphan line.\fWidow line\n\nNext paragraph line"

        def fake_run(cmd, check, stdout, stderr):  # type: ignore[no-untyped-def]
            """Stand-in for ``subprocess.run`` returning canned output per tool name."""
            if cmd[0] == "pdfinfo":
                return subprocess.CompletedProcess(cmd, 0, stdout=b"Pages: 2\n", stderr=b"")
            if cmd[0] == "pdftotext":
                return subprocess.CompletedProcess(cmd, 0, stdout=pdf_text.encode("utf-8"), stderr=b"")
            if cmd[0] == "pdftohtml":
                xml = (
                    "<doc>"
                    "<page number=\"1\" width=\"100\" height=\"100\">"
                    "<text top=\"10\" left=\"10\" width=\"20\" height=\"10\">OK</text>"
                    "</page>"
                    "<page number=\"2\" width=\"100\" height=\"100\">"
                    "<text top=\"10\" left=\"10\" width=\"20\" height=\"10\">OK</text>"
                    "</page>"
                    "</doc>"
                )
                return subprocess.CompletedProcess(cmd, 0, stdout=xml.encode("utf-8"), stderr=b"")
            # Any other command means the code under test changed its tooling.
            raise AssertionError(f"Unexpected command: {cmd}")

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            html_path = out_dir / "render.html"
            pdf_path = out_dir / "render.pdf"
            html_path.write_text("<!doctype html><html><body><p>Ok</p></body></html>", encoding="utf-8")
            pdf_path.write_bytes(b"%PDF-1.4\n%stub\n")

            args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(out_dir),
                html=str(html_path),
                pdf=str(pdf_path),
                profile=profile.get("profile_id"),
                strict=False,
                format="json",
            )

            with patch("iftypeset.qa.subprocess.run", side_effect=fake_run):
                rc = cli_mod._cmd_qa(args)

            # The report must be read while the temporary directory still exists.
            self.assertEqual(rc, 0)
            layout = json.loads((out_dir / "layout-report.json").read_text(encoding="utf-8"))
            self.assertEqual(layout.get("analysis_mode"), "html+pdf")
            self.assertGreaterEqual(layout["metrics"].get("max_widows_per_10_pages", 0), 1)
            self.assertGreaterEqual(layout["metrics"].get("max_orphans_per_10_pages", 0), 1)
            self.assertIn("max_runt_final_page", layout.get("metrics", {}))
            kinds = {i.get("kind") for i in layout.get("incidents", [])}
            self.assertIn("widow_pdf", kinds)
            self.assertIn("orphan_pdf", kinds)

    def test_qa_sarif_output(self) -> None:
        """``format="sarif"`` makes ``_cmd_qa`` write ``qa-report.sarif`` with a ``runs`` key.

        The ``pdf`` argument points at a path that is never created, so only
        the HTML input exists on disk.
        """
        profile = self._web_pdf_profile()
        html_text = "<!doctype html><html><body><p>Link https://example.com/this/is/a/very/long/url/that/should/trigger/qa</p></body></html>"

        with tempfile.TemporaryDirectory() as tmp:
            out_dir = Path(tmp)
            html_path = out_dir / "render.html"
            html_path.write_text(html_text, encoding="utf-8")

            args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(out_dir),
                html=str(html_path),
                pdf=str(out_dir / "render.pdf"),
                profile=profile.get("profile_id"),
                strict=False,
                format="sarif",
            )

            rc = cli_mod._cmd_qa(args)
            # Either exit code is accepted; this test only checks the SARIF artifact.
            self.assertIn(rc, (0, 1))
            sarif_path = out_dir / "qa-report.sarif"
            self.assertTrue(sarif_path.exists())
            payload = json.loads(sarif_path.read_text(encoding="utf-8"))
            self.assertIn("runs", payload)


# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()