# iftypeset/tests/test_qa.py

import argparse
import json
import subprocess
import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

# Project root: the directory one level above the folder that holds this file.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT.joinpath("src")
# Put the source tree first on sys.path so it is searched before anything else.
sys.path.insert(0, str(SRC))

from iftypeset import cli as cli_mod  # noqa: E402
from iftypeset.qa import (
    _detect_caption_separation_pdf,
    _detect_heading_proximity_pdf,
    _detect_list_breaks_pdf,
    _detect_pdf_bbox_overflow,
    _detect_pdf_overfull_lines,
    _detect_runt_final_page,
    _detect_stranded_headings_pdf,
    _detect_widows_orphans,
    analyze_html,
    evaluate_gates,
)  # noqa: E402
from iftypeset.spec_loader import load_spec  # noqa: E402


class QATests(unittest.TestCase):
    """Tests for the HTML analysis, the PDF page detectors and the ``qa`` CLI command."""

    @staticmethod
    def _load_web_pdf_profile():
        """Load the spec stored under ``ROOT / "spec"`` and return its ``web_pdf`` profile."""
        return load_spec(ROOT / "spec").profiles["web_pdf"]

    @staticmethod
    def _kinds_of(incidents) -> set:
        """Return the set of ``kind`` attribute values found on *incidents*."""
        return {incident.kind for incident in incidents}

    def test_analyze_html_metrics(self) -> None:
        """The sample page must yield a link-wrap metric of at least one."""
        web_pdf = self._load_web_pdf_profile()
        document = """
<!doctype html>
<html><body>
<h1>1 Intro</h1>
<h2>1.2 Skip</h2>
<p>Paragraph with a very long URL <a href="https://example.com/this/is/a/very/long/url/that/should/trigger/and/keep/going/without/breaks">https://example.com/this/is/a/very/long/url/that/should/trigger/and/keep/going/without/breaks</a></p>
<pre><code>aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</code></pre>
</body></html>
"""
        metrics = analyze_html(document, web_pdf).metrics
        self.assertIn("max_link_wrap_incidents", metrics)
        self.assertGreaterEqual(metrics["max_link_wrap_incidents"], 1)

    def test_analyze_html_finds_bare_links_overfull_and_tables(self) -> None:
        """Each of the four overflow metrics and incident kinds must show up for the sample page."""
        web_pdf = self._load_web_pdf_profile()
        document = """
<!doctype html>
<html><body>
<p>Long bare URL https://example.com/this/is/an/extraordinarily/long/url/with/no/breaks/that/should/trigger/qa</p>
<p>Overfull token aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa</p>
<pre><code>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb</code></pre>
<table><tr><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th><th>H</th><th>I</th></tr></table>
</body></html>
"""
        outcome = analyze_html(document, web_pdf)

        # A metric missing from the report is treated as zero and so fails the check.
        for metric_name in (
            "max_link_wrap_incidents",
            "max_overfull_lines",
            "max_code_overflow_incidents",
            "max_table_overflow_incidents",
        ):
            self.assertGreaterEqual(outcome.metrics.get(metric_name, 0), 1)

        seen = self._kinds_of(outcome.incidents)
        for expected_kind in (
            "link_wrap",
            "overfull_token",
            "code_overflow",
            "table_overflow",
        ):
            self.assertIn(expected_kind, seen)

    def test_gate_evaluation(self) -> None:
        """A metric of 3 against a gate of 1 must produce a result whose ``ok`` is false."""
        verdict = evaluate_gates(
            {"max_link_wrap_incidents": 3},
            {"max_link_wrap_incidents": 1},
            profile_id="web_pdf",
            strict=False,
        )
        self.assertFalse(verdict["ok"])

    def test_detect_widows_orphans_from_pages(self) -> None:
        """Expect one orphan on the first page and one widow on the second."""
        first_page = "Intro line\n\nOrphan line"
        second_page = "Widow line\n\nNext paragraph line 1\nNext paragraph line 2"
        widows, orphans, incidents = _detect_widows_orphans([first_page, second_page])
        self.assertEqual(orphans[0], 1)
        self.assertEqual(widows[1], 1)
        seen = self._kinds_of(incidents)
        self.assertIn("orphan_pdf", seen)
        self.assertIn("widow_pdf", seen)

    def test_detect_stranded_headings_pdf(self) -> None:
        """Expect one stranded heading on the first page, which ends with ``Heading Near End``."""
        page_texts = [
            "Body text line\n\nHeading Near End\n",
            "Next page line 1\nNext page line 2\n",
        ]
        per_page, incidents = _detect_stranded_headings_pdf(
            page_texts,
            keep_lines=2,
            heading_max_chars=40,
        )
        self.assertEqual(per_page[0], 1)
        self.assertIn("stranded_heading_pdf", self._kinds_of(incidents))

    def test_detect_pdf_overfull_lines(self) -> None:
        """An 80-character line must count once on the first page when ``max_chars`` is 50."""
        long_line = "A" * 80
        page_texts = ["Short line\n" + long_line, "Another page\n"]
        per_page, incidents = _detect_pdf_overfull_lines(page_texts, max_chars=50)
        self.assertEqual(per_page[0], 1)
        self.assertIn("overfull_line_pdf", self._kinds_of(incidents))

    def test_detect_pdf_bbox_overflow(self) -> None:
        """Only the second page, whose text box spans 95..105 on a 100-wide page, must be flagged."""

        def page_with_text(number, left, width, text):
            # Every fixture page is 100 x 100 and holds a single text box at top=10.
            return {
                "number": number,
                "width": 100.0,
                "height": 100.0,
                "texts": [
                    {"left": left, "top": 10, "width": width, "height": 10, "text": text},
                ],
            }

        layout_pages = [
            page_with_text(1, 10, 20, "OK"),
            page_with_text(2, 95, 10, "Overflow"),
        ]
        per_page, incidents = _detect_pdf_bbox_overflow(layout_pages, tolerance=1.0)
        self.assertEqual(per_page[0], 0)
        self.assertEqual(per_page[1], 1)
        self.assertIn("overfull_bbox_pdf", self._kinds_of(incidents))

    def test_detect_caption_separation_pdf(self) -> None:
        """A page that opens with ``Figure 1. Caption at top`` must count once."""
        page_texts = [
            "Figure 1. Caption at top\nMore text line\n",
            "Body text\n",
        ]
        per_page, incidents = _detect_caption_separation_pdf(page_texts)
        self.assertEqual(per_page[0], 1)
        self.assertIn("caption_separation_pdf", self._kinds_of(incidents))

    def test_detect_list_breaks_pdf(self) -> None:
        """Cover two fixtures: a list intro split from its items, and a single-item list."""
        # Fixture 1: the intro line is alone on page one, the numbered items are on page two.
        split_intro_pages = [
            "We consider the following:\n",
            "1. First item\n2. Second item\n",
        ]
        per_page, incidents = _detect_list_breaks_pdf(split_intro_pages)
        self.assertEqual(per_page[0], 1)
        self.assertIn("list_intro_separated_pdf", self._kinds_of(incidents))

        # Fixture 2: one page whose list consists of a single bullet.
        single_item_pages = ["Body text\n\n- Only item\n"]
        per_page, incidents = _detect_list_breaks_pdf(single_item_pages)
        self.assertEqual(per_page[0], 1)
        self.assertIn("list_lonely_item_pdf", self._kinds_of(incidents))

    def test_detect_heading_proximity_pdf(self) -> None:
        """Two heading-like lines in a row on the first page must count once."""
        page_texts = [
            "Heading One\n\nHeading Two\n\nBody text\n",
            "Next page\n",
        ]
        per_page, incidents = _detect_heading_proximity_pdf(page_texts, heading_max_chars=40)
        self.assertEqual(per_page[0], 1)
        self.assertIn("heading_proximity_pdf", self._kinds_of(incidents))

    def test_detect_runt_final_page_pdf(self) -> None:
        """A two-line last page after a fifteen-line page must be reported as a runt."""
        full_page = "\n".join(f"Line {number}" for number in range(1, 16))
        short_last_page = "Last line\nSecond line\n"
        total, incidents = _detect_runt_final_page(
            [full_page, short_last_page],
            min_ratio=0.33,
            min_lines=4,
        )
        self.assertEqual(total, 1)
        self.assertIn("runt_final_page_pdf", self._kinds_of(incidents))

    def test_qa_merges_pdf_metrics_when_present(self) -> None:
        """Run the ``qa`` command with stubbed PDF tools and check the written layout report."""
        web_pdf = self._load_web_pdf_profile()
        pdf_text = "Intro line\n\nOrphan line.\fWidow line\n\nNext paragraph line"
        layout_xml = (
            '<doc>'
            '<page number="1" width="100" height="100">'
            '<text top="10" left="10" width="20" height="10">OK</text>'
            '</page>'
            '<page number="2" width="100" height="100">'
            '<text top="10" left="10" width="20" height="10">OK</text>'
            '</page>'
            '</doc>'
        )
        # Canned stdout for each external tool, keyed by the first element of the command.
        canned_stdout = {
            "pdfinfo": b"Pages: 2\n",
            "pdftotext": pdf_text.encode("utf-8"),
            "pdftohtml": layout_xml.encode("utf-8"),
        }

        def fake_run(cmd, check, stdout, stderr):  # type: ignore[no-untyped-def]
            # Stand-in for subprocess.run: succeed for known tools, fail loudly otherwise.
            tool = cmd[0]
            if tool not in canned_stdout:
                raise AssertionError(f"Unexpected command: {cmd}")
            return subprocess.CompletedProcess(cmd, 0, stdout=canned_stdout[tool], stderr=b"")

        with tempfile.TemporaryDirectory() as scratch:
            workdir = Path(scratch)
            html_file = workdir / "render.html"
            pdf_file = workdir / "render.pdf"
            html_file.write_text("<!doctype html><html><body><p>Ok</p></body></html>", encoding="utf-8")
            # Stub file only; its contents come from the canned tool output above.
            pdf_file.write_bytes(b"%PDF-1.4\n%stub\n")

            cli_args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(workdir),
                html=str(html_file),
                pdf=str(pdf_file),
                profile=web_pdf.get("profile_id"),
                strict=False,
                format="json",
            )

            with patch("iftypeset.qa.subprocess.run", side_effect=fake_run):
                exit_code = cli_mod._cmd_qa(cli_args)

            self.assertEqual(exit_code, 0)
            report_text = (workdir / "layout-report.json").read_text(encoding="utf-8")
            layout = json.loads(report_text)
            self.assertEqual(layout.get("analysis_mode"), "html+pdf")
            metrics = layout["metrics"]
            self.assertGreaterEqual(metrics.get("max_widows_per_10_pages", 0), 1)
            self.assertGreaterEqual(metrics.get("max_orphans_per_10_pages", 0), 1)
            self.assertIn("max_runt_final_page", layout.get("metrics", {}))
            reported = {entry.get("kind") for entry in layout.get("incidents", [])}
            self.assertIn("widow_pdf", reported)
            self.assertIn("orphan_pdf", reported)

    def test_qa_sarif_output(self) -> None:
        """With ``format="sarif"`` the command must write ``qa-report.sarif`` containing ``runs``."""
        web_pdf = self._load_web_pdf_profile()
        html_text = "<!doctype html><html><body><p>Link https://example.com/this/is/a/very/long/url/that/should/trigger/qa</p></body></html>"

        with tempfile.TemporaryDirectory() as scratch:
            workdir = Path(scratch)
            html_file = workdir / "render.html"
            html_file.write_text(html_text, encoding="utf-8")

            # The PDF path is passed to the command but no file is created at it.
            cli_args = argparse.Namespace(
                spec=str(ROOT / "spec"),
                out=str(workdir),
                html=str(html_file),
                pdf=str(workdir / "render.pdf"),
                profile=web_pdf.get("profile_id"),
                strict=False,
                format="sarif",
            )

            exit_code = cli_mod._cmd_qa(cli_args)
            # Either exit code is accepted; only the SARIF file is under test here.
            self.assertIn(exit_code, (0, 1))
            sarif_file = workdir / "qa-report.sarif"
            self.assertTrue(sarif_file.exists())
            sarif = json.loads(sarif_file.read_text(encoding="utf-8"))
            self.assertIn("runs", sarif)


# Run the tests through unittest's command-line runner when this file is executed directly.
if __name__ == "__main__":
    unittest.main()