# re-voice/src/revoice/lint.py

from __future__ import annotations

import hashlib
import json
import re


# Best-effort emoji detector (not exhaustive). The character class covers:
#   U+1F300-U+1FAFF  misc symbols & pictographs plus the extended blocks
#   U+2600-U+27BF    misc symbols / dingbats
# The trailing "+" makes each match a maximal run of such characters.
_EMOJI_RE = re.compile("[\U0001F300-\U0001FAFF\u2600-\u27BF]+")


# Minimum number of Mermaid diagrams required for each known Dave style id.
# Keys are lower-case; lookups lower-case the caller's id first. A style id
# that is absent from this table is reported as unknown.
_DAVE_MIN_MERMAID: dict[str, int] = {
    # Linted with the Dave rules, but no diagram is required.
    **dict.fromkeys(
        (
            "if.dave.v1",
            "if.dave.v1.1",
            "if://bible/dave/v1.0",
            "if://bible/dave/v1.1",
        ),
        0,
    ),
    # At least one diagram.
    **dict.fromkeys(
        (
            "if.dave.v1.2",
            "if.dave.v1.3",
            "if.dave.v1.6",
            "if.dave.fr.v1.2",
            "if.dave.fr.v1.3",
            "dave",
            "if://bible/dave/v1.2",
            "if://bible/dave/v1.3",
            "if://bible/dave/v1.6",
            "if://bible/dave/fr/v1.2",
            "if://bible/dave/fr/v1.3",
        ),
        1,
    ),
    # At least two diagrams.
    **dict.fromkeys(
        (
            "if.dave.v1.7",
            "if.dave.v1.8",
            "if.dave.v1.9",
            "if.dave.v2.0",
            "if.dave.v2.1",
            "if.dave.v2.2",
            "if.dave.v2.3",
            "if://bible/dave/v1.7",
            "if://bible/dave/v1.8",
            "if://bible/dave/v1.9",
            "if://bible/dave/v2.0",
            "if://bible/dave/v2.1",
            "if://bible/dave/v2.2",
            "if://bible/dave/v2.3",
        ),
        2,
    ),
}


def _lint_for_style(style_id: str, markdown: str, source_text: str | None) -> list[str]:
    """Dispatch ``markdown`` to the linter registered for ``style_id``.

    The id is matched case-insensitively. Unknown ids yield a single
    ``"Unknown style id: ..."`` issue that echoes the id exactly as given.
    """
    min_mermaid = _DAVE_MIN_MERMAID.get(style_id.lower())
    if min_mermaid is None:
        return [f"Unknown style id: {style_id}"]
    return _lint_dave(markdown, source_text=source_text, min_mermaid=min_mermaid)


def lint_markdown(*, style_id: str, markdown: str) -> list[str]:
    """Lint ``markdown`` against the rules of ``style_id`` without a source document.

    Args:
        style_id: Style identifier, matched case-insensitively.
        markdown: The markdown text to check.

    Returns:
        A list of human-readable issue strings; empty when nothing was found.
        An unrecognised ``style_id`` produces one "Unknown style id" issue.
    """
    return _lint_for_style(style_id, markdown, None)


def lint_markdown_with_source(*, style_id: str, markdown: str, source_text: str) -> list[str]:
    """Lint ``markdown`` against ``style_id``, using ``source_text`` as reference.

    Identical to :func:`lint_markdown` except that ``source_text`` is handed
    to the style linter, which uses it to decide which emoji are permitted.

    Args:
        style_id: Style identifier, matched case-insensitively.
        markdown: The markdown text to check.
        source_text: The original document the markdown was derived from.

    Returns:
        A list of human-readable issue strings; empty when nothing was found.
        An unrecognised ``style_id`` produces one "Unknown style id" issue.
    """
    return _lint_for_style(style_id, markdown, source_text)


def _lint_dave(md: str, *, source_text: str | None, min_mermaid: int) -> list[str]:
    """Run every Dave-style check on ``md`` and collect the findings.

    Args:
        md: The markdown text to check.
        source_text: The original document, or ``None`` when it is not
            available. Emoji found in it are permitted in ``md``; with
            ``None`` every emoji in ``md`` is reported.
        min_mermaid: Minimum number of "```mermaid" fences ``md`` must
            contain; ``0`` disables the check.

    Returns:
        Issue strings in check order: footers, emoji, Mermaid count, then the
        duplicate-Mermaid, duplicate-callout, JSON-block and repeated-line
        checks. Empty when nothing was found.
    """
    issues: list[str] = []

    for footer in ("InfraFabric Red Team Footer", "Standard Dave Footer"):
        if footer not in md:
            issues.append(f"Missing required footer: {footer}")

    # _EMOJI_RE matches maximal *runs* of emoji. Comparing whole runs would
    # flag "XX" or "XY" in the markdown even though the source contains X and
    # Y individually, so break every run into single characters first.
    allowed_emojis = {ch for run in _EMOJI_RE.findall(source_text or "") for ch in run}
    present_emojis = {ch for run in _EMOJI_RE.findall(md) for ch in run}
    disallowed = sorted(present_emojis - allowed_emojis)
    if disallowed:
        # Cap the listing at ten entries to keep the message readable.
        overflow = " ..." if len(disallowed) > 10 else ""
        issues.append("Contains emoji not present in source: " + " ".join(disallowed[:10]) + overflow)

    mermaid_count = len(re.findall(r"```mermaid\b", md))
    if mermaid_count < min_mermaid:
        issues.append(f"Missing required Mermaid diagrams: expected>={min_mermaid} got={mermaid_count}")

    issues.extend(_lint_duplicate_mermaid(md))
    issues.extend(_lint_duplicate_dave_factor(md))
    issues.extend(_lint_json_blocks(md))
    issues.extend(_lint_repeated_lines(md))

    return issues


def _stable_hash(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8.

    Characters that cannot be encoded are substituted (``errors="replace"``)
    rather than raising.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8", errors="replace"))
    return hasher.hexdigest()

def _lint_duplicate_mermaid(md: str) -> list[str]:
    """Report Mermaid diagrams that occur more than once in ``md``.

    Diagrams are compared after dropping blank lines and trailing whitespace;
    diagrams that are empty after that normalisation are ignored. One issue is
    emitted per duplicated diagram, most frequent first (ties ordered by hash).
    """
    diagrams = re.findall(r"```mermaid\s*([\s\S]*?)```", md, flags=re.MULTILINE)
    if len(diagrams) < 2:
        return []

    tally: dict[str, int] = {}
    for diagram in diagrams:
        kept_rows = [row.rstrip() for row in diagram.strip().splitlines() if row.strip()]
        canonical = "\n".join(kept_rows)
        if canonical:
            digest = _stable_hash(canonical)
            tally[digest] = tally.get(digest, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return [
        f"Duplicate Mermaid diagram appears {times} times (sha256:{digest[:12]})"
        for digest, times in ranked
        if times > 1
    ]


def _lint_duplicate_dave_factor(md: str) -> list[str]:
    """Report "Dave Factor" callouts that occur more than once in ``md``.

    A callout is a line beginning with ``> **The Dave Factor:**`` plus, when
    the very next line begins with ``> **Countermeasure:**``, that line too.
    One issue is emitted per duplicated callout, most frequent first (ties
    ordered by hash).
    """
    rows = md.splitlines()
    callouts: list[str] = []
    for pos, row in enumerate(rows):
        head = row.rstrip()
        if not head.startswith("> **The Dave Factor:**"):
            continue
        parts = [head.strip()]
        # Peek at the following line; an empty stand-in never matches.
        follower = rows[pos + 1] if pos + 1 < len(rows) else ""
        if follower.rstrip().startswith("> **Countermeasure:**"):
            parts.append(follower.strip())
        callouts.append("\n".join(parts))

    if len(callouts) < 2:
        return []

    tally: dict[str, int] = {}
    for callout in callouts:
        digest = _stable_hash(callout.strip())
        tally[digest] = tally.get(digest, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return [
        f"Duplicate Dave Factor callout appears {times} times (sha256:{digest[:12]})"
        for digest, times in ranked
        if times > 1
    ]


def _lint_json_blocks(md: str) -> list[str]:
    """Report "```json" fenced blocks in ``md`` whose content is not valid JSON.

    Empty blocks are skipped. Blocks are numbered from 1 in document order,
    counting empty ones, so the number in a message identifies the fence.

    Returns:
        One ``"Invalid JSON code block #N: <parser message>"`` per bad block.
    """
    # The word boundary keeps look-alike info strings (jsonc, json5, jsonl)
    # out: without it their suffix is glued onto the payload and the block is
    # always reported as invalid JSON.
    payloads = re.findall(r"```json\b\s*([\s\S]*?)```", md)
    issues: list[str] = []
    for idx, raw in enumerate(payloads, 1):
        text = raw.strip()
        if not text:
            continue
        try:
            json.loads(text)
        except (ValueError, RecursionError) as e:
            # ValueError covers json.JSONDecodeError; RecursionError covers
            # pathologically deep nesting. Both mean "not usable JSON".
            issues.append(f"Invalid JSON code block #{idx}: {e}")
    return issues


def _lint_repeated_lines(md: str) -> list[str]:
    """Report prose lines that appear three or more times in ``md``.

    Lines are compared after stripping surrounding whitespace. Ignored are:
    anything inside a ``` fence (and the fence lines themselves), headings
    (``#``), block quotes (``>``), ``- Acceptance:`` bullets, and lines shorter
    than 18 characters. Results are ordered most frequent first, ties
    alphabetically; lines longer than 120 characters are truncated with an
    ellipsis in the message.
    """
    tally: dict[str, int] = {}
    inside_code = False
    for raw in md.splitlines():
        text = raw.strip()
        if text.startswith("```"):
            # Every fence line toggles the state, whether opening or closing.
            inside_code = not inside_code
            continue
        if inside_code or len(text) < 18:
            continue
        # Headings and quotes are structural; the Action Pack backlog uses
        # consistent acceptance criteria by design.
        if text.startswith(("#", ">", "- Acceptance:")):
            continue
        tally[text] = tally.get(text, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return [
        f"Repeated line appears {times} times: {line[:120]}{'…' if len(line) > 120 else ''}"
        for line, times in ranked
        if times >= 3
    ]