--- FILE: spec/schema/rule.schema.json --- { "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://example.invalid/pubstyle/spec/schema/rule.schema.json", "title": "Publication-quality rule record", "type": "object", "additionalProperties": false, "required": [ "id", "title", "source_refs", "category", "severity", "applies_to", "rule_text", "rationale", "enforcement", "autofix", "autofix_notes", "tags", "keywords", "dependencies", "exceptions", "status" ], "properties": { "id": { "type": "string", "description": "Stable rule identifier. Prefix must be one of CMOS, BRING, HOUSE.", "minLength": 6, "maxLength": 120, "pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$" }, "title": { "type": "string", "description": "Short human-readable rule title.", "minLength": 4, "maxLength": 160 }, "source_refs": { "type": "array", "description": "Pointers back to sources. Must be pointers, not quotes. Prefer: \"CMOS18 §X.Y pN\" / \"BRING §X.Y pN\" / \"HOUSE §X.Y pN\".", "minItems": 1, "items": { "type": "string", "minLength": 8, "maxLength": 120, "pattern": "^(CMOS18|BRING|HOUSE)\\s§[0-9A-Za-z][0-9A-Za-z.\\-]*\\sp[0-9ivxlcdmIVXLCDM]+(?:-[0-9ivxlcdmIVXLCDM]+)?(?:\\s\\(scan p[0-9]+\\))?$" } }, "category": { "type": "string", "description": "Primary taxonomy bucket.", "enum": [ "editorial", "typography", "layout", "headings", "citations", "numbers", "punctuation", "abbreviations", "links", "tables", "figures", "code", "frontmatter", "backmatter", "accessibility", "i18n" ] }, "severity": { "type": "string", "description": "Normativity level. MUST blocks release unless downgraded by profile.", "enum": ["must", "should", "warn"] }, "applies_to": { "type": "string", "description": "Which pipeline stage(s) the rule targets.", "enum": ["md", "html", "pdf", "all"] }, "rule_text": { "type": "string", "description": "Paraphrased rule statement (no long quotes). 
If exact wording matters, note: \"Exact wording required—refer to pointer\".", "minLength": 10, "maxLength": 800 }, "rationale": { "type": "string", "description": "One-line rationale.", "minLength": 5, "maxLength": 200 }, "enforcement": { "type": "string", "description": "Primary enforcement mechanism.", "enum": ["lint", "typeset", "postrender", "manual"] }, "autofix": { "type": "string", "description": "Autofix capability, if any.", "enum": ["none", "rewrite", "reflow", "suggest"] }, "autofix_notes": { "type": "string", "description": "Notes describing what can be fixed and how/when. Keep short; never include book quotes.", "maxLength": 400 }, "tags": { "type": "array", "description": "Compact labels for routing/search/overrides (e.g., 'manual_checklist=true', 'widows_orphans', 'hyphenation').", "items": { "type": "string", "minLength": 1, "maxLength": 48, "pattern": "^[a-z0-9][a-z0-9_.:\\-/]*(?:=[a-z0-9_.:\\-/]+)?$" }, "maxItems": 64 }, "keywords": { "type": "array", "description": "Search keywords (human-oriented; not necessarily normalized).", "items": { "type": "string", "minLength": 2, "maxLength": 48 }, "maxItems": 64 }, "dependencies": { "type": "array", "description": "Rule IDs that should be applied/understood first.", "items": { "type": "string", "pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$" }, "maxItems": 32 }, "exceptions": { "type": "array", "description": "Free-text exceptions/caveats. 
Keep concise.", "items": { "type": "string", "minLength": 3, "maxLength": 240 }, "maxItems": 32 }, "examples_ref": { "type": "array", "description": "Optional references to separately stored examples (see spec/examples/README.md).", "items": { "type": "string", "minLength": 6, "maxLength": 80, "pattern": "^EX\\.[A-Z0-9_]+\\.[A-Z0-9_]+\\.[0-9]{3,}$" }, "maxItems": 64 }, "implementation_notes": { "type": "string", "description": "Optional short notes for implementers (no quotes).", "minLength": 3, "maxLength": 600 }, "status": { "type": "string", "description": "Lifecycle state.", "enum": ["draft", "active", "deprecated"] } }, "allOf": [ { "if": { "properties": { "autofix": { "enum": ["rewrite", "reflow", "suggest"] } }, "required": ["autofix"] }, "then": { "properties": { "autofix_notes": { "minLength": 1 } } } } ] } --- END FILE --- --- FILE: spec/manifest.yaml --- version: "0.1.0" registry_id: "pubstyle" description: > Machine-readable style+typesetting rules for a Markdown→HTML→PDF pipeline, backed by primary references (Chicago / Bringhurst) and optional house rules. Rules are paraphrases only; sources are referenced by pointer strings. id_naming: prefixes: CMOS: "Editorial/style usage rules derived primarily from Chicago." BRING: "Typographic/layout rules derived primarily from Bringhurst." HOUSE: "Project-specific rules not directly sourced to Chicago/Bringhurst." pattern: "PREFIX.DOMAIN.TOPIC[.SUBTOPIC[.DETAIL...]]" delimiter: "." casing: "UPPER_SNAKE for segments" stability: rule_ids_are_immutable: true rename_policy: "Deprecate old id; introduce new id; keep mapping in report diffs." examples: - "CMOS.PUNCTUATION.DASHES.EM_DASH" - "BRING.LAYOUT.WIDOWS_ORPHANS.AVOID" - "HOUSE.CITATIONS.DOI.PREFER_HTTPS" source_pointer_scheme: goal: "Provide auditable traceability without reproducing sources." pointer_format_primary: "CMOS18 §X.Y pN" pointer_format_secondary: "BRING §X.Y pN" pointer_format_house: "HOUSE §X.Y pN" optional_scan_hint: "(scan pN)" allowed_page_numbering: ["arabic", "roman"] notes: - "Pointers must be sufficient for a reader with the book to locate the guidance." - "Never store verbatim passages; paraphrase only." - "If a rule depends on exact wording, rule_text must say: 'Exact wording required—refer to pointer'." category_taxonomy: - editorial - typography - layout - headings - citations - numbers - punctuation - abbreviations - links - tables - figures - code - frontmatter - backmatter - accessibility - i18n profiles: - web_pdf - print_pdf - dense_tech - memo - slide_deck planned_rule_counts: target_total_range: [800, 1500] target_by_category: editorial: 120 typography: 170 layout: 140 headings: 70 citations: 140 numbers: 90 punctuation: 120 abbreviations: 60 links: 50 tables: 60 figures: 50 code: 70 frontmatter: 40 backmatter: 40 accessibility: 90 i18n: 60 coverage_contract: must_rules: enforceability_requirement: > Every MUST rule must be enforceable by at least one of: lint, typeset, postrender; otherwise it must be explicitly labeled as a manual checklist item and emitted in a checklist output artifact. manual_checklist_tag: "manual_checklist=true" checklist_artifact: "manual-checklist.md (and JSON mirror)" should_rules: policy: "Should rules should be enforceable when practical; otherwise allowed as manual with explicit rationale." warn_rules: policy: "Warnings may be non-blocking and advisory; still require source pointers." enforcement_definitions: lint: "Static analysis over normalized Markdown/HTML AST. Deterministic." typeset: "CSS/tokens shaping decisions prior to rendering (pagination, keeps, hyphenation parameters)." postrender: "PDF/HTML layout inspection (widows/orphans, overflow, keep failures, numbering mismatches)." manual: "Human review; system must still produce checklist items and traceability pointers." 
ci_guardrails: coverage_floor: must_implemented_min_percent: 95 overall_implemented_min_percent: 80 regression_rule: "CI fails if implemented coverage decreases from main branch." degraded_mode_contract: purpose: "Handle badly-structured inputs safely without crashing; still provide useful output." triggers: - "Markdown parse errors / invalid UTF-8" - "Missing heading hierarchy (no H1/H2 etc.)" - "Garbage extraction (e.g., line breaks every word, excessive hard wraps)" - "Mixed language with no lang metadata" behavior: normalize: attempt_repairs: - "Normalize whitespace and line endings" - "Detect and unwrap hard-wrapped paragraphs heuristically" - "Infer heading levels from patterns (e.g., '1.', '1.1', ALL CAPS lines) with low confidence" if_unrecoverable: - "Fall back to minimal AST: paragraphs + code blocks + raw spans" - "Mark document structure confidence = low" enforcement_in_degraded_mode: lint: run_subset: ["safety", "sanity", "catastrophic typography (double spaces, broken links)"] downgrade_some_must_to_warn: true typeset: use_fallback_tokens: true disable_aggressive_hyphenation: true postrender: run_core_gates_only: ["overfull_lines", "table_overflow_incidents", "code_overflow_incidents"] reporting: always_emit: - "layout-report.json" - "coverage-report.json" - "degraded-mode-report.json (what was inferred and why)" --- END FILE --- --- FILE: spec/profiles/web_pdf.yaml --- profile_id: "web_pdf" description: "Screen-first PDF for sharing and reading; conservative pagination and strong accessibility defaults." 
page: size: "Letter" orientation: "portrait" two_sided: false margins: top: "22mm" bottom: "22mm" inner: "20mm" outer: "20mm" fonts: body: family: ["Noto Serif", "STIX Two Text", "Times New Roman", "serif"] size: "11pt" line_height: 1.45 heading: family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"] mono: family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"] size: "10pt" line_height: 1.35 measure_targets: columns: 1 body_chars_per_line: min: 55 ideal: 66 max: 75 footnote_chars_per_line: min: 50 ideal: 60 max: 70 hyphenation: enabled: true strategy: "balanced" language_driven: true min_left: 2 min_right: 3 max_consecutive_hyphenated_lines: 2 avoid_proper_names_when_possible: true avoid_after_short_lines: true paragraphs: first_paragraph_indent: "0" indent: "1em" block_paragraph_spacing: "0.6em" headings: keep_with_next_lines: 2 avoid_stranded_headings: true numbering: enabled: true style: "decimal" require_monotonic_increase: true widows_orphans: widow_lines: 2 orphan_lines: 2 balance_facing_pages: false code: inline: use_mono: true block: font_size: "9.5pt" line_height: 1.35 wrap: true max_wrap_penalty: "medium" overflow_policy: "wrap_then_shrink_minor" shrink_limit: 0.92 tables: cell_padding: "3pt 6pt" header_repeat: true overflow_policy: "shrink_then_wrap" shrink_limit: 0.9 severity_overrides: * selector: { category: "layout", tag: "widows_orphans" } severity: "should" * selector: { category: "accessibility" } severity: "must" locale_defaults: primary_language: "en" fallback_languages: ["fr"] quotation_style: "us" # curly quotes, US punctuation conventions date_format: "YYYY-MM-DD" number_format: decimal_separator: "." thousands_separator: "," --- END FILE --- --- FILE: spec/profiles/print_pdf.yaml --- profile_id: "print_pdf" description: "Print-oriented PDF with stricter pagination, book-like rhythm, and stronger keep constraints." 
page: size: "6in×9in" orientation: "portrait" two_sided: true margins: top: "18mm" bottom: "20mm" inner: "22mm" outer: "18mm" fonts: body: family: ["STIX Two Text", "Noto Serif", "Georgia", "serif"] size: "10.5pt" line_height: 1.50 heading: family: ["STIX Two Text", "Noto Serif", "serif"] mono: family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"] size: "9.5pt" line_height: 1.30 measure_targets: columns: 1 body_chars_per_line: min: 55 ideal: 66 max: 72 hyphenation: enabled: true strategy: "print_quality" min_left: 2 min_right: 3 max_consecutive_hyphenated_lines: 2 avoid_proper_names_when_possible: true paragraphs: first_paragraph_indent: "0" indent: "1em" headings: keep_with_next_lines: 3 avoid_stranded_headings: true numbering: enabled: true style: "decimal" require_monotonic_increase: true widows_orphans: widow_lines: 2 orphan_lines: 2 balance_facing_pages: true code: block: font_size: "9pt" line_height: 1.25 wrap: false overflow_policy: "shrink_then_scroll_indicator" shrink_limit: 0.90 tables: cell_padding: "2.5pt 5pt" header_repeat: true overflow_policy: "shrink_then_rotate_if_allowed" shrink_limit: 0.88 severity_overrides: * selector: { category: "layout", tag: "widows_orphans" } severity: "must" * selector: { category: "layout", tag: "keep_constraints" } severity: "must" * selector: { category: "typography", tag: "spacing_consistency" } severity: "must" locale_defaults: primary_language: "en" fallback_languages: ["fr"] quotation_style: "us" date_format: "Month D, YYYY" number_format: decimal_separator: "." thousands_separator: "," --- END FILE --- --- FILE: spec/profiles/dense_tech.yaml --- profile_id: "dense_tech" description: "Technical papers and specs: denser copy, more code/table tolerance, strict numbering and citations." 
page: size: "A4" orientation: "portrait" two_sided: false margins: top: "18mm" bottom: "18mm" inner: "18mm" outer: "18mm" fonts: body: family: ["Noto Serif", "STIX Two Text", "serif"] size: "10pt" line_height: 1.35 heading: family: ["Noto Sans", "Source Sans 3", "sans-serif"] mono: family: ["Noto Sans Mono", "Source Code Pro", "monospace"] size: "9pt" line_height: 1.25 measure_targets: columns: 1 body_chars_per_line: min: 65 ideal: 75 max: 90 hyphenation: enabled: true strategy: "balanced" min_left: 2 min_right: 3 max_consecutive_hyphenated_lines: 3 avoid_proper_names_when_possible: true headings: keep_with_next_lines: 2 avoid_stranded_headings: true numbering: enabled: true style: "decimal" require_monotonic_increase: true widows_orphans: widow_lines: 2 orphan_lines: 2 balance_facing_pages: false code: block: font_size: "8.8pt" line_height: 1.20 wrap: true overflow_policy: "wrap_then_shrink_minor" shrink_limit: 0.90 tables: cell_padding: "2pt 4pt" header_repeat: true overflow_policy: "shrink_then_wrap" shrink_limit: 0.85 severity_overrides: * selector: { category: "citations" } severity: "must" * selector: { category: "headings", tag: "numbering" } severity: "must" * selector: { category: "layout", tag: "widows_orphans" } severity: "should" locale_defaults: primary_language: "en" fallback_languages: ["fr"] quotation_style: "us" date_format: "YYYY-MM-DD" number_format: decimal_separator: "." thousands_separator: "," --- END FILE --- --- FILE: spec/profiles/memo.yaml --- profile_id: "memo" description: "Short internal documents: lenient pagination, strong clarity, minimal typographic complexity." 
page: size: "Letter" orientation: "portrait" two_sided: false margins: top: "1in" bottom: "1in" inner: "1in" outer: "1in" fonts: body: family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"] size: "11pt" line_height: 1.40 heading: family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"] mono: family: ["Noto Sans Mono", "Consolas", "monospace"] size: "10pt" line_height: 1.30 measure_targets: columns: 1 body_chars_per_line: min: 55 ideal: 70 max: 85 hyphenation: enabled: false strategy: "off_for_memos" headings: keep_with_next_lines: 2 avoid_stranded_headings: true numbering: enabled: false widows_orphans: widow_lines: 1 orphan_lines: 1 balance_facing_pages: false code: block: font_size: "9.5pt" line_height: 1.25 wrap: true overflow_policy: "wrap" shrink_limit: 1.0 tables: cell_padding: "3pt 6pt" header_repeat: false overflow_policy: "wrap" shrink_limit: 1.0 severity_overrides: * selector: { category: "layout", tag: "widows_orphans" } severity: "warn" * selector: { category: "accessibility" } severity: "must" locale_defaults: primary_language: "en" fallback_languages: ["fr"] quotation_style: "us" date_format: "YYYY-MM-DD" number_format: decimal_separator: "." thousands_separator: "," --- END FILE --- --- FILE: spec/profiles/slide_deck.yaml --- profile_id: "slide_deck" description: "Paged slides (16:9). Emphasis on hierarchy, short lines, and avoiding overflows." 
page: size: "13.333in×7.5in" # 16:9 at common PPT dimensions orientation: "landscape" two_sided: false margins: top: "0.5in" bottom: "0.5in" inner: "0.6in" outer: "0.6in" fonts: body: family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"] size: "24pt" line_height: 1.15 heading: family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"] mono: family: ["Noto Sans Mono", "Consolas", "monospace"] size: "20pt" line_height: 1.10 measure_targets: columns: 1 body_chars_per_line: min: 25 ideal: 40 max: 55 hyphenation: enabled: false strategy: "off_for_slides" headings: keep_with_next_lines: 1 avoid_stranded_headings: true numbering: enabled: false widows_orphans: widow_lines: 1 orphan_lines: 1 balance_facing_pages: false code: block: font_size: "18pt" line_height: 1.10 wrap: true overflow_policy: "wrap_then_shrink_minor" shrink_limit: 0.92 tables: cell_padding: "6pt 10pt" header_repeat: false overflow_policy: "shrink_then_wrap" shrink_limit: 0.88 severity_overrides: * selector: { category: "layout", tag: "overflow" } severity: "must" * selector: { category: "accessibility" } severity: "must" locale_defaults: primary_language: "en" fallback_languages: ["fr"] quotation_style: "us" date_format: "YYYY-MM-DD" number_format: decimal_separator: "." thousands_separator: "," --- END FILE --- --- FILE: spec/quality_gates.yaml --- version: "0.1.0" description: > Post-render QA gates. All thresholds are hard numeric limits used to fail builds (unless a gate is explicitly marked as "warn-only" by the invoking CLI flags). metrics: max_widows_per_10_pages: "Count of widow lines across any 10 consecutive pages." max_orphans_per_10_pages: "Count of orphan lines across any 10 consecutive pages." max_stranded_headings: "Count of headings at page bottom with insufficient following content per keep rule." max_overfull_lines: "Count of lines exceeding measure by overflow threshold (render-time measured)." 
max_table_overflow_incidents: "Count of tables that overflow page/column bounds or are clipped." max_code_overflow_incidents: "Count of code blocks with horizontal overflow or clipping." max_link_wrap_incidents: "Count of wrapped URLs/DOIs/emails violating link wrap policy." max_heading_numbering_errors: "Count of numbering sequence/format violations." max_citation_format_errors: "Count of citations not matching configured style format." overflow_detection: overfull_line_threshold_css_px: 1.0 consider_clipping_as_overflow: true ignore_decorative_elements: true profiles: web_pdf: default: max_widows_per_10_pages: 1 max_orphans_per_10_pages: 1 max_stranded_headings: 0 max_overfull_lines: 2 max_table_overflow_incidents: 0 max_code_overflow_incidents: 1 max_link_wrap_incidents: 2 max_heading_numbering_errors: 0 max_citation_format_errors: 0 strict: max_widows_per_10_pages: 0 max_orphans_per_10_pages: 0 max_stranded_headings: 0 max_overfull_lines: 0 max_table_overflow_incidents: 0 max_code_overflow_incidents: 0 max_link_wrap_incidents: 0 max_heading_numbering_errors: 0 max_citation_format_errors: 0 print_pdf: default: max_widows_per_10_pages: 0 max_orphans_per_10_pages: 0 max_stranded_headings: 0 max_overfull_lines: 0 max_table_overflow_incidents: 0 max_code_overflow_incidents: 0 max_link_wrap_incidents: 0 max_heading_numbering_errors: 0 max_citation_format_errors: 0 strict: max_widows_per_10_pages: 0 max_orphans_per_10_pages: 0 max_stranded_headings: 0 max_overfull_lines: 0 max_table_overflow_incidents: 0 max_code_overflow_incidents: 0 max_link_wrap_incidents: 0 max_heading_numbering_errors: 0 max_citation_format_errors: 0 dense_tech: default: max_widows_per_10_pages: 1 max_orphans_per_10_pages: 1 max_stranded_headings: 0 max_overfull_lines: 3 max_table_overflow_incidents: 1 max_code_overflow_incidents: 2 max_link_wrap_incidents: 3 max_heading_numbering_errors: 0 max_citation_format_errors: 0 strict: max_widows_per_10_pages: 0 max_orphans_per_10_pages: 0 
max_stranded_headings: 0 max_overfull_lines: 1 max_table_overflow_incidents: 0 max_code_overflow_incidents: 0 max_link_wrap_incidents: 1 max_heading_numbering_errors: 0 max_citation_format_errors: 0 memo: default: max_widows_per_10_pages: 3 max_orphans_per_10_pages: 3 max_stranded_headings: 0 max_overfull_lines: 2 max_table_overflow_incidents: 1 max_code_overflow_incidents: 1 max_link_wrap_incidents: 4 max_heading_numbering_errors: 1 max_citation_format_errors: 1 strict: max_widows_per_10_pages: 1 max_orphans_per_10_pages: 1 max_stranded_headings: 0 max_overfull_lines: 0 max_table_overflow_incidents: 0 max_code_overflow_incidents: 0 max_link_wrap_incidents: 2 max_heading_numbering_errors: 0 max_citation_format_errors: 0 slide_deck: default: max_widows_per_10_pages: 5 max_orphans_per_10_pages: 5 max_stranded_headings: 0 max_overfull_lines: 0 max_table_overflow_incidents: 0 max_code_overflow_incidents: 0 max_link_wrap_incidents: 0 max_heading_numbering_errors: 0 max_citation_format_errors: 1 strict: max_widows_per_10_pages: 2 max_orphans_per_10_pages: 2 max_stranded_headings: 0 max_overfull_lines: 0 max_table_overflow_incidents: 0 max_code_overflow_incidents: 0 max_link_wrap_incidents: 0 max_heading_numbering_errors: 0 max_citation_format_errors: 0 --- END FILE --- --- FILE: spec/indexes/README.md --- # Indexes This project builds small, fast indexes so the runtime can answer questions like: * “Which rules mention *en dash*?” * “Which rules cite *CMOS18 §6.88 p412*?” * “Which rules apply to `postrender` QA?” * “What rules are overridden by the `print_pdf` profile?” Indexes are derived artifacts (rebuildable) and should not be hand-edited. ## Indexes the app will build ### 1) keyword → rule IDs **Purpose:** fast search/autocomplete and lint explanations. 
* **Path:** `spec/indexes/keywords_all.json` and per-category deltas: * `spec/indexes/keywords_<category>.json` * **Format (JSON):** * keys: normalized keyword (lowercased) * values: array of rule IDs in stable lexicographic order Normalization (default): * Unicode NFKC * lowercase * collapse whitespace * strip surrounding punctuation ### 2) source_ref → rule IDs **Purpose:** audit trail back to references without embedding book text. * **Path:** `spec/indexes/source_refs_all.json` and per-category deltas: * `spec/indexes/source_refs_<category>.json` * **Format (JSON):** * keys: exact `source_ref` pointer strings * values: array of rule IDs ### 3) category → rule IDs **Purpose:** batch reporting, extraction coverage, profile scoping. * **Path:** `spec/indexes/category.json` * **Format (JSON):** * keys: category name * values: array of rule IDs ### 4) enforcement → rule IDs **Purpose:** quickly decide which engine (lint/typeset/postrender/manual) handles which rules. * **Path:** `spec/indexes/enforcement.json` ### 5) profile overrides **Purpose:** allow profiles to override severity or token parameters without editing rules. * **Path:** `spec/indexes/profile_overrides.json` * **Format (JSON):** * per profile: list of override objects (selector + action) * selectors may match category, tags, applies_to, or explicit rule IDs ## Build guarantees * Index builds are deterministic from: * `spec/rules/**.ndjson` * `spec/profiles/*.yaml` * `spec/manifest.yaml` * The runtime must treat indexes as **cacheable**: * if index missing/outdated → rebuild or fallback to scanning rule files. --- END FILE --- --- FILE: spec/examples/README.md --- # Examples Rules stay compact and machine-enforceable; examples live separately to avoid bloating the rule registry. 
## Goals * Provide **concrete fixtures** for: * unit tests (lint, autofix, typeset transforms) * integration tests (render + QA gates) * documentation (human-readable “why this matters”) * Keep examples **small** (a few lines) and **targeted** (each example triggers a known set of rules). ## Example ID format `EX.<CATEGORY>.<TOPIC>.<NNN>` * `CATEGORY` must match the category taxonomy (e.g., `PUNCTUATION`, `NUMBERS`, `CITATIONS`) * `TOPIC` is an uppercase short slug * `NNN` is a zero-padded integer (000–999+) Example: * `EX.PUNCTUATION.DASHES.001` ## Suggested on-disk layout * `spec/examples/<category>/EX.<CATEGORY>.<TOPIC>.<NNN>.yaml` * `spec/examples/<category>/fixtures/<fixture>.md` (optional) ## Example YAML format (recommended) Fields: * `id` (required): example ID * `rules` (required): list of rule IDs the example is meant to exercise * `before` (required): inline Markdown or a reference to a fixture file * `after` (optional): expected Markdown after autofix (if autofix exists) * `expected` (optional): expected diagnostics/gates * `lint_errors`: array of rule IDs expected as errors * `lint_warnings`: array of rule IDs expected as warnings * `qa_failures`: array of gate keys expected to fail * `notes` (optional): short human explanation (no book quotes) Minimal example skeleton: - id: EX.PUNCTUATION.DASHES.001 rules: - CMOS.PUNCTUATION.DASHES.EM_DASH before: | ... after: | ... expected: lint_errors: [CMOS.PUNCTUATION.DASHES.EM_DASH] ## Test corpus strategy Maintain a small, curated corpus that triggers: 1. Lint-only issues (AST-level) * punctuation spacing * numeral formatting * heading numbering patterns * link normalization / unsafe URLs * citation field completeness 2. Typeset-only issues (token/CSS decisions) * paragraph indentation patterns * code block wrapping rules * table overflow strategies 3. 
Post-render QA issues (PDF/HTML layout) * widows/orphans * stranded headings (keep-with-next) * overfull lines (especially monospace/code) * table/caption overflow and clipping Recommended corpus sizing: * 30–80 fixtures total * each fixture should target 3–10 rules max * include “degraded mode” fixtures (intentionally malformed Markdown) --- END FILE --- --- FILE: app/ARCHITECTURE.md --- # Runtime Architecture This is a thin, deterministic runtime that: A) ingests Markdown → normalizes a document AST → applies editorial lint (Chicago-derived) B) applies typeset tokens/profile (Bringhurst-derived) C) renders HTML and PDF deterministically D) runs post-render QA gates (widows/orphans, heading keeps, overflow) E) generates `layout-report.json` and fails builds when thresholds are exceeded Primary reference PDFs provided to the system (for pointer-based rules and traceability only): * The Chicago Manual of Style (18th ed).pdf * Robert Bringhurst – The Elements of Typographic Style.pdf No bulk transcription is performed; rules are paraphrases and cite sources only by pointer. ## Components ### 1) Registry Loader Inputs: * `spec/rules/**.ndjson` (Phase 2 output) * `spec/schema/rule.schema.json` * `spec/manifest.yaml` * `spec/profiles/*.yaml` * `spec/quality_gates.yaml` Responsibilities: * validate each rule against JSON Schema * enforce ID uniqueness and stable sorting * build or load indexes in `spec/indexes/*.json` * compute coverage (implemented vs unimplemented; by enforcement) Output (in-memory): * `RuleStore` (rules + indexes + profile overrides + gate thresholds) ### 2) Markdown Ingest + AST Normalization Steps: 1. Parse Markdown to an AST (mdast or equivalent). 2. Normalize to a stable internal schema: * heading levels and numbering metadata * lists and list tight/loose semantics * code spans/blocks with language tags * tables (GFM) to a consistent representation * links normalized (url, title, text) * citations normalized (if present as syntax/extensions) 3. 
Produce `normalized-doc.json` for debugging reproducibility. Degraded mode: * If parsing fails or structure is missing, switch to minimal node set and mark `structure_confidence: low`. * Run the “degraded mode contract” from `spec/manifest.yaml`. ### 3) Editorial Lint Engine What it does: * Runs `lint`-enforced rules against normalized AST. * Emits diagnostics: * `severity` (must/should/warn after profile overrides) * `rule_id` * location (source span) and node path * message (generated from rule metadata + implementation hints) Autofix: * If a rule’s `autofix` is `rewrite` or `suggest`, produce: * patched Markdown (rewrite) OR * suggestion blocks with exact spans to edit (suggest) * Autofix must be deterministic and reversible (keep a patch log). Artifacts: * `lint-report.json` * `lint-report.sarif` (optional for CI UIs) * `lint-fixed.md` (optional, if autofix applied) ### 4) Typeset Profile Engine Goal: * Convert “typographic intent” into deterministic render inputs: * CSS tokens (variables) * layout policies (widows/orphans strategy, keeps, hyphenation params) * code/table overflow strategies Inputs: * normalized AST * profile tokens from `spec/profiles/.yaml` Outputs: * `render.css` (tokenized CSS + paged-media rules) * `render.html` (deterministic HTML with stable classnames/data attributes) * `typeset-report.json` (what tokens were used, resolved font stack, measure targets) Design principle: * “Soft rules” are tokens; “hard rules” are lint/QA gates. ### 5) Deterministic Rendering The runtime should treat rendering as an adapter layer. Minimum requirement: * Deterministic HTML generation (stable DOM order, stable IDs, stable whitespace). * Deterministic PDF generation with pinned renderer/version and embedded fonts when possible. Adapter concept: * `RendererHTML`: emits HTML+CSS. * `RendererPDF`: converts HTML+CSS to PDF using a configured engine. 
Recommended renderer capabilities: * CSS Paged Media support (page size/margins, running headers, footnotes if used) * hyphenation dictionaries * font embedding/subsetting Artifacts: * `out/.html` * `out/.pdf` * `out/render-log.json` (versions, timings, warnings) ### 6) Post-render QA Analyzer Runs on: * PDF (preferred for final layout truth) and optionally HTML. Detects: * widows/orphans (by paragraph line runs across pages) * stranded headings (heading at bottom violating keep-with-next) * overfull lines (glyph boxes exceed text block) * table overflow/clipping * code overflow/clipping * link wrap incidents (URLs/DOIs split against policy) * heading numbering errors (cross-check against AST numbering) * citation format errors (cross-check against configured citation style) Artifacts: * `layout-report.json` (the canonical QA report) * `qa-report.json` (gate evaluation + failures + excerpts as coordinates, not text) Fail behavior: * Compare measured metrics to `spec/quality_gates.yaml` for the chosen profile. * Exit non-zero if any MUST-equivalent gate fails (or if `--strict` chosen, strict thresholds apply). ## Coverage Reporting and CI Guardrails Coverage is computed from: * total active rules * rules with an implemented enforcement handler: * lint implemented if rule_id has an evaluator in lint engine * typeset implemented if token/policy exists and is applied deterministically * postrender implemented if analyzer has a detector for that rule/tag * manual implemented if checklist output includes it Artifacts: * `coverage-report.json` (counts by category, enforcement, severity, profile) * `coverage-diff.json` (compares to baseline on main branch) CI policy (from manifest): * fail if MUST coverage drops * fail if overall implemented coverage drops * fail if rule IDs changed without deprecation mapping ## Assumptions (Phase 1 defaults) 1. The pipeline targets a CSS-based HTML→PDF engine (paged media capable). 2. Default language is `en` with optional `fr` fallback. 
3. Citation style defaults to a Chicago-aligned style, but the registry will encode the exact variant in rules (Notes/Bibliography vs Author-Date) during extraction. 4. Fonts default to Noto/STIX families for broad coverage and consistent embedding, but can be overridden per profile. No questions are strictly required to proceed with Phase 2 extraction; these assumptions can be adjusted via profiles and house rules. --- END FILE --- --- FILE: app/CLI_SPEC.md --- # CLI Specification The CLI is designed for CI use: deterministic outputs, stable exit codes, and JSON artifacts for tooling. ## Common flags (all commands) * `--input `: Markdown file or directory. * `--out `: Output directory (default: `out/`). * `--rules `: Rules root directory (default: `spec/rules/`). * `--profile `: One of: `web_pdf`, `print_pdf`, `dense_tech`, `memo`, `slide_deck`. * `--strict`: Use strict thresholds in `spec/quality_gates.yaml`. * `--format `: Diagnostic output format (where applicable). * `--fail-on `: Lowest severity that fails the command (default: `must`). * `--degraded-ok`: Allow degraded mode without failing (still emits degraded-mode report). * `--version`: Print tool + renderer versions. ## Command: `lint` Purpose: * Parse Markdown → normalize AST → run lint rules. * Optionally apply autofixes. Args: * `--fix`: Apply autofix where `autofix != none` and safe. * `--fix-mode `: Whether to rewrite output Markdown or emit suggestions only. * `--baseline `: Compare diagnostics to an existing lint report and show diff. Outputs: * `out/lint-report.json` * `out/lint-report.sarif` (if `--format sarif`) * `out/lint-fixed.md` (if `--fix` and `--fix-mode rewrite`) * `out/manual-checklist.md` (includes manual rules tagged `manual_checklist=true`) Exit codes: * `0`: no failing diagnostics * `1`: lint failures at or above `--fail-on` * `4`: config/schema error * `5`: internal error ## Command: `render-html` Purpose: * Generate deterministic HTML + CSS from normalized AST + profile tokens. 
Args: * `--emit-normalized`: also write `normalized-doc.json` * `--assets <dir>`: static assets dir (images, fonts, etc.) * `--self-contained`: embed assets in HTML where possible Outputs: * `out/render.html` * `out/render.css` * `out/typeset-report.json` * `out/normalized-doc.json` (optional) Exit codes: * `0`: success * `3`: render error * `4`: config/schema error * `5`: internal error ## Command: `render-pdf` Purpose: * Render PDF deterministically from HTML + CSS + assets. Args: * `--engine <name>`: renderer adapter selection (implementation-defined) * `--engine-opts <opts>`: pass-through engine options * `--keep-html`: keep intermediate HTML/CSS even if PDF fails Outputs: * `out/render.pdf` * `out/render.html` + `out/render.css` (always or if `--keep-html`) * `out/render-log.json` Exit codes: * `0`: success * `3`: render error * `4`: config/schema error * `5`: internal error ## Command: `qa` Purpose: * Run post-render QA analysis and evaluate quality gates. Args: * `--pdf <path>`: PDF to analyze (default: `out/render.pdf`) * `--html <path>`: optional HTML for cross-checks * `--gates <path>`: override gates file (default: `spec/quality_gates.yaml`) Outputs: * `out/layout-report.json` * `out/qa-report.json` Exit codes: * `0`: all gates pass * `2`: gates failed (at or above `--fail-on` / strictness) * `4`: config/schema error * `5`: internal error ## Command: `report` Purpose: * Produce a consolidated report: * coverage (implemented vs unimplemented) * diffs vs baseline * per-category enforcement breakdown Args: * `--baseline <path>`: baseline coverage report to diff against * `--since <ref>`: optionally compute diffs since a git ref (implementation-defined) Outputs: * `out/coverage-report.json` * `out/coverage-diff.json` (if baseline provided) * `out/coverage-summary.md` Exit codes: * `0`: report built and coverage passes configured floors * `2`: coverage floor violated * `4`: config/schema error * `5`: internal error --- END FILE --- --- FILE: spec/extraction_plan.md --- # Phase 2 Extraction Plan This plan 
defines how rules will be produced in controlled batches without reproducing the books. ## Non-negotiables (carried into Phase 2) * No full-book OCR/transcription. * No long verbatim passages. * Rules are paraphrased and capped (`rule_text` ≤ 800 chars). * Every rule includes at least one source pointer in `source_refs`. * If a rule depends on exact wording, the rule still paraphrases but must include: * `rule_text`: “Exact wording required—refer to pointer” * plus a usable pointer. Primary reference PDFs for pointer extraction: * The Chicago Manual of Style (18th ed).pdf * Robert Bringhurst – The Elements of Typographic Style.pdf ## Output batching format When you say: `EXTRACT <category> [<scope>]` I will output a bundle that includes: 1. **Rules NDJSON** (150–250 rule records) * Path: `spec/rules/<category>/<batch_id>.ndjson` * One JSON object per line, validated against `spec/schema/rule.schema.json`. 2. **Index deltas** for that category * `spec/indexes/keywords_<category>.json` * `spec/indexes/source_refs_<category>.json` * `spec/indexes/coverage_delta_<category>.json` 3. **Coverage notes** report * A short Markdown report describing enforcement split: * lint vs typeset vs postrender vs manual * plus any known gaps or “manual-only” areas ## Batch naming `<batch_id>` format: * `v1_<category>_<nnn>` * e.g., `v1_punctuation_001` Batches are append-only: * If rules need revision, mark old rule `deprecated`, add a new rule ID (or new version segment) and keep both records. ## Pointer scheme details Pointer strings live in `source_refs[]` and are **not** quotes. Preferred pointer format: * `CMOS18 §<section> p<book_page>`
* `BRING §<section> p<book_page>`
* Optional disambiguation: `(scan p<pdf_page>)` Example pattern (not a quote): * `CMOS18 §6.1 p377 (scan p10)` Notes: * “book_page” uses the printed page number in the book when present (arabic or roman). * “scan p” uses the PDF page index when printed page numbers are ambiguous. ## Recommended extraction order (high-impact first) 1. numbers 2. punctuation 3. citations 4. headings 5. tables 6. figures 7. links 8. code 9. layout (widows/orphans, keeps, overflow) 10. front/back matter 11. accessibility 12. i18n Rationale: * Numbers/punctuation/citations most directly affect correctness, consistency, and auditability. * Layout rules benefit from having structure and tokens in place. ## Scope parameter `[<scope>]` can constrain extraction, e.g.: * `EXTRACT punctuation basic` * `EXTRACT citations notes_bibliography` * `EXTRACT numbers en_only` * `EXTRACT layout widows_orphans` If scope is omitted: * extract the most generally applicable rules for that category first. ## Enforcement mapping guidelines (honest labeling) * `lint`: detectable from AST or text normalization (e.g., spacing, punctuation patterns, citation fields). * `typeset`: enforced via CSS/tokens/paged-media decisions (e.g., indent policy, measure targets, hyphenation params). * `postrender`: requires layout inspection after rendering (e.g., widows/orphans, overfull lines, table clipping). * `manual`: cannot be reliably automated; must include `tags: ["manual_checklist=true"]` and be emitted into checklist outputs. If a concept spans multiple enforcement layers: * Prefer splitting into two rules: * one lint rule (source cleanliness) * one postrender rule (layout outcome) * Use `dependencies` to link them. ## Extraction workflow per category (repeatable) For each category: 1. Build a topic map (subtopics, recurring failure modes). 2. Extract rules in clusters: * MUST rules first (enforceable or checklist) * SHOULD rules next * WARN rules last 3. 
For each rule: * add `keywords` for searchability * add tags for overrides/routing * add minimal `exceptions` when needed (avoid overfitting) ## “Degraded mode” considerations during extraction For each category batch, include some rules that specifically target degraded inputs: * hard-wrap repair suggestions * heading inference warnings * link sanitation and encoding fixes * Unicode normalization notes These rules should generally be `warn` or `should`, unless they prevent corruption (then `must`). --- END FILE ---