forgejo-pdf/pub-style-skeleton.txt

--- FILE: spec/schema/rule.schema.json ---
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://example.invalid/pubstyle/spec/schema/rule.schema.json",
  "title": "Publication-quality rule record",
  "type": "object",
  "additionalProperties": false,
  "required": [
    "id",
    "title",
    "source_refs",
    "category",
    "severity",
    "applies_to",
    "rule_text",
    "rationale",
    "enforcement",
    "autofix",
    "autofix_notes",
    "tags",
    "keywords",
    "dependencies",
    "exceptions",
    "status"
  ],
  "properties": {
    "id": {
      "type": "string",
      "description": "Stable rule identifier. Prefix must be one of CMOS, BRING, HOUSE.",
      "minLength": 6,
      "maxLength": 120,
      "pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$"
    },
    "title": {
      "type": "string",
      "description": "Short human-readable rule title.",
      "minLength": 4,
      "maxLength": 160
    },
    "source_refs": {
      "type": "array",
      "description": "Pointers back to sources. Must be pointers, not quotes. Prefer: \"CMOS18 §X.Y pN\" / \"BRING §X.Y pN\" / \"HOUSE §X.Y pN\".",
      "minItems": 1,
      "items": {
        "type": "string",
        "minLength": 8,
        "maxLength": 120,
        "pattern": "^(CMOS18|BRING|HOUSE)\\s§[0-9A-Za-z][0-9A-Za-z.\\-]*\\sp[0-9ivxlcdmIVXLCDM]+(?:-[0-9ivxlcdmIVXLCDM]+)?(?:\\s\\(scan p[0-9]+\\))?$"
      }
    },
    "category": {
      "type": "string",
      "description": "Primary taxonomy bucket.",
      "enum": [
        "editorial",
        "typography",
        "layout",
        "headings",
        "citations",
        "numbers",
        "punctuation",
        "abbreviations",
        "links",
        "tables",
        "figures",
        "code",
        "frontmatter",
        "backmatter",
        "accessibility",
        "i18n"
      ]
    },
    "severity": {
      "type": "string",
      "description": "Normativity level. MUST blocks release unless downgraded by profile.",
      "enum": ["must", "should", "warn"]
    },
    "applies_to": {
      "type": "string",
      "description": "Which pipeline stage(s) the rule targets.",
      "enum": ["md", "html", "pdf", "all"]
    },
    "rule_text": {
      "type": "string",
      "description": "Paraphrased rule statement (no long quotes). If exact wording matters, note: \"Exact wording required—refer to pointer\".",
      "minLength": 10,
      "maxLength": 800
    },
    "rationale": {
      "type": "string",
      "description": "One-line rationale.",
      "minLength": 5,
      "maxLength": 200
    },
    "enforcement": {
      "type": "string",
      "description": "Primary enforcement mechanism.",
      "enum": ["lint", "typeset", "postrender", "manual"]
    },
    "autofix": {
      "type": "string",
      "description": "Autofix capability, if any.",
      "enum": ["none", "rewrite", "reflow", "suggest"]
    },
    "autofix_notes": {
      "type": "string",
      "description": "Notes describing what can be fixed and how/when. Keep short; never include book quotes.",
      "maxLength": 400
    },
    "tags": {
      "type": "array",
      "description": "Compact labels for routing/search/overrides (e.g., 'manual_checklist=true', 'widows_orphans', 'hyphenation').",
      "items": {
        "type": "string",
        "minLength": 1,
        "maxLength": 48,
        "pattern": "^[a-z0-9][a-z0-9_.:\\-/]*(?:=[a-z0-9_.:\\-/]+)?$"
      },
      "maxItems": 64
    },
    "keywords": {
      "type": "array",
      "description": "Search keywords (human-oriented; not necessarily normalized).",
      "items": {
        "type": "string",
        "minLength": 2,
        "maxLength": 48
      },
      "maxItems": 64
    },
    "dependencies": {
      "type": "array",
      "description": "Rule IDs that should be applied/understood first.",
      "items": {
        "type": "string",
        "pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$"
      },
      "maxItems": 32
    },
    "exceptions": {
      "type": "array",
      "description": "Free-text exceptions/caveats. Keep concise.",
      "items": {
        "type": "string",
        "minLength": 3,
        "maxLength": 240
      },
      "maxItems": 32
    },
    "examples_ref": {
      "type": "array",
      "description": "Optional references to separately stored examples (see spec/examples/README.md).",
      "items": {
        "type": "string",
        "minLength": 6,
        "maxLength": 80,
        "pattern": "^EX\\.[A-Z0-9_]+\\.[A-Z0-9_]+\\.[0-9]{3,}$"
      },
      "maxItems": 64
    },
    "implementation_notes": {
      "type": "string",
      "description": "Optional short notes for implementers (no quotes).",
      "minLength": 3,
      "maxLength": 600
    },
    "status": {
      "type": "string",
      "description": "Lifecycle state.",
      "enum": ["draft", "active", "deprecated"]
    }
  },
  "allOf": [
    {
      "if": {
        "properties": {
          "autofix": { "enum": ["rewrite", "reflow", "suggest"] }
        },
        "required": ["autofix"]
      },
      "then": {
        "properties": {
          "autofix_notes": { "minLength": 1 }
        }
      }
    }
  ]
}
--- END FILE ---

--- FILE: spec/manifest.yaml ---
# Registry manifest for the "pubstyle" rule set: ID naming, source-pointer
# scheme, category taxonomy, planned counts, coverage contract, and the
# degraded-mode contract.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been flattened. Confirm the placement of `id_naming.examples`,
# `coverage_contract.enforcement_definitions`, `coverage_contract.ci_guardrails`,
# and the children of `degraded_mode_contract.behavior` against the loader.
version: "0.1.0"
registry_id: "pubstyle"
description: >
  Machine-readable style+typesetting rules for a Markdown→HTML→PDF pipeline,
  backed by primary references (Chicago / Bringhurst) and optional house rules.
  Rules are paraphrases only; sources are referenced by pointer strings.

id_naming:
  prefixes:
    CMOS: "Editorial/style usage rules derived primarily from Chicago."
    BRING: "Typographic/layout rules derived primarily from Bringhurst."
    HOUSE: "Project-specific rules not directly sourced to Chicago/Bringhurst."
  pattern: "PREFIX.DOMAIN.TOPIC[.SUBTOPIC[.DETAIL...]]"
  delimiter: "."
  casing: "UPPER_SNAKE for segments"
  stability:
    rule_ids_are_immutable: true
    rename_policy: "Deprecate old id; introduce new id; keep mapping in report diffs."
  examples:
    - "CMOS.PUNCTUATION.DASHES.EM_DASH"
    - "BRING.LAYOUT.WIDOWS_ORPHANS.AVOID"
    - "HOUSE.CITATIONS.DOI.PREFER_HTTPS"

source_pointer_scheme:
  goal: "Provide auditable traceability without reproducing sources."
  pointer_format_primary: "CMOS18 §<section> p<book_page>"
  pointer_format_secondary: "BRING §<section> p<book_page>"
  pointer_format_house: "HOUSE §<section> p<doc_page>"
  optional_scan_hint: "(scan p<pdf_page_index>)"
  allowed_page_numbering: ["arabic", "roman"]
  notes:
    - "Pointers must be sufficient for a reader with the book to locate the guidance."
    - "Never store verbatim passages; paraphrase only."
    - "If a rule depends on exact wording, rule_text must say: 'Exact wording required—refer to pointer'."

# Must stay in sync with the `category` enum in spec/schema/rule.schema.json.
category_taxonomy:
  - editorial
  - typography
  - layout
  - headings
  - citations
  - numbers
  - punctuation
  - abbreviations
  - links
  - tables
  - figures
  - code
  - frontmatter
  - backmatter
  - accessibility
  - i18n

# One file per entry is expected under spec/profiles/<name>.yaml.
profiles:
  - web_pdf
  - print_pdf
  - dense_tech
  - memo
  - slide_deck

planned_rule_counts:
  target_total_range: [800, 1500]
  target_by_category:
    editorial: 120
    typography: 170
    layout: 140
    headings: 70
    citations: 140
    numbers: 90
    punctuation: 120
    abbreviations: 60
    links: 50
    tables: 60
    figures: 50
    code: 70
    frontmatter: 40
    backmatter: 40
    accessibility: 90
    i18n: 60

coverage_contract:
  must_rules:
    enforceability_requirement: >
      Every MUST rule must be enforceable by at least one of: lint, typeset, postrender;
      otherwise it must be explicitly labeled as a manual checklist item and emitted in
      a checklist output artifact.
    manual_checklist_tag: "manual_checklist=true"
    checklist_artifact: "manual-checklist.md (and JSON mirror)"
  should_rules:
    policy: "Should rules should be enforceable when practical; otherwise allowed as manual with explicit rationale."
  warn_rules:
    policy: "Warnings may be non-blocking and advisory; still require source pointers."
  enforcement_definitions:
    lint: "Static analysis over normalized Markdown/HTML AST. Deterministic."
    typeset: "CSS/tokens shaping decisions prior to rendering (pagination, keeps, hyphenation parameters)."
    postrender: "PDF/HTML layout inspection (widows/orphans, overflow, keep failures, numbering mismatches)."
    manual: "Human review; system must still produce checklist items and traceability pointers."
  ci_guardrails:
    coverage_floor:
      must_implemented_min_percent: 95
      overall_implemented_min_percent: 80
    regression_rule: "CI fails if implemented coverage decreases from main branch."

degraded_mode_contract:
  purpose: "Handle badly-structured inputs safely without crashing; still provide useful output."
  triggers:
    - "Markdown parse errors / invalid UTF-8"
    - "Missing heading hierarchy (no H1/H2 etc.)"
    - "Garbage extraction (e.g., line breaks every word, excessive hard wraps)"
    - "Mixed language with no lang metadata"
  behavior:
    normalize:
      attempt_repairs:
        - "Normalize whitespace and line endings"
        - "Detect and unwrap hard-wrapped paragraphs heuristically"
        - "Infer heading levels from patterns (e.g., '1.', '1.1', ALL CAPS lines) with low confidence"
      if_unrecoverable:
        - "Fall back to minimal AST: paragraphs + code blocks + raw spans"
        - "Mark document structure confidence = low"
    enforcement_in_degraded_mode:
      lint:
        run_subset: ["safety", "sanity", "catastrophic typography (double spaces, broken links)"]
        downgrade_some_must_to_warn: true
      typeset:
        use_fallback_tokens: true
        disable_aggressive_hyphenation: true
      postrender:
        run_core_gates_only: ["overfull_lines", "table_overflow_incidents", "code_overflow_incidents"]
    reporting:
      always_emit:
        - "layout-report.json"
        - "coverage-report.json"
        - "degraded-mode-report.json (what was inferred and why)"
--- END FILE ---

--- FILE: spec/profiles/web_pdf.yaml ---
# Typeset profile tokens for `web_pdf`: page geometry, fonts, measure,
# hyphenation, keeps, code/table overflow policy, severity overrides, locale.
profile_id: "web_pdf"
description: "Screen-first PDF for sharing and reading; conservative pagination and strong accessibility defaults."

page:
  size: "Letter"
  orientation: "portrait"
  two_sided: false
  margins:
    top: "22mm"
    bottom: "22mm"
    inner: "20mm"
    outer: "20mm"

fonts:
  body:
    family: ["Noto Serif", "STIX Two Text", "Times New Roman", "serif"]
    size: "11pt"
    line_height: 1.45
  heading:
    family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
  mono:
    family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"]
    size: "10pt"
    line_height: 1.35

measure_targets:
  columns: 1
  body_chars_per_line:
    min: 55
    ideal: 66
    max: 75
  footnote_chars_per_line:
    min: 50
    ideal: 60
    max: 70

hyphenation:
  enabled: true
  strategy: "balanced"
  language_driven: true
  min_left: 2
  min_right: 3
  max_consecutive_hyphenated_lines: 2
  avoid_proper_names_when_possible: true
  avoid_after_short_lines: true

paragraphs:
  first_paragraph_indent: "0"
  indent: "1em"
  block_paragraph_spacing: "0.6em"

headings:
  keep_with_next_lines: 2
  avoid_stranded_headings: true
  numbering:
    enabled: true
    style: "decimal"
    require_monotonic_increase: true

widows_orphans:
  widow_lines: 2
  orphan_lines: 2
  balance_facing_pages: false

code:
  inline:
    use_mono: true
  block:
    font_size: "9.5pt"
    line_height: 1.35
    wrap: true
    max_wrap_penalty: "medium"
    overflow_policy: "wrap_then_shrink_minor"
    shrink_limit: 0.92

tables:
  cell_padding: "3pt 6pt"
  header_repeat: true
  overflow_policy: "shrink_then_wrap"
  shrink_limit: 0.9

# Each entry re-grades the severity of every rule matched by `selector`.
severity_overrides:
  - selector: { category: "layout", tag: "widows_orphans" }
    severity: "should"
  - selector: { category: "accessibility" }
    severity: "must"

locale_defaults:
  primary_language: "en"
  fallback_languages: ["fr"]
  quotation_style: "us"   # curly quotes, US punctuation conventions
  date_format: "YYYY-MM-DD"
  number_format:
    decimal_separator: "."
    thousands_separator: ","
--- END FILE ---

--- FILE: spec/profiles/print_pdf.yaml ---
# Typeset profile tokens for `print_pdf`: two-sided book-style page, stricter
# keeps, and severity overrides that promote layout rules to "must".
profile_id: "print_pdf"
description: "Print-oriented PDF with stricter pagination, book-like rhythm, and stronger keep constraints."

page:
  size: "6in×9in"
  orientation: "portrait"
  two_sided: true
  margins:
    top: "18mm"
    bottom: "20mm"
    inner: "22mm"
    outer: "18mm"

fonts:
  body:
    family: ["STIX Two Text", "Noto Serif", "Georgia", "serif"]
    size: "10.5pt"
    line_height: 1.50
  heading:
    family: ["STIX Two Text", "Noto Serif", "serif"]
  mono:
    family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"]
    size: "9.5pt"
    line_height: 1.30

measure_targets:
  columns: 1
  body_chars_per_line:
    min: 55
    ideal: 66
    max: 72

hyphenation:
  enabled: true
  strategy: "print_quality"
  min_left: 2
  min_right: 3
  max_consecutive_hyphenated_lines: 2
  avoid_proper_names_when_possible: true

paragraphs:
  first_paragraph_indent: "0"
  indent: "1em"

headings:
  keep_with_next_lines: 3
  avoid_stranded_headings: true
  numbering:
    enabled: true
    style: "decimal"
    require_monotonic_increase: true

widows_orphans:
  widow_lines: 2
  orphan_lines: 2
  balance_facing_pages: true

code:
  block:
    font_size: "9pt"
    line_height: 1.25
    wrap: false
    overflow_policy: "shrink_then_scroll_indicator"
    shrink_limit: 0.90

tables:
  cell_padding: "2.5pt 5pt"
  header_repeat: true
  overflow_policy: "shrink_then_rotate_if_allowed"
  shrink_limit: 0.88

# Each entry re-grades the severity of every rule matched by `selector`.
severity_overrides:
  - selector: { category: "layout", tag: "widows_orphans" }
    severity: "must"
  - selector: { category: "layout", tag: "keep_constraints" }
    severity: "must"
  - selector: { category: "typography", tag: "spacing_consistency" }
    severity: "must"

locale_defaults:
  primary_language: "en"
  fallback_languages: ["fr"]
  quotation_style: "us"
  date_format: "Month D, YYYY"
  number_format:
    decimal_separator: "."
    thousands_separator: ","
--- END FILE ---

--- FILE: spec/profiles/dense_tech.yaml ---
# Typeset profile tokens for `dense_tech`: A4, smaller type, wider measure,
# and severity overrides that make citations and heading numbering "must".
profile_id: "dense_tech"
description: "Technical papers and specs: denser copy, more code/table tolerance, strict numbering and citations."

page:
  size: "A4"
  orientation: "portrait"
  two_sided: false
  margins:
    top: "18mm"
    bottom: "18mm"
    inner: "18mm"
    outer: "18mm"

fonts:
  body:
    family: ["Noto Serif", "STIX Two Text", "serif"]
    size: "10pt"
    line_height: 1.35
  heading:
    family: ["Noto Sans", "Source Sans 3", "sans-serif"]
  mono:
    family: ["Noto Sans Mono", "Source Code Pro", "monospace"]
    size: "9pt"
    line_height: 1.25

measure_targets:
  columns: 1
  body_chars_per_line:
    min: 65
    ideal: 75
    max: 90

hyphenation:
  enabled: true
  strategy: "balanced"
  min_left: 2
  min_right: 3
  max_consecutive_hyphenated_lines: 3
  avoid_proper_names_when_possible: true

headings:
  keep_with_next_lines: 2
  avoid_stranded_headings: true
  numbering:
    enabled: true
    style: "decimal"
    require_monotonic_increase: true

widows_orphans:
  widow_lines: 2
  orphan_lines: 2
  balance_facing_pages: false

code:
  block:
    font_size: "8.8pt"
    line_height: 1.20
    wrap: true
    overflow_policy: "wrap_then_shrink_minor"
    shrink_limit: 0.90

tables:
  cell_padding: "2pt 4pt"
  header_repeat: true
  overflow_policy: "shrink_then_wrap"
  shrink_limit: 0.85

# Each entry re-grades the severity of every rule matched by `selector`.
severity_overrides:
  - selector: { category: "citations" }
    severity: "must"
  - selector: { category: "headings", tag: "numbering" }
    severity: "must"
  - selector: { category: "layout", tag: "widows_orphans" }
    severity: "should"

locale_defaults:
  primary_language: "en"
  fallback_languages: ["fr"]
  quotation_style: "us"
  date_format: "YYYY-MM-DD"
  number_format:
    decimal_separator: "."
    thousands_separator: ","
--- END FILE ---

--- FILE: spec/profiles/memo.yaml ---
# Typeset profile tokens for `memo`: sans-serif body, hyphenation and heading
# numbering disabled, widows/orphans downgraded to "warn".
profile_id: "memo"
description: "Short internal documents: lenient pagination, strong clarity, minimal typographic complexity."

page:
  size: "Letter"
  orientation: "portrait"
  two_sided: false
  margins:
    top: "1in"
    bottom: "1in"
    inner: "1in"
    outer: "1in"

fonts:
  body:
    family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
    size: "11pt"
    line_height: 1.40
  heading:
    family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
  mono:
    family: ["Noto Sans Mono", "Consolas", "monospace"]
    size: "10pt"
    line_height: 1.30

measure_targets:
  columns: 1
  body_chars_per_line:
    min: 55
    ideal: 70
    max: 85

hyphenation:
  enabled: false
  strategy: "off_for_memos"

headings:
  keep_with_next_lines: 2
  avoid_stranded_headings: true
  numbering:
    enabled: false

widows_orphans:
  widow_lines: 1
  orphan_lines: 1
  balance_facing_pages: false

code:
  block:
    font_size: "9.5pt"
    line_height: 1.25
    wrap: true
    overflow_policy: "wrap"
    shrink_limit: 1.0

tables:
  cell_padding: "3pt 6pt"
  header_repeat: false
  overflow_policy: "wrap"
  shrink_limit: 1.0

# Each entry re-grades the severity of every rule matched by `selector`.
severity_overrides:
  - selector: { category: "layout", tag: "widows_orphans" }
    severity: "warn"
  - selector: { category: "accessibility" }
    severity: "must"

locale_defaults:
  primary_language: "en"
  fallback_languages: ["fr"]
  quotation_style: "us"
  date_format: "YYYY-MM-DD"
  number_format:
    decimal_separator: "."
    thousands_separator: ","
--- END FILE ---

--- FILE: spec/profiles/slide_deck.yaml ---
# Typeset profile tokens for `slide_deck`: landscape 16:9 pages, large type,
# short measure, and overflow rules promoted to "must".
profile_id: "slide_deck"
description: "Paged slides (16:9). Emphasis on hierarchy, short lines, and avoiding overflows."

page:
  size: "13.333in×7.5in"   # 16:9 at common PPT dimensions
  orientation: "landscape"
  two_sided: false
  margins:
    top: "0.5in"
    bottom: "0.5in"
    inner: "0.6in"
    outer: "0.6in"

fonts:
  body:
    family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
    size: "24pt"
    line_height: 1.15
  heading:
    family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
  mono:
    family: ["Noto Sans Mono", "Consolas", "monospace"]
    size: "20pt"
    line_height: 1.10

measure_targets:
  columns: 1
  body_chars_per_line:
    min: 25
    ideal: 40
    max: 55

hyphenation:
  enabled: false
  strategy: "off_for_slides"

headings:
  keep_with_next_lines: 1
  avoid_stranded_headings: true
  numbering:
    enabled: false

widows_orphans:
  widow_lines: 1
  orphan_lines: 1
  balance_facing_pages: false

code:
  block:
    font_size: "18pt"
    line_height: 1.10
    wrap: true
    overflow_policy: "wrap_then_shrink_minor"
    shrink_limit: 0.92

tables:
  cell_padding: "6pt 10pt"
  header_repeat: false
  overflow_policy: "shrink_then_wrap"
  shrink_limit: 0.88

# Each entry re-grades the severity of every rule matched by `selector`.
severity_overrides:
  - selector: { category: "layout", tag: "overflow" }
    severity: "must"
  - selector: { category: "accessibility" }
    severity: "must"

locale_defaults:
  primary_language: "en"
  fallback_languages: ["fr"]
  quotation_style: "us"
  date_format: "YYYY-MM-DD"
  number_format:
    decimal_separator: "."
    thousands_separator: ","
--- END FILE ---

--- FILE: spec/quality_gates.yaml ---
# Post-render QA gate thresholds. `metrics` documents each gate key; `profiles`
# gives, per profile, a `default` and a `strict` threshold set using those keys.
version: "0.1.0"
description: >
  Post-render QA gates. All thresholds are hard numeric limits used to fail builds
  (unless a gate is explicitly marked as "warn-only" by the invoking CLI flags).

metrics:
  max_widows_per_10_pages: "Count of widow lines across any 10 consecutive pages."
  max_orphans_per_10_pages: "Count of orphan lines across any 10 consecutive pages."
  max_stranded_headings: "Count of headings at page bottom with insufficient following content per keep rule."
  max_overfull_lines: "Count of lines exceeding measure by overflow threshold (render-time measured)."
  max_table_overflow_incidents: "Count of tables that overflow page/column bounds or are clipped."
  max_code_overflow_incidents: "Count of code blocks with horizontal overflow or clipping."
  max_link_wrap_incidents: "Count of wrapped URLs/DOIs/emails violating link wrap policy."
  max_heading_numbering_errors: "Count of numbering sequence/format violations."
  max_citation_format_errors: "Count of citations not matching configured style format."

overflow_detection:
  overfull_line_threshold_css_px: 1.0
  consider_clipping_as_overflow: true
  ignore_decorative_elements: true

profiles:
  web_pdf:
    default:
      max_widows_per_10_pages: 1
      max_orphans_per_10_pages: 1
      max_stranded_headings: 0
      max_overfull_lines: 2
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 1
      max_link_wrap_incidents: 2
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0
    strict:
      max_widows_per_10_pages: 0
      max_orphans_per_10_pages: 0
      max_stranded_headings: 0
      max_overfull_lines: 0
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 0
      max_link_wrap_incidents: 0
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0

  print_pdf:
    default:
      max_widows_per_10_pages: 0
      max_orphans_per_10_pages: 0
      max_stranded_headings: 0
      max_overfull_lines: 0
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 0
      max_link_wrap_incidents: 0
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0
    strict:
      max_widows_per_10_pages: 0
      max_orphans_per_10_pages: 0
      max_stranded_headings: 0
      max_overfull_lines: 0
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 0
      max_link_wrap_incidents: 0
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0

  dense_tech:
    default:
      max_widows_per_10_pages: 1
      max_orphans_per_10_pages: 1
      max_stranded_headings: 0
      max_overfull_lines: 3
      max_table_overflow_incidents: 1
      max_code_overflow_incidents: 2
      max_link_wrap_incidents: 3
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0
    strict:
      max_widows_per_10_pages: 0
      max_orphans_per_10_pages: 0
      max_stranded_headings: 0
      max_overfull_lines: 1
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 0
      max_link_wrap_incidents: 1
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0

  memo:
    default:
      max_widows_per_10_pages: 3
      max_orphans_per_10_pages: 3
      max_stranded_headings: 0
      max_overfull_lines: 2
      max_table_overflow_incidents: 1
      max_code_overflow_incidents: 1
      max_link_wrap_incidents: 4
      max_heading_numbering_errors: 1
      max_citation_format_errors: 1
    strict:
      max_widows_per_10_pages: 1
      max_orphans_per_10_pages: 1
      max_stranded_headings: 0
      max_overfull_lines: 0
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 0
      max_link_wrap_incidents: 2
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0

  slide_deck:
    default:
      max_widows_per_10_pages: 5
      max_orphans_per_10_pages: 5
      max_stranded_headings: 0
      max_overfull_lines: 0
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 0
      max_link_wrap_incidents: 0
      max_heading_numbering_errors: 0
      max_citation_format_errors: 1
    strict:
      max_widows_per_10_pages: 2
      max_orphans_per_10_pages: 2
      max_stranded_headings: 0
      max_overfull_lines: 0
      max_table_overflow_incidents: 0
      max_code_overflow_incidents: 0
      max_link_wrap_incidents: 0
      max_heading_numbering_errors: 0
      max_citation_format_errors: 0
--- END FILE ---

--- FILE: spec/indexes/README.md ---

# Indexes

This project builds small, fast indexes so the runtime can answer questions like:

* “Which rules mention *en dash*?”
* “Which rules cite *CMOS18 §6.88 p412*?”
* “Which rules apply to `postrender` QA?”
* “What rules are overridden by the `print_pdf` profile?”

Indexes are derived artifacts (rebuildable) and should not be hand-edited.

## Indexes the app will build

### 1) keyword → rule IDs

**Purpose:** fast search/autocomplete and lint explanations.

* **Path:** `spec/indexes/keywords_all.json` and per-category deltas:

  * `spec/indexes/keywords_<category>.json`
* **Format (JSON):**

  * keys: normalized keyword (lowercased)
  * values: array of rule IDs sorted stable (lexicographic)

Normalization (default):

* Unicode NFKC
* lowercase
* collapse whitespace
* strip surrounding punctuation

### 2) source_ref → rule IDs

**Purpose:** audit trail back to references without embedding book text.

* **Path:** `spec/indexes/source_refs_all.json` and per-category deltas:

  * `spec/indexes/source_refs_<category>.json`
* **Format (JSON):**

  * keys: exact `source_ref` pointer strings
  * values: array of rule IDs

### 3) category → rule IDs

**Purpose:** batch reporting, extraction coverage, profile scoping.

* **Path:** `spec/indexes/category.json`
* **Format (JSON):**

  * keys: category name
  * values: array of rule IDs

### 4) enforcement → rule IDs

**Purpose:** quickly decide which engine (lint/typeset/postrender/manual) handles which rules.

* **Path:** `spec/indexes/enforcement.json`

### 5) profile overrides

**Purpose:** allow profiles to override severity or token parameters without editing rules.

* **Path:** `spec/indexes/profile_overrides.json`
* **Format (JSON):**

  * per profile: list of override objects (selector + action)
  * selectors may match category, tags, applies_to, or explicit rule IDs

## Build guarantees

* Index builds are deterministic from:

  * `spec/rules/**/*.ndjson`
  * `spec/profiles/*.yaml`
  * `spec/manifest.yaml`

* The runtime must treat indexes as **cacheable**:

  * if index missing/outdated → rebuild or fallback to scanning rule files.

--- END FILE ---

--- FILE: spec/examples/README.md ---

# Examples

Rules stay compact and machine-enforceable; examples live separately to avoid bloating the rule registry.

## Goals

* Provide **concrete fixtures** for:

  * unit tests (lint, autofix, typeset transforms)
  * integration tests (render + QA gates)
  * documentation (human-readable “why this matters”)

* Keep examples **small** (a few lines) and **targeted** (each example triggers a known set of rules).

## Example ID format

`EX.<CATEGORY>.<TOPIC>.<NNN>`

* `CATEGORY` is the uppercased name of a category from the category taxonomy (e.g., `PUNCTUATION`, `NUMBERS`, `CITATIONS`)
* `TOPIC` is an uppercase short slug
* `NNN` is a zero-padded integer (000–999+)

Example:

* `EX.PUNCTUATION.DASHES.001`

## Suggested on-disk layout

* `spec/examples/<category>/EX.<CATEGORY>.<TOPIC>.<NNN>.yaml`
* `spec/examples/<category>/fixtures/<name>.md` (optional)

## Example YAML format (recommended)

Fields:

* `id` (required): example ID
* `rules` (required): list of rule IDs the example is meant to exercise
* `before` (required): inline Markdown or a reference to a fixture file
* `after` (optional): expected Markdown after autofix (if autofix exists)
* `expected` (optional): expected diagnostics/gates

  * `lint_errors`: array of rule IDs expected as errors
  * `lint_warnings`: array of rule IDs expected as warnings
  * `qa_failures`: array of gate keys expected to fail
* `notes` (optional): short human explanation (no book quotes)

Minimal example skeleton:

```yaml
- id: EX.PUNCTUATION.DASHES.001
  rules:
    - CMOS.PUNCTUATION.DASHES.EM_DASH
  before: |
    ...
  after: |
    ...
  expected:
    lint_errors: [CMOS.PUNCTUATION.DASHES.EM_DASH]
```

## Test corpus strategy

Maintain a small, curated corpus that triggers:

1. Lint-only issues (AST-level)

* punctuation spacing
* numeral formatting
* heading numbering patterns
* link normalization / unsafe URLs
* citation field completeness

2. Typeset-only issues (token/CSS decisions)

* paragraph indentation patterns
* code block wrapping rules
* table overflow strategies

3. Post-render QA issues (PDF/HTML layout)

* widows/orphans
* stranded headings (keep-with-next)
* overfull lines (especially monospace/code)
* table/caption overflow and clipping

Recommended corpus sizing:

* 30–80 fixtures total
* each fixture should target 3–10 rules max
* include “degraded mode” fixtures (intentionally malformed Markdown)

--- END FILE ---

--- FILE: app/ARCHITECTURE.md ---

# Runtime Architecture

This is a thin, deterministic runtime that:

A) ingests Markdown → normalizes a document AST → applies editorial lint (Chicago-derived)
B) applies typeset tokens/profile (Bringhurst-derived)
C) renders HTML and PDF deterministically
D) runs post-render QA gates (widows/orphans, heading keeps, overflow)
E) generates `layout-report.json` and fails builds when thresholds are exceeded

Primary reference PDFs provided to the system (for pointer-based rules and traceability only):

* The Chicago Manual of Style (18th ed).pdf
* Robert Bringhurst – The Elements of Typographic Style.pdf

No bulk transcription is performed; rules are paraphrases and cite sources only by pointer.

## Components

### 1) Registry Loader

Inputs:

* `spec/rules/**/*.ndjson` (Phase 2 output)
* `spec/schema/rule.schema.json`
* `spec/manifest.yaml`
* `spec/profiles/*.yaml`
* `spec/quality_gates.yaml`

Responsibilities:

* validate each rule against JSON Schema
* enforce ID uniqueness and stable sorting
* build or load indexes in `spec/indexes/*.json`
* compute coverage (implemented vs unimplemented; by enforcement)

Output (in-memory):

* `RuleStore` (rules + indexes + profile overrides + gate thresholds)

### 2) Markdown Ingest + AST Normalization

Steps:

1. Parse Markdown to an AST (mdast or equivalent).
2. Normalize to a stable internal schema:

   * heading levels and numbering metadata
   * lists and list tight/loose semantics
   * code spans/blocks with language tags
   * tables (GFM) to a consistent representation
   * links normalized (url, title, text)
   * citations normalized (if present as syntax/extensions)
3. Produce `normalized-doc.json` for debugging reproducibility.

Degraded mode:

* If parsing fails or structure is missing, switch to minimal node set and mark `structure_confidence: low`.
* Run the “degraded mode contract” from `spec/manifest.yaml`.

### 3) Editorial Lint Engine

What it does:

* Runs `lint`-enforced rules against normalized AST.
* Emits diagnostics:

  * `severity` (must/should/warn after profile overrides)
  * `rule_id`
  * location (source span) and node path
  * message (generated from rule metadata + implementation hints)

Autofix:

* If a rule’s `autofix` is `rewrite` or `suggest`, produce:

  * patched Markdown (rewrite) OR
  * suggestion blocks with exact spans to edit (suggest)
* Autofix must be deterministic and reversible (keep a patch log).

Artifacts:

* `lint-report.json`
* `lint-report.sarif` (optional for CI UIs)
* `lint-fixed.md` (optional, if autofix applied)

### 4) Typeset Profile Engine

Goal:

* Convert “typographic intent” into deterministic render inputs:

  * CSS tokens (variables)
  * layout policies (widows/orphans strategy, keeps, hyphenation params)
  * code/table overflow strategies

Inputs:

* normalized AST
* profile tokens from `spec/profiles/<profile>.yaml`

Outputs:

* `render.css` (tokenized CSS + paged-media rules)
* `render.html` (deterministic HTML with stable classnames/data attributes)
* `typeset-report.json` (what tokens were used, resolved font stack, measure targets)

Design principle:

* “Soft rules” are tokens; “hard rules” are lint/QA gates.

### 5) Deterministic Rendering

The runtime should treat rendering as an adapter layer.

Minimum requirement:

* Deterministic HTML generation (stable DOM order, stable IDs, stable whitespace).
* Deterministic PDF generation with pinned renderer/version and embedded fonts when possible.

Adapter concept:

* `RendererHTML`: emits HTML+CSS.
* `RendererPDF`: converts HTML+CSS to PDF using a configured engine.

Recommended renderer capabilities:

* CSS Paged Media support (page size/margins, running headers, footnotes if used)
* hyphenation dictionaries
* font embedding/subsetting

Artifacts:

* `out/<doc>.html`
* `out/<doc>.pdf`
* `out/render-log.json` (versions, timings, warnings)

### 6) Post-render QA Analyzer

Runs on:

* PDF (preferred for final layout truth) and optionally HTML.

Detects:

* widows/orphans (by paragraph line runs across pages)
* stranded headings (heading at bottom violating keep-with-next)
* overfull lines (glyph boxes exceed text block)
* table overflow/clipping
* code overflow/clipping
* link wrap incidents (URLs/DOIs split against policy)
* heading numbering errors (cross-check against AST numbering)
* citation format errors (cross-check against configured citation style)

Artifacts:

* `layout-report.json` (the canonical QA report)
* `qa-report.json` (gate evaluation + failures + excerpts as coordinates, not text)

Fail behavior:

* Compare measured metrics to `spec/quality_gates.yaml` for the chosen profile.
* Exit non-zero if any MUST-equivalent gate fails. With `--strict`, the strict thresholds are used in place of the defaults.

## Coverage Reporting and CI Guardrails

Coverage is computed from:

* total active rules
* rules with an implemented enforcement handler:

  * lint implemented if rule_id has an evaluator in lint engine
  * typeset implemented if token/policy exists and is applied deterministically
  * postrender implemented if analyzer has a detector for that rule/tag
  * manual implemented if checklist output includes it

Artifacts:

* `coverage-report.json` (counts by category, enforcement, severity, profile)
* `coverage-diff.json` (compares to baseline on main branch)

CI policy (from manifest):

* fail if MUST coverage drops
* fail if overall implemented coverage drops
* fail if rule IDs changed without deprecation mapping

## Assumptions (Phase 1 defaults)

1. The pipeline targets a CSS-based HTML→PDF engine (paged media capable).
2. Default language is `en` with optional `fr` fallback.
3. Citation style defaults to a Chicago-aligned style, but the registry will encode the exact variant in rules (Notes/Bibliography vs Author-Date) during extraction.
4. Fonts default to Noto/STIX families for broad coverage and consistent embedding, but can be overridden per profile.

No questions are strictly required to proceed with Phase 2 extraction; these assumptions can be adjusted via profiles and house rules.

--- END FILE ---

--- FILE: app/CLI_SPEC.md ---

# CLI Specification

The CLI is designed for CI use: deterministic outputs, stable exit codes, and JSON artifacts for tooling.

## Common flags (all commands)

* `--input <path>`: Markdown file or directory.
* `--out <dir>`: Output directory (default: `out/`).
* `--rules <dir>`: Rules root directory (default: `spec/rules/`).
* `--profile <name>`: One of: `web_pdf`, `print_pdf`, `dense_tech`, `memo`, `slide_deck`.
* `--strict`: Use strict thresholds in `spec/quality_gates.yaml`.
* `--format <json|sarif|text>`: Diagnostic output format (where applicable).
* `--fail-on <must|should|warn>`: Lowest severity that fails the command (default: `must`).
* `--degraded-ok`: Allow degraded mode without failing (still emits degraded-mode report).
* `--version`: Print tool + renderer versions.

## Command: `lint`

Purpose:

* Parse Markdown → normalize AST → run lint rules.
* Optionally apply autofixes.

Args:

* `--fix`: Apply autofixes for rules where `autofix != none` and the fix is safe to apply.
* `--fix-mode <rewrite|suggest>`: Whether to rewrite output Markdown or emit suggestions only.
* `--baseline <path>`: Compare diagnostics to an existing lint report and show diff.

Outputs:

* `out/lint-report.json`
* `out/lint-report.sarif` (if `--format sarif`)
* `out/lint-fixed.md` (if `--fix` and `--fix-mode rewrite`)
* `out/manual-checklist.md` (includes manual rules tagged `manual_checklist=true`)

Exit codes:

* `0`: no failing diagnostics
* `1`: lint failures at or above `--fail-on`
* `4`: config/schema error
* `5`: internal error

## Command: `render-html`

Purpose:

* Generate deterministic HTML + CSS from normalized AST + profile tokens.

Args:

* `--emit-normalized`: also write `normalized-doc.json`
* `--assets <dir>`: static assets dir (images, fonts, etc.)
* `--self-contained`: embed assets in HTML where possible

Outputs:

* `out/render.html`
* `out/render.css`
* `out/typeset-report.json`
* `out/normalized-doc.json` (optional)

Exit codes:

* `0`: success
* `3`: render error
* `4`: config/schema error
* `5`: internal error

## Command: `render-pdf`

Purpose:

* Render PDF deterministically from HTML + CSS + assets.

Args:

* `--engine <name>`: renderer adapter selection (implementation-defined)
* `--engine-opts <json>`: pass-through engine options
* `--keep-html`: keep intermediate HTML/CSS even if PDF fails

Outputs:

* `out/render.pdf`
* `out/render.html` + `out/render.css` (always on success; on PDF failure only if `--keep-html`)
* `out/render-log.json`

Exit codes:

* `0`: success
* `3`: render error
* `4`: config/schema error
* `5`: internal error

## Command: `qa`

Purpose:

* Run post-render QA analysis and evaluate quality gates.

Args:

* `--pdf <path>`: PDF to analyze (default: `out/render.pdf`)
* `--html <path>`: optional HTML for cross-checks
* `--gates <path>`: override gates file (default: `spec/quality_gates.yaml`)

Outputs:

* `out/layout-report.json`
* `out/qa-report.json`

Exit codes:

* `0`: all gates pass
* `2`: one or more gates failed at or above the `--fail-on` severity (evaluated against strict thresholds if `--strict`)
* `4`: config/schema error
* `5`: internal error

## Command: `report`

Purpose:

* Produce a consolidated report:

  * coverage (implemented vs unimplemented)
  * diffs vs baseline
  * per-category enforcement breakdown

Args:

* `--baseline <path>`: baseline coverage report to diff against
* `--since <gitref>`: optionally compute diffs since a git ref (implementation-defined)

Outputs:

* `out/coverage-report.json`
* `out/coverage-diff.json` (if baseline provided)
* `out/coverage-summary.md`

Exit codes:

* `0`: report built and coverage passes configured floors
* `2`: coverage floor violated
* `4`: config/schema error
* `5`: internal error

--- END FILE ---

--- FILE: spec/extraction_plan.md ---

# Phase 2 Extraction Plan

This plan defines how rules will be produced in controlled batches without reproducing the books.

## Non-negotiables (carried into Phase 2)

* No full-book OCR/transcription.
* No long verbatim passages.
* Rules are paraphrased and capped (`rule_text` ≤ 800 chars).
* Every rule includes at least one source pointer in `source_refs`.
* If a rule depends on exact wording, `rule_text` is still a paraphrase, but the rule must include:

  * in `rule_text`, the note: “Exact wording required—refer to pointer”
  * plus a usable pointer in `source_refs`.

Primary reference PDFs for pointer extraction:

* The Chicago Manual of Style (18th ed).pdf
* Robert Bringhurst – The Elements of Typographic Style.pdf

## Output batching format

When you say: `EXTRACT <CATEGORY> [<SCOPE>]`

I will output a bundle that includes:

1. **Rules NDJSON** (150–250 rule records)

* Path: `spec/rules/<category>/<batch_id>.ndjson`
* One JSON object per line, validated against `spec/schema/rule.schema.json`.

2. **Index deltas** for that category

* `spec/indexes/keywords_<category>.json`
* `spec/indexes/source_refs_<category>.json`
* `spec/indexes/coverage_delta_<category>.json`

3. **Coverage notes** report

* A short Markdown report describing enforcement split:

  * lint vs typeset vs postrender vs manual
  * plus any known gaps or “manual-only” areas

## Batch naming

`<batch_id>` format:

* `v1_<category>_<nnn>`

  * e.g., `v1_punctuation_001`

Batches are append-only:

* If a rule needs revision, mark the old rule `deprecated`, add a new rule ID (or a new version segment), and keep both records.

## Pointer scheme details

Pointer strings live in `source_refs[]` and are **not** quotes.

Preferred pointer format:

* `CMOS18 §<section> p<book_page>`
* `BRING §<section> p<book_page>`
* Optional disambiguation: `(scan p<pdf_page_index>)`

Example pattern (not a quote):

* `CMOS18 §6.1 p377 (scan p10)`

Notes:

* “book_page” uses the printed page number in the book when present (arabic or roman).
* “scan p” uses the PDF page index when printed page numbers are ambiguous.

## Recommended extraction order (high-impact first)

1. numbers
2. punctuation
3. citations
4. headings
5. tables
6. figures
7. links
8. code
9. layout (widows/orphans, keeps, overflow)
10. frontmatter, backmatter
11. accessibility
12. i18n

Rationale:

* Numbers/punctuation/citations most directly affect correctness, consistency, and auditability.
* Layout rules benefit from having structure and tokens in place.

## Scope parameter

`[<SCOPE>]` can constrain extraction, e.g.:

* `EXTRACT punctuation basic`
* `EXTRACT citations notes_bibliography`
* `EXTRACT numbers en_only`
* `EXTRACT layout widows_orphans`

If scope is omitted:

* extract the most generally applicable rules for that category first.

## Enforcement mapping guidelines (honest labeling)

* `lint`: detectable from AST or text normalization (e.g., spacing, punctuation patterns, citation fields).
* `typeset`: enforced via CSS/tokens/paged-media decisions (e.g., indent policy, measure targets, hyphenation params).
* `postrender`: requires layout inspection after rendering (e.g., widows/orphans, overfull lines, table clipping).
* `manual`: cannot be reliably automated; must include `tags: ["manual_checklist=true"]` and be emitted into checklist outputs.

If a concept spans multiple enforcement layers:

* Prefer splitting into two rules:

  * one lint rule (source cleanliness)
  * one postrender rule (layout outcome)
* Use `dependencies` to link them.

## Extraction workflow per category (repeatable)

For each category:

1. Build a topic map (subtopics, recurring failure modes).
2. Extract rules in clusters:

   * MUST rules first (enforceable or checklist)
   * SHOULD rules next
   * WARN rules last
3. For each rule:

   * add `keywords` for searchability
   * add tags for overrides/routing
   * add minimal `exceptions` when needed (avoid overfitting)

## “Degraded mode” considerations during extraction

For each category batch, include some rules that specifically target degraded inputs:

* hard-wrap repair suggestions
* heading inference warnings
* link sanitation and encoding fixes
* Unicode normalization notes

These rules should generally be `warn` or `should`, unless they prevent corruption (then `must`).

--- END FILE ---