forgejo-pdf/pub-style-skeleton.txt
2026-01-03 06:18:33 +00:00

1580 lines
40 KiB
Text
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

--- FILE: spec/schema/rule.schema.json ---
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://example.invalid/pubstyle/spec/schema/rule.schema.json",
"title": "Publication-quality rule record",
"type": "object",
"additionalProperties": false,
"required": [
"id",
"title",
"source_refs",
"category",
"severity",
"applies_to",
"rule_text",
"rationale",
"enforcement",
"autofix",
"autofix_notes",
"tags",
"keywords",
"dependencies",
"exceptions",
"status"
],
"properties": {
"id": {
"type": "string",
"description": "Stable rule identifier. Prefix must be one of CMOS, BRING, HOUSE.",
"minLength": 6,
"maxLength": 120,
"pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$"
},
"title": {
"type": "string",
"description": "Short human-readable rule title.",
"minLength": 4,
"maxLength": 160
},
"source_refs": {
"type": "array",
"description": "Pointers back to sources. Must be pointers, not quotes. Prefer: \"CMOS18 §X.Y pN\" / \"BRING §X.Y pN\" / \"HOUSE §X.Y pN\".",
"minItems": 1,
"items": {
"type": "string",
"minLength": 8,
"maxLength": 120,
"pattern": "^(CMOS18|BRING|HOUSE)\\s§[0-9A-Za-z][0-9A-Za-z.\\-]*\\sp[0-9ivxlcdmIVXLCDM]+(?:-[0-9ivxlcdmIVXLCDM]+)?(?:\\s\\(scan p[0-9]+\\))?$"
}
},
"category": {
"type": "string",
"description": "Primary taxonomy bucket.",
"enum": [
"editorial",
"typography",
"layout",
"headings",
"citations",
"numbers",
"punctuation",
"abbreviations",
"links",
"tables",
"figures",
"code",
"frontmatter",
"backmatter",
"accessibility",
"i18n"
]
},
"severity": {
"type": "string",
"description": "Normativity level. MUST blocks release unless downgraded by profile.",
"enum": ["must", "should", "warn"]
},
"applies_to": {
"type": "string",
"description": "Which pipeline stage(s) the rule targets.",
"enum": ["md", "html", "pdf", "all"]
},
"rule_text": {
"type": "string",
"description": "Paraphrased rule statement (no long quotes). If exact wording matters, note: \"Exact wording required—refer to pointer\".",
"minLength": 10,
"maxLength": 800
},
"rationale": {
"type": "string",
"description": "One-line rationale.",
"minLength": 5,
"maxLength": 200
},
"enforcement": {
"type": "string",
"description": "Primary enforcement mechanism.",
"enum": ["lint", "typeset", "postrender", "manual"]
},
"autofix": {
"type": "string",
"description": "Autofix capability, if any.",
"enum": ["none", "rewrite", "reflow", "suggest"]
},
"autofix_notes": {
"type": "string",
"description": "Notes describing what can be fixed and how/when. Keep short; never include book quotes.",
"maxLength": 400
},
"tags": {
"type": "array",
"description": "Compact labels for routing/search/overrides (e.g., 'manual_checklist=true', 'widows_orphans', 'hyphenation').",
"items": {
"type": "string",
"minLength": 1,
"maxLength": 48,
"pattern": "^[a-z0-9][a-z0-9_.:\\-/]*(?:=[a-z0-9_.:\\-/]+)?$"
},
"maxItems": 64
},
"keywords": {
"type": "array",
"description": "Search keywords (human-oriented; not necessarily normalized).",
"items": {
"type": "string",
"minLength": 2,
"maxLength": 48
},
"maxItems": 64
},
"dependencies": {
"type": "array",
"description": "Rule IDs that should be applied/understood first.",
"items": {
"type": "string",
"pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$"
},
"maxItems": 32
},
"exceptions": {
"type": "array",
"description": "Free-text exceptions/caveats. Keep concise.",
"items": {
"type": "string",
"minLength": 3,
"maxLength": 240
},
"maxItems": 32
},
"examples_ref": {
"type": "array",
"description": "Optional references to separately stored examples (see spec/examples/README.md).",
"items": {
"type": "string",
"minLength": 6,
"maxLength": 80,
"pattern": "^EX\\.[A-Z0-9_]+\\.[A-Z0-9_]+\\.[0-9]{3,}$"
},
"maxItems": 64
},
"implementation_notes": {
"type": "string",
"description": "Optional short notes for implementers (no quotes).",
"minLength": 3,
"maxLength": 600
},
"status": {
"type": "string",
"description": "Lifecycle state.",
"enum": ["draft", "active", "deprecated"]
}
},
"allOf": [
{
"if": {
"properties": {
"autofix": { "enum": ["rewrite", "reflow", "suggest"] }
},
"required": ["autofix"]
},
"then": {
"properties": {
"autofix_notes": { "minLength": 1 }
}
}
}
]
}
--- END FILE ---
--- FILE: spec/manifest.yaml ---
version: "0.1.0"
registry_id: "pubstyle"
description: >
Machine-readable style+typesetting rules for a Markdown→HTML→PDF pipeline,
backed by primary references (Chicago / Bringhurst) and optional house rules.
Rules are paraphrases only; sources are referenced by pointer strings.
id_naming:
prefixes:
CMOS: "Editorial/style usage rules derived primarily from Chicago."
BRING: "Typographic/layout rules derived primarily from Bringhurst."
HOUSE: "Project-specific rules not directly sourced to Chicago/Bringhurst."
pattern: "PREFIX.DOMAIN.TOPIC[.SUBTOPIC[.DETAIL...]]"
delimiter: "."
casing: "UPPER_SNAKE for segments"
stability:
rule_ids_are_immutable: true
rename_policy: "Deprecate old id; introduce new id; keep mapping in report diffs."
examples:
- "CMOS.PUNCTUATION.DASHES.EM_DASH"
- "BRING.LAYOUT.WIDOWS_ORPHANS.AVOID"
- "HOUSE.CITATIONS.DOI.PREFER_HTTPS"
source_pointer_scheme:
goal: "Provide auditable traceability without reproducing sources."
pointer_format_primary: "CMOS18 §<section> p<book_page>"
pointer_format_secondary: "BRING §<section> p<book_page>"
pointer_format_house: "HOUSE §<section> p<doc_page>"
optional_scan_hint: "(scan p<pdf_page_index>)"
allowed_page_numbering: ["arabic", "roman"]
notes:
- "Pointers must be sufficient for a reader with the book to locate the guidance."
- "Never store verbatim passages; paraphrase only."
- "If a rule depends on exact wording, rule_text must say: 'Exact wording required—refer to pointer'."
category_taxonomy:
- editorial
- typography
- layout
- headings
- citations
- numbers
- punctuation
- abbreviations
- links
- tables
- figures
- code
- frontmatter
- backmatter
- accessibility
- i18n
profiles:
- web_pdf
- print_pdf
- dense_tech
- memo
- slide_deck
planned_rule_counts:
target_total_range: [800, 1500]
target_by_category:
editorial: 120
typography: 170
layout: 140
headings: 70
citations: 140
numbers: 90
punctuation: 120
abbreviations: 60
links: 50
tables: 60
figures: 50
code: 70
frontmatter: 40
backmatter: 40
accessibility: 90
i18n: 60
coverage_contract:
must_rules:
enforceability_requirement: >
Every MUST rule must be enforceable by at least one of: lint, typeset, postrender;
otherwise it must be explicitly labeled as a manual checklist item and emitted in
a checklist output artifact.
manual_checklist_tag: "manual_checklist=true"
checklist_artifact: "manual-checklist.md (and JSON mirror)"
should_rules:
policy: "SHOULD rules ought to be enforceable when practical; otherwise they are allowed as manual with an explicit rationale."
warn_rules:
policy: "Warnings may be non-blocking and advisory; still require source pointers."
enforcement_definitions:
lint: "Static analysis over normalized Markdown/HTML AST. Deterministic."
typeset: "CSS/tokens shaping decisions prior to rendering (pagination, keeps, hyphenation parameters)."
postrender: "PDF/HTML layout inspection (widows/orphans, overflow, keep failures, numbering mismatches)."
manual: "Human review; system must still produce checklist items and traceability pointers."
ci_guardrails:
coverage_floor:
must_implemented_min_percent: 95
overall_implemented_min_percent: 80
regression_rule: "CI fails if implemented coverage decreases from main branch."
degraded_mode_contract:
purpose: "Handle badly-structured inputs safely without crashing; still provide useful output."
triggers:
- "Markdown parse errors / invalid UTF-8"
- "Missing heading hierarchy (no H1/H2 etc.)"
- "Garbage extraction (e.g., line breaks every word, excessive hard wraps)"
- "Mixed language with no lang metadata"
behavior:
normalize:
attempt_repairs:
- "Normalize whitespace and line endings"
- "Detect and unwrap hard-wrapped paragraphs heuristically"
- "Infer heading levels from patterns (e.g., '1.', '1.1', ALL CAPS lines) with low confidence"
if_unrecoverable:
- "Fall back to minimal AST: paragraphs + code blocks + raw spans"
- "Mark document structure confidence = low"
enforcement_in_degraded_mode:
lint:
run_subset: ["safety", "sanity", "catastrophic typography (double spaces, broken links)"]
downgrade_some_must_to_warn: true
typeset:
use_fallback_tokens: true
disable_aggressive_hyphenation: true
postrender:
run_core_gates_only: ["overfull_lines", "table_overflow_incidents", "code_overflow_incidents"]
reporting:
always_emit:
- "layout-report.json"
- "coverage-report.json"
- "degraded-mode-report.json (what was inferred and why)"
--- END FILE ---
--- FILE: spec/profiles/web_pdf.yaml ---
profile_id: "web_pdf"
description: "Screen-first PDF for sharing and reading; conservative pagination and strong accessibility defaults."
page:
size: "Letter"
orientation: "portrait"
two_sided: false
margins:
top: "22mm"
bottom: "22mm"
inner: "20mm"
outer: "20mm"
fonts:
body:
family: ["Noto Serif", "STIX Two Text", "Times New Roman", "serif"]
size: "11pt"
line_height: 1.45
heading:
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
mono:
family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"]
size: "10pt"
line_height: 1.35
measure_targets:
columns: 1
body_chars_per_line:
min: 55
ideal: 66
max: 75
footnote_chars_per_line:
min: 50
ideal: 60
max: 70
hyphenation:
enabled: true
strategy: "balanced"
language_driven: true
min_left: 2
min_right: 3
max_consecutive_hyphenated_lines: 2
avoid_proper_names_when_possible: true
avoid_after_short_lines: true
paragraphs:
first_paragraph_indent: "0"
indent: "1em"
block_paragraph_spacing: "0.6em"
headings:
keep_with_next_lines: 2
avoid_stranded_headings: true
numbering:
enabled: true
style: "decimal"
require_monotonic_increase: true
widows_orphans:
widow_lines: 2
orphan_lines: 2
balance_facing_pages: false
code:
inline:
use_mono: true
block:
font_size: "9.5pt"
line_height: 1.35
wrap: true
max_wrap_penalty: "medium"
overflow_policy: "wrap_then_shrink_minor"
shrink_limit: 0.92
tables:
cell_padding: "3pt 6pt"
header_repeat: true
overflow_policy: "shrink_then_wrap"
shrink_limit: 0.9
severity_overrides:
- selector: { category: "layout", tag: "widows_orphans" }
  severity: "should"
- selector: { category: "accessibility" }
  severity: "must"
locale_defaults:
primary_language: "en"
fallback_languages: ["fr"]
quotation_style: "us" # curly quotes, US punctuation conventions
date_format: "YYYY-MM-DD"
number_format:
decimal_separator: "."
thousands_separator: ","
--- END FILE ---
--- FILE: spec/profiles/print_pdf.yaml ---
profile_id: "print_pdf"
description: "Print-oriented PDF with stricter pagination, book-like rhythm, and stronger keep constraints."
page:
size: "6in×9in"
orientation: "portrait"
two_sided: true
margins:
top: "18mm"
bottom: "20mm"
inner: "22mm"
outer: "18mm"
fonts:
body:
family: ["STIX Two Text", "Noto Serif", "Georgia", "serif"]
size: "10.5pt"
line_height: 1.50
heading:
family: ["STIX Two Text", "Noto Serif", "serif"]
mono:
family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"]
size: "9.5pt"
line_height: 1.30
measure_targets:
columns: 1
body_chars_per_line:
min: 55
ideal: 66
max: 72
hyphenation:
enabled: true
strategy: "print_quality"
min_left: 2
min_right: 3
max_consecutive_hyphenated_lines: 2
avoid_proper_names_when_possible: true
paragraphs:
first_paragraph_indent: "0"
indent: "1em"
headings:
keep_with_next_lines: 3
avoid_stranded_headings: true
numbering:
enabled: true
style: "decimal"
require_monotonic_increase: true
widows_orphans:
widow_lines: 2
orphan_lines: 2
balance_facing_pages: true
code:
block:
font_size: "9pt"
line_height: 1.25
wrap: false
overflow_policy: "shrink_then_scroll_indicator"
shrink_limit: 0.90
tables:
cell_padding: "2.5pt 5pt"
header_repeat: true
overflow_policy: "shrink_then_rotate_if_allowed"
shrink_limit: 0.88
severity_overrides:
- selector: { category: "layout", tag: "widows_orphans" }
  severity: "must"
- selector: { category: "layout", tag: "keep_constraints" }
  severity: "must"
- selector: { category: "typography", tag: "spacing_consistency" }
  severity: "must"
locale_defaults:
primary_language: "en"
fallback_languages: ["fr"]
quotation_style: "us"
date_format: "Month D, YYYY"
number_format:
decimal_separator: "."
thousands_separator: ","
--- END FILE ---
--- FILE: spec/profiles/dense_tech.yaml ---
profile_id: "dense_tech"
description: "Technical papers and specs: denser copy, more code/table tolerance, strict numbering and citations."
page:
size: "A4"
orientation: "portrait"
two_sided: false
margins:
top: "18mm"
bottom: "18mm"
inner: "18mm"
outer: "18mm"
fonts:
body:
family: ["Noto Serif", "STIX Two Text", "serif"]
size: "10pt"
line_height: 1.35
heading:
family: ["Noto Sans", "Source Sans 3", "sans-serif"]
mono:
family: ["Noto Sans Mono", "Source Code Pro", "monospace"]
size: "9pt"
line_height: 1.25
measure_targets:
columns: 1
body_chars_per_line:
min: 65
ideal: 75
max: 90
hyphenation:
enabled: true
strategy: "balanced"
min_left: 2
min_right: 3
max_consecutive_hyphenated_lines: 3
avoid_proper_names_when_possible: true
headings:
keep_with_next_lines: 2
avoid_stranded_headings: true
numbering:
enabled: true
style: "decimal"
require_monotonic_increase: true
widows_orphans:
widow_lines: 2
orphan_lines: 2
balance_facing_pages: false
code:
block:
font_size: "8.8pt"
line_height: 1.20
wrap: true
overflow_policy: "wrap_then_shrink_minor"
shrink_limit: 0.90
tables:
cell_padding: "2pt 4pt"
header_repeat: true
overflow_policy: "shrink_then_wrap"
shrink_limit: 0.85
severity_overrides:
- selector: { category: "citations" }
  severity: "must"
- selector: { category: "headings", tag: "numbering" }
  severity: "must"
- selector: { category: "layout", tag: "widows_orphans" }
  severity: "should"
locale_defaults:
primary_language: "en"
fallback_languages: ["fr"]
quotation_style: "us"
date_format: "YYYY-MM-DD"
number_format:
decimal_separator: "."
thousands_separator: ","
--- END FILE ---
--- FILE: spec/profiles/memo.yaml ---
profile_id: "memo"
description: "Short internal documents: lenient pagination, strong clarity, minimal typographic complexity."
page:
size: "Letter"
orientation: "portrait"
two_sided: false
margins:
top: "1in"
bottom: "1in"
inner: "1in"
outer: "1in"
fonts:
body:
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
size: "11pt"
line_height: 1.40
heading:
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
mono:
family: ["Noto Sans Mono", "Consolas", "monospace"]
size: "10pt"
line_height: 1.30
measure_targets:
columns: 1
body_chars_per_line:
min: 55
ideal: 70
max: 85
hyphenation:
enabled: false
strategy: "off_for_memos"
headings:
keep_with_next_lines: 2
avoid_stranded_headings: true
numbering:
enabled: false
widows_orphans:
widow_lines: 1
orphan_lines: 1
balance_facing_pages: false
code:
block:
font_size: "9.5pt"
line_height: 1.25
wrap: true
overflow_policy: "wrap"
shrink_limit: 1.0
tables:
cell_padding: "3pt 6pt"
header_repeat: false
overflow_policy: "wrap"
shrink_limit: 1.0
severity_overrides:
- selector: { category: "layout", tag: "widows_orphans" }
  severity: "warn"
- selector: { category: "accessibility" }
  severity: "must"
locale_defaults:
primary_language: "en"
fallback_languages: ["fr"]
quotation_style: "us"
date_format: "YYYY-MM-DD"
number_format:
decimal_separator: "."
thousands_separator: ","
--- END FILE ---
--- FILE: spec/profiles/slide_deck.yaml ---
profile_id: "slide_deck"
description: "Paged slides (16:9). Emphasis on hierarchy, short lines, and avoiding overflows."
page:
size: "13.333in×7.5in" # 16:9 at common PPT dimensions
orientation: "landscape"
two_sided: false
margins:
top: "0.5in"
bottom: "0.5in"
inner: "0.6in"
outer: "0.6in"
fonts:
body:
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
size: "24pt"
line_height: 1.15
heading:
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
mono:
family: ["Noto Sans Mono", "Consolas", "monospace"]
size: "20pt"
line_height: 1.10
measure_targets:
columns: 1
body_chars_per_line:
min: 25
ideal: 40
max: 55
hyphenation:
enabled: false
strategy: "off_for_slides"
headings:
keep_with_next_lines: 1
avoid_stranded_headings: true
numbering:
enabled: false
widows_orphans:
widow_lines: 1
orphan_lines: 1
balance_facing_pages: false
code:
block:
font_size: "18pt"
line_height: 1.10
wrap: true
overflow_policy: "wrap_then_shrink_minor"
shrink_limit: 0.92
tables:
cell_padding: "6pt 10pt"
header_repeat: false
overflow_policy: "shrink_then_wrap"
shrink_limit: 0.88
severity_overrides:
- selector: { category: "layout", tag: "overflow" }
  severity: "must"
- selector: { category: "accessibility" }
  severity: "must"
locale_defaults:
primary_language: "en"
fallback_languages: ["fr"]
quotation_style: "us"
date_format: "YYYY-MM-DD"
number_format:
decimal_separator: "."
thousands_separator: ","
--- END FILE ---
--- FILE: spec/quality_gates.yaml ---
version: "0.1.0"
description: >
Post-render QA gates. All thresholds are hard numeric limits used to fail builds
(unless a gate is explicitly marked as "warn-only" by the invoking CLI flags).
metrics:
max_widows_per_10_pages: "Count of widow lines across any 10 consecutive pages."
max_orphans_per_10_pages: "Count of orphan lines across any 10 consecutive pages."
max_stranded_headings: "Count of headings at page bottom with insufficient following content per keep rule."
max_overfull_lines: "Count of lines exceeding measure by overflow threshold (render-time measured)."
max_table_overflow_incidents: "Count of tables that overflow page/column bounds or are clipped."
max_code_overflow_incidents: "Count of code blocks with horizontal overflow or clipping."
max_link_wrap_incidents: "Count of wrapped URLs/DOIs/emails violating link wrap policy."
max_heading_numbering_errors: "Count of numbering sequence/format violations."
max_citation_format_errors: "Count of citations not matching configured style format."
overflow_detection:
overfull_line_threshold_css_px: 1.0
consider_clipping_as_overflow: true
ignore_decorative_elements: true
profiles:
web_pdf:
default:
max_widows_per_10_pages: 1
max_orphans_per_10_pages: 1
max_stranded_headings: 0
max_overfull_lines: 2
max_table_overflow_incidents: 0
max_code_overflow_incidents: 1
max_link_wrap_incidents: 2
max_heading_numbering_errors: 0
max_citation_format_errors: 0
strict:
max_widows_per_10_pages: 0
max_orphans_per_10_pages: 0
max_stranded_headings: 0
max_overfull_lines: 0
max_table_overflow_incidents: 0
max_code_overflow_incidents: 0
max_link_wrap_incidents: 0
max_heading_numbering_errors: 0
max_citation_format_errors: 0
print_pdf:
default:
max_widows_per_10_pages: 0
max_orphans_per_10_pages: 0
max_stranded_headings: 0
max_overfull_lines: 0
max_table_overflow_incidents: 0
max_code_overflow_incidents: 0
max_link_wrap_incidents: 0
max_heading_numbering_errors: 0
max_citation_format_errors: 0
strict:
max_widows_per_10_pages: 0
max_orphans_per_10_pages: 0
max_stranded_headings: 0
max_overfull_lines: 0
max_table_overflow_incidents: 0
max_code_overflow_incidents: 0
max_link_wrap_incidents: 0
max_heading_numbering_errors: 0
max_citation_format_errors: 0
dense_tech:
default:
max_widows_per_10_pages: 1
max_orphans_per_10_pages: 1
max_stranded_headings: 0
max_overfull_lines: 3
max_table_overflow_incidents: 1
max_code_overflow_incidents: 2
max_link_wrap_incidents: 3
max_heading_numbering_errors: 0
max_citation_format_errors: 0
strict:
max_widows_per_10_pages: 0
max_orphans_per_10_pages: 0
max_stranded_headings: 0
max_overfull_lines: 1
max_table_overflow_incidents: 0
max_code_overflow_incidents: 0
max_link_wrap_incidents: 1
max_heading_numbering_errors: 0
max_citation_format_errors: 0
memo:
default:
max_widows_per_10_pages: 3
max_orphans_per_10_pages: 3
max_stranded_headings: 0
max_overfull_lines: 2
max_table_overflow_incidents: 1
max_code_overflow_incidents: 1
max_link_wrap_incidents: 4
max_heading_numbering_errors: 1
max_citation_format_errors: 1
strict:
max_widows_per_10_pages: 1
max_orphans_per_10_pages: 1
max_stranded_headings: 0
max_overfull_lines: 0
max_table_overflow_incidents: 0
max_code_overflow_incidents: 0
max_link_wrap_incidents: 2
max_heading_numbering_errors: 0
max_citation_format_errors: 0
slide_deck:
default:
max_widows_per_10_pages: 5
max_orphans_per_10_pages: 5
max_stranded_headings: 0
max_overfull_lines: 0
max_table_overflow_incidents: 0
max_code_overflow_incidents: 0
max_link_wrap_incidents: 0
max_heading_numbering_errors: 0
max_citation_format_errors: 1
strict:
max_widows_per_10_pages: 2
max_orphans_per_10_pages: 2
max_stranded_headings: 0
max_overfull_lines: 0
max_table_overflow_incidents: 0
max_code_overflow_incidents: 0
max_link_wrap_incidents: 0
max_heading_numbering_errors: 0
max_citation_format_errors: 0
--- END FILE ---
--- FILE: spec/indexes/README.md ---
# Indexes
This project builds small, fast indexes so the runtime can answer questions like:
* “Which rules mention *en dash*?”
* “Which rules cite *CMOS18 §6.88 p412*?”
* “Which rules apply to `postrender` QA?”
* “What rules are overridden by the `print_pdf` profile?”
Indexes are derived artifacts (rebuildable) and should not be hand-edited.
## Indexes the app will build
### 1) keyword → rule IDs
**Purpose:** fast search/autocomplete and lint explanations.
* **Path:** `spec/indexes/keywords_all.json` and per-category deltas:
* `spec/indexes/keywords_<category>.json`
* **Format (JSON):**
* keys: normalized keyword (lowercased)
* values: array of rule IDs sorted stable (lexicographic)
Normalization (default):
* Unicode NFKC
* lowercase
* collapse whitespace
* strip surrounding punctuation
### 2) source_ref → rule IDs
**Purpose:** audit trail back to references without embedding book text.
* **Path:** `spec/indexes/source_refs_all.json` and per-category deltas:
* `spec/indexes/source_refs_<category>.json`
* **Format (JSON):**
* keys: exact `source_ref` pointer strings
* values: array of rule IDs
### 3) category → rule IDs
**Purpose:** batch reporting, extraction coverage, profile scoping.
* **Path:** `spec/indexes/category.json`
* **Format (JSON):**
* keys: category name
* values: array of rule IDs
### 4) enforcement → rule IDs
**Purpose:** quickly decide which engine (lint/typeset/postrender/manual) handles which rules.
* **Path:** `spec/indexes/enforcement.json`
### 5) profile overrides
**Purpose:** allow profiles to override severity or token parameters without editing rules.
* **Path:** `spec/indexes/profile_overrides.json`
* **Format (JSON):**
* per profile: list of override objects (selector + action)
* selectors may match category, tags, applies_to, or explicit rule IDs
## Build guarantees
* Index builds are deterministic from:
* `spec/rules/**.ndjson`
* `spec/profiles/*.yaml`
* `spec/manifest.yaml`
* The runtime must treat indexes as **cacheable**:
* if index missing/outdated → rebuild or fallback to scanning rule files.
--- END FILE ---
--- FILE: spec/examples/README.md ---
# Examples
Rules stay compact and machine-enforceable; examples live separately to avoid bloating the rule registry.
## Goals
* Provide **concrete fixtures** for:
* unit tests (lint, autofix, typeset transforms)
* integration tests (render + QA gates)
* documentation (human-readable “why this matters”)
* Keep examples **small** (a few lines) and **targeted** (each example triggers a known set of rules).
## Example ID format
`EX.<CATEGORY>.<TOPIC>.<NNN>`
* `CATEGORY` must match the category taxonomy (e.g., `PUNCTUATION`, `NUMBERS`, `CITATIONS`)
* `TOPIC` is an uppercase short slug
* `NNN` is a zero-padded integer (000–999+)
Example:
* `EX.PUNCTUATION.DASHES.001`
## Suggested on-disk layout
* `spec/examples/<category>/EX.<CATEGORY>.<TOPIC>.<NNN>.yaml`
* `spec/examples/<category>/fixtures/<name>.md` (optional)
## Example YAML format (recommended)
Fields:
* `id` (required): example ID
* `rules` (required): list of rule IDs the example is meant to exercise
* `before` (required): inline Markdown or a reference to a fixture file
* `after` (optional): expected Markdown after autofix (if autofix exists)
* `expected` (optional): expected diagnostics/gates
* `lint_errors`: array of rule IDs expected as errors
* `lint_warnings`: array of rule IDs expected as warnings
* `qa_failures`: array of gate keys expected to fail
* `notes` (optional): short human explanation (no book quotes)
Minimal example skeleton:
```yaml
- id: EX.PUNCTUATION.DASHES.001
  rules:
    - CMOS.PUNCTUATION.DASHES.EM_DASH
  before: |
    ...
  after: |
    ...
  expected:
    lint_errors: [CMOS.PUNCTUATION.DASHES.EM_DASH]
```
## Test corpus strategy
Maintain a small, curated corpus that triggers:
1. Lint-only issues (AST-level)
* punctuation spacing
* numeral formatting
* heading numbering patterns
* link normalization / unsafe URLs
* citation field completeness
2. Typeset-only issues (token/CSS decisions)
* paragraph indentation patterns
* code block wrapping rules
* table overflow strategies
3. Post-render QA issues (PDF/HTML layout)
* widows/orphans
* stranded headings (keep-with-next)
* overfull lines (especially monospace/code)
* table/caption overflow and clipping
Recommended corpus sizing:
* 30–80 fixtures total
* each fixture should target 3–10 rules max
* include “degraded mode” fixtures (intentionally malformed Markdown)
--- END FILE ---
--- FILE: app/ARCHITECTURE.md ---
# Runtime Architecture
This is a thin, deterministic runtime that:
A) ingests Markdown → normalizes a document AST → applies editorial lint (Chicago-derived)
B) applies typeset tokens/profile (Bringhurst-derived)
C) renders HTML and PDF deterministically
D) runs post-render QA gates (widows/orphans, heading keeps, overflow)
E) generates `layout-report.json` and fails builds when thresholds are exceeded
Primary reference PDFs provided to the system (for pointer-based rules and traceability only):
* The Chicago Manual of Style (18th ed).pdf
* Robert Bringhurst The Elements of Typographic Style.pdf
No bulk transcription is performed; rules are paraphrases and cite sources only by pointer.
## Components
### 1) Registry Loader
Inputs:
* `spec/rules/**.ndjson` (Phase 2 output)
* `spec/schema/rule.schema.json`
* `spec/manifest.yaml`
* `spec/profiles/*.yaml`
* `spec/quality_gates.yaml`
Responsibilities:
* validate each rule against JSON Schema
* enforce ID uniqueness and stable sorting
* build or load indexes in `spec/indexes/*.json`
* compute coverage (implemented vs unimplemented; by enforcement)
Output (in-memory):
* `RuleStore` (rules + indexes + profile overrides + gate thresholds)
### 2) Markdown Ingest + AST Normalization
Steps:
1. Parse Markdown to an AST (mdast or equivalent).
2. Normalize to a stable internal schema:
* heading levels and numbering metadata
* lists and list tight/loose semantics
* code spans/blocks with language tags
* tables (GFM) to a consistent representation
* links normalized (url, title, text)
* citations normalized (if present as syntax/extensions)
3. Produce `normalized-doc.json` for debugging reproducibility.
Degraded mode:
* If parsing fails or structure is missing, switch to minimal node set and mark `structure_confidence: low`.
* Run the “degraded mode contract” from `spec/manifest.yaml`.
### 3) Editorial Lint Engine
What it does:
* Runs `lint`-enforced rules against normalized AST.
* Emits diagnostics:
* `severity` (must/should/warn after profile overrides)
* `rule_id`
* location (source span) and node path
* message (generated from rule metadata + implementation hints)
Autofix:
* If a rule's `autofix` is `rewrite` or `suggest`, produce:
* patched Markdown (rewrite) OR
* suggestion blocks with exact spans to edit (suggest)
* Autofix must be deterministic and reversible (keep a patch log).
Artifacts:
* `lint-report.json`
* `lint-report.sarif` (optional for CI UIs)
* `lint-fixed.md` (optional, if autofix applied)
### 4) Typeset Profile Engine
Goal:
* Convert “typographic intent” into deterministic render inputs:
* CSS tokens (variables)
* layout policies (widows/orphans strategy, keeps, hyphenation params)
* code/table overflow strategies
Inputs:
* normalized AST
* profile tokens from `spec/profiles/<profile>.yaml`
Outputs:
* `render.css` (tokenized CSS + paged-media rules)
* `render.html` (deterministic HTML with stable classnames/data attributes)
* `typeset-report.json` (what tokens were used, resolved font stack, measure targets)
Design principle:
* “Soft rules” are tokens; “hard rules” are lint/QA gates.
### 5) Deterministic Rendering
The runtime should treat rendering as an adapter layer.
Minimum requirement:
* Deterministic HTML generation (stable DOM order, stable IDs, stable whitespace).
* Deterministic PDF generation with pinned renderer/version and embedded fonts when possible.
Adapter concept:
* `RendererHTML`: emits HTML+CSS.
* `RendererPDF`: converts HTML+CSS to PDF using a configured engine.
Recommended renderer capabilities:
* CSS Paged Media support (page size/margins, running headers, footnotes if used)
* hyphenation dictionaries
* font embedding/subsetting
Artifacts:
* `out/<doc>.html`
* `out/<doc>.pdf`
* `out/render-log.json` (versions, timings, warnings)
### 6) Post-render QA Analyzer
Runs on:
* PDF (preferred for final layout truth) and optionally HTML.
Detects:
* widows/orphans (by paragraph line runs across pages)
* stranded headings (heading at bottom violating keep-with-next)
* overfull lines (glyph boxes exceed text block)
* table overflow/clipping
* code overflow/clipping
* link wrap incidents (URLs/DOIs split against policy)
* heading numbering errors (cross-check against AST numbering)
* citation format errors (cross-check against configured citation style)
Artifacts:
* `layout-report.json` (the canonical QA report)
* `qa-report.json` (gate evaluation + failures + excerpts as coordinates, not text)
Fail behavior:
* Compare measured metrics to `spec/quality_gates.yaml` for the chosen profile.
* Exit non-zero if any MUST-equivalent gate fails (or if `--strict` chosen, strict thresholds apply).
## Coverage Reporting and CI Guardrails
Coverage is computed from:
* total active rules
* rules with an implemented enforcement handler:
* lint implemented if rule_id has an evaluator in lint engine
* typeset implemented if token/policy exists and is applied deterministically
* postrender implemented if analyzer has a detector for that rule/tag
* manual implemented if checklist output includes it
Artifacts:
* `coverage-report.json` (counts by category, enforcement, severity, profile)
* `coverage-diff.json` (compares to baseline on main branch)
CI policy (from manifest):
* fail if MUST coverage drops
* fail if overall implemented coverage drops
* fail if rule IDs changed without deprecation mapping
## Assumptions (Phase 1 defaults)
1. The pipeline targets a CSS-based HTML→PDF engine (paged media capable).
2. Default language is `en` with optional `fr` fallback.
3. Citation style defaults to a Chicago-aligned style, but the registry will encode the exact variant in rules (Notes/Bibliography vs Author-Date) during extraction.
4. Fonts default to Noto/STIX families for broad coverage and consistent embedding, but can be overridden per profile.
No questions are strictly required to proceed with Phase 2 extraction; these assumptions can be adjusted via profiles and house rules.
--- END FILE ---
--- FILE: app/CLI_SPEC.md ---
# CLI Specification
The CLI is designed for CI use: deterministic outputs, stable exit codes, and JSON artifacts for tooling.
## Common flags (all commands)
* `--input <path>`: Markdown file or directory.
* `--out <dir>`: Output directory (default: `out/`).
* `--rules <dir>`: Rules root directory (default: `spec/rules/`).
* `--profile <name>`: One of: `web_pdf`, `print_pdf`, `dense_tech`, `memo`, `slide_deck`.
* `--strict`: Use strict thresholds in `spec/quality_gates.yaml`.
* `--format <json|sarif|text>`: Diagnostic output format (where applicable).
* `--fail-on <must|should|warn>`: Lowest severity that fails the command (default: `must`).
* `--degraded-ok`: Allow degraded mode without failing (still emits degraded-mode report).
* `--version`: Print tool + renderer versions.
## Command: `lint`
Purpose:
* Parse Markdown → normalize AST → run lint rules.
* Optionally apply autofixes.
Args:
* `--fix`: Apply autofixes for rules where `autofix != none` and the fix is safe to apply.
* `--fix-mode <rewrite|suggest>`: Whether to rewrite output Markdown or emit suggestions only.
* `--baseline <path>`: Compare diagnostics to an existing lint report and show diff.
Outputs:
* `out/lint-report.json`
* `out/lint-report.sarif` (if `--format sarif`)
* `out/lint-fixed.md` (if `--fix` and `--fix-mode rewrite`)
* `out/manual-checklist.md` (includes manual rules tagged `manual_checklist=true`)
Exit codes:
* `0`: no failing diagnostics
* `1`: lint failures at or above `--fail-on`
* `4`: config/schema error
* `5`: internal error
## Command: `render-html`
Purpose:
* Generate deterministic HTML + CSS from normalized AST + profile tokens.
Args:
* `--emit-normalized`: also write `normalized-doc.json`
* `--assets <dir>`: static assets dir (images, fonts, etc.)
* `--self-contained`: embed assets in HTML where possible
Outputs:
* `out/render.html`
* `out/render.css`
* `out/typeset-report.json`
* `out/normalized-doc.json` (optional)
Exit codes:
* `0`: success
* `3`: render error
* `4`: config/schema error
* `5`: internal error
## Command: `render-pdf`
Purpose:
* Render PDF deterministically from HTML + CSS + assets.
Args:
* `--engine <name>`: renderer adapter selection (implementation-defined)
* `--engine-opts <json>`: pass-through engine options
* `--keep-html`: keep intermediate HTML/CSS even if PDF fails
Outputs:
* `out/render.pdf`
* `out/render.html` + `out/render.css` (kept on success; on PDF failure only if `--keep-html`)
* `out/render-log.json`
Exit codes:
* `0`: success
* `3`: render error
* `4`: config/schema error
* `5`: internal error
## Command: `qa`
Purpose:
* Run post-render QA analysis and evaluate quality gates.
Args:
* `--pdf <path>`: PDF to analyze (default: `out/render.pdf`)
* `--html <path>`: optional HTML for cross-checks
* `--gates <path>`: override gates file (default: `spec/quality_gates.yaml`)
Outputs:
* `out/layout-report.json`
* `out/qa-report.json`
Exit codes:
* `0`: all gates pass
* `2`: gates failed (at or above `--fail-on` / strictness)
* `4`: config/schema error
* `5`: internal error
## Command: `report`
Purpose:
* Produce a consolidated report:
* coverage (implemented vs unimplemented)
* diffs vs baseline
* per-category enforcement breakdown
Args:
* `--baseline <path>`: baseline coverage report to diff against
* `--since <gitref>`: optionally compute diffs since a git ref (implementation-defined)
Outputs:
* `out/coverage-report.json`
* `out/coverage-diff.json` (if baseline provided)
* `out/coverage-summary.md`
Exit codes:
* `0`: report built and coverage passes configured floors
* `2`: coverage floor violated
* `4`: config/schema error
* `5`: internal error
--- END FILE ---
--- FILE: spec/extraction_plan.md ---
# Phase 2 Extraction Plan
This plan defines how rules will be produced in controlled batches without reproducing the books.
## Non-negotiables (carried into Phase 2)
* No full-book OCR/transcription.
* No long verbatim passages.
* Rules are paraphrased and capped (`rule_text` ≤ 800 chars).
* Every rule includes at least one source pointer in `source_refs`.
* If a rule depends on exact wording, the rule still paraphrases but must include:
* `rule_text`: “Exact wording required—refer to pointer”
* plus a usable pointer.
Primary reference PDFs for pointer extraction:
* The Chicago Manual of Style (18th ed).pdf
* Robert Bringhurst The Elements of Typographic Style.pdf
## Output batching format
When you say: `EXTRACT <CATEGORY> [<SCOPE>]`
I will output a bundle that includes:
1. **Rules NDJSON** (150–250 rule records)
* Path: `spec/rules/<category>/<batch_id>.ndjson`
* One JSON object per line, validated against `spec/schema/rule.schema.json`.
2. **Index deltas** for that category
* `spec/indexes/keywords_<category>.json`
* `spec/indexes/source_refs_<category>.json`
* `spec/indexes/coverage_delta_<category>.json`
3. **Coverage notes** report
* A short Markdown report describing enforcement split:
* lint vs typeset vs postrender vs manual
* plus any known gaps or “manual-only” areas
## Batch naming
`<batch_id>` format:
* `v1_<category>_<nnn>`
* e.g., `v1_punctuation_001`
Batches are append-only:
* If a rule needs revision, mark the old rule `deprecated`, add a new rule ID (or a new version segment), and keep both records.
## Pointer scheme details
Pointer strings live in `source_refs[]` and are **not** quotes.
Preferred pointer format:
* `CMOS18 §<section> p<book_page>`
* `BRING §<section> p<book_page>`
* Optional disambiguation: `(scan p<pdf_page_index>)`
Example pattern (not a quote):
* `CMOS18 §6.1 p377 (scan p10)`
Notes:
* “book_page” uses the printed page number in the book when present (arabic or roman).
* “scan p” uses the PDF page index when printed page numbers are ambiguous.
## Recommended extraction order (high-impact first)
1. numbers
2. punctuation
3. citations
4. headings
5. tables
6. figures
7. links
8. code
9. layout (widows/orphans, keeps, overflow)
10. front/back matter
11. accessibility
12. i18n
Rationale:
* Numbers/punctuation/citations most directly affect correctness, consistency, and auditability.
* Layout rules benefit from having structure and tokens in place.
## Scope parameter
`[<SCOPE>]` can constrain extraction, e.g.:
* `EXTRACT punctuation basic`
* `EXTRACT citations notes_bibliography`
* `EXTRACT numbers en_only`
* `EXTRACT layout widows_orphans`
If scope is omitted:
* extract the most generally applicable rules for that category first.
## Enforcement mapping guidelines (honest labeling)
* `lint`: detectable from AST or text normalization (e.g., spacing, punctuation patterns, citation fields).
* `typeset`: enforced via CSS/tokens/paged-media decisions (e.g., indent policy, measure targets, hyphenation params).
* `postrender`: requires layout inspection after rendering (e.g., widows/orphans, overfull lines, table clipping).
* `manual`: cannot be reliably automated; must include `tags: ["manual_checklist=true"]` and be emitted into checklist outputs.
If a concept spans multiple enforcement layers:
* Prefer splitting into two rules:
* one lint rule (source cleanliness)
* one postrender rule (layout outcome)
* Use `dependencies` to link them.
## Extraction workflow per category (repeatable)
For each category:
1. Build a topic map (subtopics, recurring failure modes).
2. Extract rules in clusters:
* MUST rules first (enforceable or checklist)
* SHOULD rules next
* WARN rules last
3. For each rule:
* add `keywords` for searchability
* add tags for overrides/routing
* add minimal `exceptions` when needed (avoid overfitting)
## “Degraded mode” considerations during extraction
For each category batch, include some rules that specifically target degraded inputs:
* hard-wrap repair suggestions
* heading inference warnings
* link sanitation and encoding fixes
* Unicode normalization notes
These rules should generally be `warn` or `should`, unless they prevent corruption (then `must`).
--- END FILE ---