1580 lines
40 KiB
Text
1580 lines
40 KiB
Text
--- FILE: spec/schema/rule.schema.json ---
|
||
{
|
||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||
"$id": "https://example.invalid/pubstyle/spec/schema/rule.schema.json",
|
||
"title": "Publication-quality rule record",
|
||
"type": "object",
|
||
"additionalProperties": false,
|
||
"required": [
|
||
"id",
|
||
"title",
|
||
"source_refs",
|
||
"category",
|
||
"severity",
|
||
"applies_to",
|
||
"rule_text",
|
||
"rationale",
|
||
"enforcement",
|
||
"autofix",
|
||
"autofix_notes",
|
||
"tags",
|
||
"keywords",
|
||
"dependencies",
|
||
"exceptions",
|
||
"status"
|
||
],
|
||
"properties": {
|
||
"id": {
|
||
"type": "string",
|
||
"description": "Stable rule identifier. Prefix must be one of CMOS, BRING, HOUSE.",
|
||
"minLength": 6,
|
||
"maxLength": 120,
|
||
"pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$"
|
||
},
|
||
"title": {
|
||
"type": "string",
|
||
"description": "Short human-readable rule title.",
|
||
"minLength": 4,
|
||
"maxLength": 160
|
||
},
|
||
"source_refs": {
|
||
"type": "array",
|
||
"description": "Pointers back to sources. Must be pointers, not quotes. Prefer: \"CMOS18 §X.Y pN\" / \"BRING §X.Y pN\" / \"HOUSE §X.Y pN\".",
|
||
"minItems": 1,
|
||
"items": {
|
||
"type": "string",
|
||
"minLength": 8,
|
||
"maxLength": 120,
|
||
"pattern": "^(CMOS18|BRING|HOUSE)\\s§[0-9A-Za-z][0-9A-Za-z.\\-]*\\sp[0-9ivxlcdmIVXLCDM]+(?:-[0-9ivxlcdmIVXLCDM]+)?(?:\\s\\(scan p[0-9]+\\))?$"
|
||
}
|
||
},
|
||
"category": {
|
||
"type": "string",
|
||
"description": "Primary taxonomy bucket.",
|
||
"enum": [
|
||
"editorial",
|
||
"typography",
|
||
"layout",
|
||
"headings",
|
||
"citations",
|
||
"numbers",
|
||
"punctuation",
|
||
"abbreviations",
|
||
"links",
|
||
"tables",
|
||
"figures",
|
||
"code",
|
||
"frontmatter",
|
||
"backmatter",
|
||
"accessibility",
|
||
"i18n"
|
||
]
|
||
},
|
||
"severity": {
|
||
"type": "string",
|
||
"description": "Normativity level. MUST blocks release unless downgraded by profile.",
|
||
"enum": ["must", "should", "warn"]
|
||
},
|
||
"applies_to": {
|
||
"type": "string",
|
||
"description": "Which pipeline stage(s) the rule targets.",
|
||
"enum": ["md", "html", "pdf", "all"]
|
||
},
|
||
"rule_text": {
|
||
"type": "string",
|
||
"description": "Paraphrased rule statement (no long quotes). If exact wording matters, note: \"Exact wording required—refer to pointer\".",
|
||
"minLength": 10,
|
||
"maxLength": 800
|
||
},
|
||
"rationale": {
|
||
"type": "string",
|
||
"description": "One-line rationale.",
|
||
"minLength": 5,
|
||
"maxLength": 200
|
||
},
|
||
"enforcement": {
|
||
"type": "string",
|
||
"description": "Primary enforcement mechanism.",
|
||
"enum": ["lint", "typeset", "postrender", "manual"]
|
||
},
|
||
"autofix": {
|
||
"type": "string",
|
||
"description": "Autofix capability, if any.",
|
||
"enum": ["none", "rewrite", "reflow", "suggest"]
|
||
},
|
||
"autofix_notes": {
|
||
"type": "string",
|
||
"description": "Notes describing what can be fixed and how/when. Keep short; never include book quotes.",
|
||
"maxLength": 400
|
||
},
|
||
"tags": {
|
||
"type": "array",
|
||
"description": "Compact labels for routing/search/overrides (e.g., 'manual_checklist=true', 'widows_orphans', 'hyphenation').",
|
||
"items": {
|
||
"type": "string",
|
||
"minLength": 1,
|
||
"maxLength": 48,
|
||
"pattern": "^[a-z0-9][a-z0-9_.:\\-/]*(?:=[a-z0-9_.:\\-/]+)?$"
|
||
},
|
||
"maxItems": 64
|
||
},
|
||
"keywords": {
|
||
"type": "array",
|
||
"description": "Search keywords (human-oriented; not necessarily normalized).",
|
||
"items": {
|
||
"type": "string",
|
||
"minLength": 2,
|
||
"maxLength": 48
|
||
},
|
||
"maxItems": 64
|
||
},
|
||
"dependencies": {
|
||
"type": "array",
|
||
"description": "Rule IDs that should be applied/understood first.",
|
||
"items": {
|
||
"type": "string",
|
||
"pattern": "^(CMOS|BRING|HOUSE)\\.[A-Z0-9_]+(?:\\.[A-Z0-9_]+)*$"
|
||
},
|
||
"maxItems": 32
|
||
},
|
||
"exceptions": {
|
||
"type": "array",
|
||
"description": "Free-text exceptions/caveats. Keep concise.",
|
||
"items": {
|
||
"type": "string",
|
||
"minLength": 3,
|
||
"maxLength": 240
|
||
},
|
||
"maxItems": 32
|
||
},
|
||
"examples_ref": {
|
||
"type": "array",
|
||
"description": "Optional references to separately stored examples (see spec/examples/README.md).",
|
||
"items": {
|
||
"type": "string",
|
||
"minLength": 6,
|
||
"maxLength": 80,
|
||
"pattern": "^EX\\.[A-Z0-9_]+\\.[A-Z0-9_]+\\.[0-9]{3,}$"
|
||
},
|
||
"maxItems": 64
|
||
},
|
||
"implementation_notes": {
|
||
"type": "string",
|
||
"description": "Optional short notes for implementers (no quotes).",
|
||
"minLength": 3,
|
||
"maxLength": 600
|
||
},
|
||
"status": {
|
||
"type": "string",
|
||
"description": "Lifecycle state.",
|
||
"enum": ["draft", "active", "deprecated"]
|
||
}
|
||
},
|
||
"allOf": [
|
||
{
|
||
"if": {
|
||
"properties": {
|
||
"autofix": { "enum": ["rewrite", "reflow", "suggest"] }
|
||
},
|
||
"required": ["autofix"]
|
||
},
|
||
"then": {
|
||
"properties": {
|
||
"autofix_notes": { "minLength": 1 }
|
||
}
|
||
}
|
||
}
|
||
]
|
||
}
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/manifest.yaml ---
|
||
version: "0.1.0"
|
||
registry_id: "pubstyle"
|
||
description: >
|
||
Machine-readable style+typesetting rules for a Markdown→HTML→PDF pipeline,
|
||
backed by primary references (Chicago / Bringhurst) and optional house rules.
|
||
Rules are paraphrases only; sources are referenced by pointer strings.
|
||
|
||
id_naming:
|
||
prefixes:
|
||
CMOS: "Editorial/style usage rules derived primarily from Chicago."
|
||
BRING: "Typographic/layout rules derived primarily from Bringhurst."
|
||
HOUSE: "Project-specific rules not directly sourced to Chicago/Bringhurst."
|
||
pattern: "PREFIX.DOMAIN.TOPIC[.SUBTOPIC[.DETAIL...]]"
|
||
delimiter: "."
|
||
casing: "UPPER_SNAKE for segments"
|
||
stability:
|
||
rule_ids_are_immutable: true
|
||
rename_policy: "Deprecate old id; introduce new id; keep mapping in report diffs."
|
||
examples:
|
||
- "CMOS.PUNCTUATION.DASHES.EM_DASH"
|
||
- "BRING.LAYOUT.WIDOWS_ORPHANS.AVOID"
|
||
- "HOUSE.CITATIONS.DOI.PREFER_HTTPS"
|
||
|
||
source_pointer_scheme:
|
||
goal: "Provide auditable traceability without reproducing sources."
|
||
pointer_format_primary: "CMOS18 §<section> p<book_page>"
|
||
pointer_format_secondary: "BRING §<section> p<book_page>"
|
||
pointer_format_house: "HOUSE §<section> p<doc_page>"
|
||
optional_scan_hint: "(scan p<pdf_page_index>)"
|
||
allowed_page_numbering: ["arabic", "roman"]
|
||
notes:
|
||
- "Pointers must be sufficient for a reader with the book to locate the guidance."
|
||
- "Never store verbatim passages; paraphrase only."
|
||
- "If a rule depends on exact wording, rule_text must say: 'Exact wording required—refer to pointer'."
|
||
|
||
category_taxonomy:
|
||
|
||
- editorial
|
||
- typography
|
||
- layout
|
||
- headings
|
||
- citations
|
||
- numbers
|
||
- punctuation
|
||
- abbreviations
|
||
- links
|
||
- tables
|
||
- figures
|
||
- code
|
||
- frontmatter
|
||
- backmatter
|
||
- accessibility
|
||
- i18n
|
||
|
||
profiles:
|
||
|
||
- web_pdf
|
||
- print_pdf
|
||
- dense_tech
|
||
- memo
|
||
- slide_deck
|
||
|
||
planned_rule_counts:
|
||
target_total_range: [800, 1500]
|
||
target_by_category:
|
||
editorial: 120
|
||
typography: 170
|
||
layout: 140
|
||
headings: 70
|
||
citations: 140
|
||
numbers: 90
|
||
punctuation: 120
|
||
abbreviations: 60
|
||
links: 50
|
||
tables: 60
|
||
figures: 50
|
||
code: 70
|
||
frontmatter: 40
|
||
backmatter: 40
|
||
accessibility: 90
|
||
i18n: 60
|
||
|
||
coverage_contract:
|
||
must_rules:
|
||
enforceability_requirement: >
|
||
Every MUST rule must be enforceable by at least one of: lint, typeset, postrender;
|
||
otherwise it must be explicitly labeled as a manual checklist item and emitted in
|
||
a checklist output artifact.
|
||
manual_checklist_tag: "manual_checklist=true"
|
||
checklist_artifact: "manual-checklist.md (and JSON mirror)"
|
||
should_rules:
|
||
policy: "SHOULD rules are expected to be enforceable when practical; otherwise they are allowed as manual with explicit rationale."
|
||
warn_rules:
|
||
policy: "Warnings may be non-blocking and advisory; still require source pointers."
|
||
enforcement_definitions:
|
||
lint: "Static analysis over normalized Markdown/HTML AST. Deterministic."
|
||
typeset: "CSS/tokens shaping decisions prior to rendering (pagination, keeps, hyphenation parameters)."
|
||
postrender: "PDF/HTML layout inspection (widows/orphans, overflow, keep failures, numbering mismatches)."
|
||
manual: "Human review; system must still produce checklist items and traceability pointers."
|
||
ci_guardrails:
|
||
coverage_floor:
|
||
must_implemented_min_percent: 95
|
||
overall_implemented_min_percent: 80
|
||
regression_rule: "CI fails if implemented coverage decreases from main branch."
|
||
|
||
degraded_mode_contract:
|
||
purpose: "Handle badly-structured inputs safely without crashing; still provide useful output."
|
||
triggers:
|
||
- "Markdown parse errors / invalid UTF-8"
|
||
- "Missing heading hierarchy (no H1/H2 etc.)"
|
||
- "Garbage extraction (e.g., line breaks every word, excessive hard wraps)"
|
||
- "Mixed language with no lang metadata"
|
||
behavior:
|
||
normalize:
|
||
attempt_repairs:
|
||
- "Normalize whitespace and line endings"
|
||
- "Detect and unwrap hard-wrapped paragraphs heuristically"
|
||
- "Infer heading levels from patterns (e.g., '1.', '1.1', ALL CAPS lines) with low confidence"
|
||
if_unrecoverable:
|
||
- "Fall back to minimal AST: paragraphs + code blocks + raw spans"
|
||
- "Mark document structure confidence = low"
|
||
enforcement_in_degraded_mode:
|
||
lint:
|
||
run_subset: ["safety", "sanity", "catastrophic typography (double spaces, broken links)"]
|
||
downgrade_some_must_to_warn: true
|
||
typeset:
|
||
use_fallback_tokens: true
|
||
disable_aggressive_hyphenation: true
|
||
postrender:
|
||
run_core_gates_only: ["overfull_lines", "table_overflow_incidents", "code_overflow_incidents"]
|
||
reporting:
|
||
always_emit:
|
||
- "layout-report.json"
|
||
- "coverage-report.json"
|
||
- "degraded-mode-report.json (what was inferred and why)"
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/profiles/web_pdf.yaml ---
|
||
profile_id: "web_pdf"
|
||
description: "Screen-first PDF for sharing and reading; conservative pagination and strong accessibility defaults."
|
||
|
||
page:
|
||
size: "Letter"
|
||
orientation: "portrait"
|
||
two_sided: false
|
||
margins:
|
||
top: "22mm"
|
||
bottom: "22mm"
|
||
inner: "20mm"
|
||
outer: "20mm"
|
||
|
||
fonts:
|
||
body:
|
||
family: ["Noto Serif", "STIX Two Text", "Times New Roman", "serif"]
|
||
size: "11pt"
|
||
line_height: 1.45
|
||
heading:
|
||
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
|
||
mono:
|
||
family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"]
|
||
size: "10pt"
|
||
line_height: 1.35
|
||
|
||
measure_targets:
|
||
columns: 1
|
||
body_chars_per_line:
|
||
min: 55
|
||
ideal: 66
|
||
max: 75
|
||
footnote_chars_per_line:
|
||
min: 50
|
||
ideal: 60
|
||
max: 70
|
||
|
||
hyphenation:
|
||
enabled: true
|
||
strategy: "balanced"
|
||
language_driven: true
|
||
min_left: 2
|
||
min_right: 3
|
||
max_consecutive_hyphenated_lines: 2
|
||
avoid_proper_names_when_possible: true
|
||
avoid_after_short_lines: true
|
||
|
||
paragraphs:
|
||
first_paragraph_indent: "0"
|
||
indent: "1em"
|
||
block_paragraph_spacing: "0.6em"
|
||
|
||
headings:
|
||
keep_with_next_lines: 2
|
||
avoid_stranded_headings: true
|
||
numbering:
|
||
enabled: true
|
||
style: "decimal"
|
||
require_monotonic_increase: true
|
||
|
||
widows_orphans:
|
||
widow_lines: 2
|
||
orphan_lines: 2
|
||
balance_facing_pages: false
|
||
|
||
code:
|
||
inline:
|
||
use_mono: true
|
||
block:
|
||
font_size: "9.5pt"
|
||
line_height: 1.35
|
||
wrap: true
|
||
max_wrap_penalty: "medium"
|
||
overflow_policy: "wrap_then_shrink_minor"
|
||
shrink_limit: 0.92
|
||
|
||
tables:
|
||
cell_padding: "3pt 6pt"
|
||
header_repeat: true
|
||
overflow_policy: "shrink_then_wrap"
|
||
shrink_limit: 0.9
|
||
|
||
severity_overrides:
|
||
|
||
- selector: { category: "layout", tag: "widows_orphans" }
|
||
  severity: "should"
|
||
- selector: { category: "accessibility" }
|
||
  severity: "must"
|
||
|
||
locale_defaults:
|
||
primary_language: "en"
|
||
fallback_languages: ["fr"]
|
||
quotation_style: "us" # curly quotes, US punctuation conventions
|
||
date_format: "YYYY-MM-DD"
|
||
number_format:
|
||
decimal_separator: "."
|
||
thousands_separator: ","
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/profiles/print_pdf.yaml ---
|
||
profile_id: "print_pdf"
|
||
description: "Print-oriented PDF with stricter pagination, book-like rhythm, and stronger keep constraints."
|
||
|
||
page:
|
||
size: "6in×9in"
|
||
orientation: "portrait"
|
||
two_sided: true
|
||
margins:
|
||
top: "18mm"
|
||
bottom: "20mm"
|
||
inner: "22mm"
|
||
outer: "18mm"
|
||
|
||
fonts:
|
||
body:
|
||
family: ["STIX Two Text", "Noto Serif", "Georgia", "serif"]
|
||
size: "10.5pt"
|
||
line_height: 1.50
|
||
heading:
|
||
family: ["STIX Two Text", "Noto Serif", "serif"]
|
||
mono:
|
||
family: ["Noto Sans Mono", "Source Code Pro", "Consolas", "monospace"]
|
||
size: "9.5pt"
|
||
line_height: 1.30
|
||
|
||
measure_targets:
|
||
columns: 1
|
||
body_chars_per_line:
|
||
min: 55
|
||
ideal: 66
|
||
max: 72
|
||
|
||
hyphenation:
|
||
enabled: true
|
||
strategy: "print_quality"
|
||
min_left: 2
|
||
min_right: 3
|
||
max_consecutive_hyphenated_lines: 2
|
||
avoid_proper_names_when_possible: true
|
||
|
||
paragraphs:
|
||
first_paragraph_indent: "0"
|
||
indent: "1em"
|
||
|
||
headings:
|
||
keep_with_next_lines: 3
|
||
avoid_stranded_headings: true
|
||
numbering:
|
||
enabled: true
|
||
style: "decimal"
|
||
require_monotonic_increase: true
|
||
|
||
widows_orphans:
|
||
widow_lines: 2
|
||
orphan_lines: 2
|
||
balance_facing_pages: true
|
||
|
||
code:
|
||
block:
|
||
font_size: "9pt"
|
||
line_height: 1.25
|
||
wrap: false
|
||
overflow_policy: "shrink_then_scroll_indicator"
|
||
shrink_limit: 0.90
|
||
|
||
tables:
|
||
cell_padding: "2.5pt 5pt"
|
||
header_repeat: true
|
||
overflow_policy: "shrink_then_rotate_if_allowed"
|
||
shrink_limit: 0.88
|
||
|
||
severity_overrides:
|
||
|
||
- selector: { category: "layout", tag: "widows_orphans" }
|
||
  severity: "must"
|
||
- selector: { category: "layout", tag: "keep_constraints" }
|
||
  severity: "must"
|
||
- selector: { category: "typography", tag: "spacing_consistency" }
|
||
  severity: "must"
|
||
|
||
locale_defaults:
|
||
primary_language: "en"
|
||
fallback_languages: ["fr"]
|
||
quotation_style: "us"
|
||
date_format: "Month D, YYYY"
|
||
number_format:
|
||
decimal_separator: "."
|
||
thousands_separator: ","
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/profiles/dense_tech.yaml ---
|
||
profile_id: "dense_tech"
|
||
description: "Technical papers and specs: denser copy, more code/table tolerance, strict numbering and citations."
|
||
|
||
page:
|
||
size: "A4"
|
||
orientation: "portrait"
|
||
two_sided: false
|
||
margins:
|
||
top: "18mm"
|
||
bottom: "18mm"
|
||
inner: "18mm"
|
||
outer: "18mm"
|
||
|
||
fonts:
|
||
body:
|
||
family: ["Noto Serif", "STIX Two Text", "serif"]
|
||
size: "10pt"
|
||
line_height: 1.35
|
||
heading:
|
||
family: ["Noto Sans", "Source Sans 3", "sans-serif"]
|
||
mono:
|
||
family: ["Noto Sans Mono", "Source Code Pro", "monospace"]
|
||
size: "9pt"
|
||
line_height: 1.25
|
||
|
||
measure_targets:
|
||
columns: 1
|
||
body_chars_per_line:
|
||
min: 65
|
||
ideal: 75
|
||
max: 90
|
||
|
||
hyphenation:
|
||
enabled: true
|
||
strategy: "balanced"
|
||
min_left: 2
|
||
min_right: 3
|
||
max_consecutive_hyphenated_lines: 3
|
||
avoid_proper_names_when_possible: true
|
||
|
||
headings:
|
||
keep_with_next_lines: 2
|
||
avoid_stranded_headings: true
|
||
numbering:
|
||
enabled: true
|
||
style: "decimal"
|
||
require_monotonic_increase: true
|
||
|
||
widows_orphans:
|
||
widow_lines: 2
|
||
orphan_lines: 2
|
||
balance_facing_pages: false
|
||
|
||
code:
|
||
block:
|
||
font_size: "8.8pt"
|
||
line_height: 1.20
|
||
wrap: true
|
||
overflow_policy: "wrap_then_shrink_minor"
|
||
shrink_limit: 0.90
|
||
|
||
tables:
|
||
cell_padding: "2pt 4pt"
|
||
header_repeat: true
|
||
overflow_policy: "shrink_then_wrap"
|
||
shrink_limit: 0.85
|
||
|
||
severity_overrides:
|
||
|
||
- selector: { category: "citations" }
|
||
  severity: "must"
|
||
- selector: { category: "headings", tag: "numbering" }
|
||
  severity: "must"
|
||
- selector: { category: "layout", tag: "widows_orphans" }
|
||
  severity: "should"
|
||
|
||
locale_defaults:
|
||
primary_language: "en"
|
||
fallback_languages: ["fr"]
|
||
quotation_style: "us"
|
||
date_format: "YYYY-MM-DD"
|
||
number_format:
|
||
decimal_separator: "."
|
||
thousands_separator: ","
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/profiles/memo.yaml ---
|
||
profile_id: "memo"
|
||
description: "Short internal documents: lenient pagination, strong clarity, minimal typographic complexity."
|
||
|
||
page:
|
||
size: "Letter"
|
||
orientation: "portrait"
|
||
two_sided: false
|
||
margins:
|
||
top: "1in"
|
||
bottom: "1in"
|
||
inner: "1in"
|
||
outer: "1in"
|
||
|
||
fonts:
|
||
body:
|
||
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
|
||
size: "11pt"
|
||
line_height: 1.40
|
||
heading:
|
||
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
|
||
mono:
|
||
family: ["Noto Sans Mono", "Consolas", "monospace"]
|
||
size: "10pt"
|
||
line_height: 1.30
|
||
|
||
measure_targets:
|
||
columns: 1
|
||
body_chars_per_line:
|
||
min: 55
|
||
ideal: 70
|
||
max: 85
|
||
|
||
hyphenation:
|
||
enabled: false
|
||
strategy: "off_for_memos"
|
||
|
||
headings:
|
||
keep_with_next_lines: 2
|
||
avoid_stranded_headings: true
|
||
numbering:
|
||
enabled: false
|
||
|
||
widows_orphans:
|
||
widow_lines: 1
|
||
orphan_lines: 1
|
||
balance_facing_pages: false
|
||
|
||
code:
|
||
block:
|
||
font_size: "9.5pt"
|
||
line_height: 1.25
|
||
wrap: true
|
||
overflow_policy: "wrap"
|
||
shrink_limit: 1.0
|
||
|
||
tables:
|
||
cell_padding: "3pt 6pt"
|
||
header_repeat: false
|
||
overflow_policy: "wrap"
|
||
shrink_limit: 1.0
|
||
|
||
severity_overrides:
|
||
|
||
- selector: { category: "layout", tag: "widows_orphans" }
|
||
  severity: "warn"
|
||
- selector: { category: "accessibility" }
|
||
  severity: "must"
|
||
|
||
locale_defaults:
|
||
primary_language: "en"
|
||
fallback_languages: ["fr"]
|
||
quotation_style: "us"
|
||
date_format: "YYYY-MM-DD"
|
||
number_format:
|
||
decimal_separator: "."
|
||
thousands_separator: ","
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/profiles/slide_deck.yaml ---
|
||
profile_id: "slide_deck"
|
||
description: "Paged slides (16:9). Emphasis on hierarchy, short lines, and avoiding overflows."
|
||
|
||
page:
|
||
size: "13.333in×7.5in" # 16:9 at common PPT dimensions
|
||
orientation: "landscape"
|
||
two_sided: false
|
||
margins:
|
||
top: "0.5in"
|
||
bottom: "0.5in"
|
||
inner: "0.6in"
|
||
outer: "0.6in"
|
||
|
||
fonts:
|
||
body:
|
||
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
|
||
size: "24pt"
|
||
line_height: 1.15
|
||
heading:
|
||
family: ["Noto Sans", "Source Sans 3", "Arial", "sans-serif"]
|
||
mono:
|
||
family: ["Noto Sans Mono", "Consolas", "monospace"]
|
||
size: "20pt"
|
||
line_height: 1.10
|
||
|
||
measure_targets:
|
||
columns: 1
|
||
body_chars_per_line:
|
||
min: 25
|
||
ideal: 40
|
||
max: 55
|
||
|
||
hyphenation:
|
||
enabled: false
|
||
strategy: "off_for_slides"
|
||
|
||
headings:
|
||
keep_with_next_lines: 1
|
||
avoid_stranded_headings: true
|
||
numbering:
|
||
enabled: false
|
||
|
||
widows_orphans:
|
||
widow_lines: 1
|
||
orphan_lines: 1
|
||
balance_facing_pages: false
|
||
|
||
code:
|
||
block:
|
||
font_size: "18pt"
|
||
line_height: 1.10
|
||
wrap: true
|
||
overflow_policy: "wrap_then_shrink_minor"
|
||
shrink_limit: 0.92
|
||
|
||
tables:
|
||
cell_padding: "6pt 10pt"
|
||
header_repeat: false
|
||
overflow_policy: "shrink_then_wrap"
|
||
shrink_limit: 0.88
|
||
|
||
severity_overrides:
|
||
|
||
- selector: { category: "layout", tag: "overflow" }
|
||
  severity: "must"
|
||
- selector: { category: "accessibility" }
|
||
  severity: "must"
|
||
|
||
locale_defaults:
|
||
primary_language: "en"
|
||
fallback_languages: ["fr"]
|
||
quotation_style: "us"
|
||
date_format: "YYYY-MM-DD"
|
||
number_format:
|
||
decimal_separator: "."
|
||
thousands_separator: ","
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/quality_gates.yaml ---
|
||
version: "0.1.0"
|
||
description: >
|
||
Post-render QA gates. All thresholds are hard numeric limits used to fail builds
|
||
(unless a gate is explicitly marked as "warn-only" by the invoking CLI flags).
|
||
|
||
metrics:
|
||
max_widows_per_10_pages: "Count of widow lines across any 10 consecutive pages."
|
||
max_orphans_per_10_pages: "Count of orphan lines across any 10 consecutive pages."
|
||
max_stranded_headings: "Count of headings at page bottom with insufficient following content per keep rule."
|
||
max_overfull_lines: "Count of lines exceeding measure by overflow threshold (render-time measured)."
|
||
max_table_overflow_incidents: "Count of tables that overflow page/column bounds or are clipped."
|
||
max_code_overflow_incidents: "Count of code blocks with horizontal overflow or clipping."
|
||
max_link_wrap_incidents: "Count of wrapped URLs/DOIs/emails violating link wrap policy."
|
||
max_heading_numbering_errors: "Count of numbering sequence/format violations."
|
||
max_citation_format_errors: "Count of citations not matching configured style format."
|
||
|
||
overflow_detection:
|
||
overfull_line_threshold_css_px: 1.0
|
||
consider_clipping_as_overflow: true
|
||
ignore_decorative_elements: true
|
||
|
||
profiles:
|
||
web_pdf:
|
||
default:
|
||
max_widows_per_10_pages: 1
|
||
max_orphans_per_10_pages: 1
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 2
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 1
|
||
max_link_wrap_incidents: 2
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
strict:
|
||
max_widows_per_10_pages: 0
|
||
max_orphans_per_10_pages: 0
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 0
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 0
|
||
max_link_wrap_incidents: 0
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
|
||
print_pdf:
|
||
default:
|
||
max_widows_per_10_pages: 0
|
||
max_orphans_per_10_pages: 0
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 0
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 0
|
||
max_link_wrap_incidents: 0
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
strict:
|
||
max_widows_per_10_pages: 0
|
||
max_orphans_per_10_pages: 0
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 0
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 0
|
||
max_link_wrap_incidents: 0
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
|
||
dense_tech:
|
||
default:
|
||
max_widows_per_10_pages: 1
|
||
max_orphans_per_10_pages: 1
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 3
|
||
max_table_overflow_incidents: 1
|
||
max_code_overflow_incidents: 2
|
||
max_link_wrap_incidents: 3
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
strict:
|
||
max_widows_per_10_pages: 0
|
||
max_orphans_per_10_pages: 0
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 1
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 0
|
||
max_link_wrap_incidents: 1
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
|
||
memo:
|
||
default:
|
||
max_widows_per_10_pages: 3
|
||
max_orphans_per_10_pages: 3
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 2
|
||
max_table_overflow_incidents: 1
|
||
max_code_overflow_incidents: 1
|
||
max_link_wrap_incidents: 4
|
||
max_heading_numbering_errors: 1
|
||
max_citation_format_errors: 1
|
||
strict:
|
||
max_widows_per_10_pages: 1
|
||
max_orphans_per_10_pages: 1
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 0
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 0
|
||
max_link_wrap_incidents: 2
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
|
||
slide_deck:
|
||
default:
|
||
max_widows_per_10_pages: 5
|
||
max_orphans_per_10_pages: 5
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 0
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 0
|
||
max_link_wrap_incidents: 0
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 1
|
||
strict:
|
||
max_widows_per_10_pages: 2
|
||
max_orphans_per_10_pages: 2
|
||
max_stranded_headings: 0
|
||
max_overfull_lines: 0
|
||
max_table_overflow_incidents: 0
|
||
max_code_overflow_incidents: 0
|
||
max_link_wrap_incidents: 0
|
||
max_heading_numbering_errors: 0
|
||
max_citation_format_errors: 0
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/indexes/README.md ---
|
||
|
||
# Indexes
|
||
|
||
This project builds small, fast indexes so the runtime can answer questions like:
|
||
|
||
* “Which rules mention *en dash*?”
|
||
* “Which rules cite *CMOS18 §6.88 p412*?”
|
||
* “Which rules apply to `postrender` QA?”
|
||
* “What rules are overridden by the `print_pdf` profile?”
|
||
|
||
Indexes are derived artifacts (rebuildable) and should not be hand-edited.
|
||
|
||
## Indexes the app will build
|
||
|
||
### 1) keyword → rule IDs
|
||
|
||
**Purpose:** fast search/autocomplete and lint explanations.
|
||
|
||
* **Path:** `spec/indexes/keywords_all.json` and per-category deltas:
|
||
|
||
* `spec/indexes/keywords_<category>.json`
|
||
* **Format (JSON):**
|
||
|
||
* keys: normalized keyword (lowercased)
|
||
* values: array of rule IDs sorted stable (lexicographic)
|
||
|
||
Normalization (default):
|
||
|
||
* Unicode NFKC
|
||
* lowercase
|
||
* collapse whitespace
|
||
* strip surrounding punctuation
|
||
|
||
### 2) source_ref → rule IDs
|
||
|
||
**Purpose:** audit trail back to references without embedding book text.
|
||
|
||
* **Path:** `spec/indexes/source_refs_all.json` and per-category deltas:
|
||
|
||
* `spec/indexes/source_refs_<category>.json`
|
||
* **Format (JSON):**
|
||
|
||
* keys: exact `source_ref` pointer strings
|
||
* values: array of rule IDs
|
||
|
||
### 3) category → rule IDs
|
||
|
||
**Purpose:** batch reporting, extraction coverage, profile scoping.
|
||
|
||
* **Path:** `spec/indexes/category.json`
|
||
* **Format (JSON):**
|
||
|
||
* keys: category name
|
||
* values: array of rule IDs
|
||
|
||
### 4) enforcement → rule IDs
|
||
|
||
**Purpose:** quickly decide which engine (lint/typeset/postrender/manual) handles which rules.
|
||
|
||
* **Path:** `spec/indexes/enforcement.json`
|
||
|
||
### 5) profile overrides
|
||
|
||
**Purpose:** allow profiles to override severity or token parameters without editing rules.
|
||
|
||
* **Path:** `spec/indexes/profile_overrides.json`
|
||
* **Format (JSON):**
|
||
|
||
* per profile: list of override objects (selector + action)
|
||
* selectors may match category, tags, applies_to, or explicit rule IDs
|
||
|
||
## Build guarantees
|
||
|
||
* Index builds are deterministic from:
|
||
|
||
* `spec/rules/**/*.ndjson`
|
||
* `spec/profiles/*.yaml`
|
||
* `spec/manifest.yaml`
|
||
|
||
* The runtime must treat indexes as **cacheable**:
|
||
|
||
* if index missing/outdated → rebuild or fallback to scanning rule files.
|
||
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/examples/README.md ---
|
||
|
||
# Examples
|
||
|
||
Rules stay compact and machine-enforceable; examples live separately to avoid bloating the rule registry.
|
||
|
||
## Goals
|
||
|
||
* Provide **concrete fixtures** for:
|
||
|
||
* unit tests (lint, autofix, typeset transforms)
|
||
* integration tests (render + QA gates)
|
||
* documentation (human-readable “why this matters”)
|
||
|
||
* Keep examples **small** (a few lines) and **targeted** (each example triggers a known set of rules).
|
||
|
||
## Example ID format
|
||
|
||
`EX.<CATEGORY>.<TOPIC>.<NNN>`
|
||
|
||
* `CATEGORY` must be the uppercased form of a category from the taxonomy (e.g., `PUNCTUATION`, `NUMBERS`, `CITATIONS`)
|
||
* `TOPIC` is an uppercase short slug
|
||
* `NNN` is a zero-padded integer (000–999+)
|
||
|
||
Example:
|
||
|
||
* `EX.PUNCTUATION.DASHES.001`
|
||
|
||
## Suggested on-disk layout
|
||
|
||
* `spec/examples/<category>/EX.<CATEGORY>.<TOPIC>.<NNN>.yaml`
|
||
* `spec/examples/<category>/fixtures/<name>.md` (optional)
|
||
|
||
## Example YAML format (recommended)
|
||
|
||
Fields:
|
||
|
||
* `id` (required): example ID
|
||
* `rules` (required): list of rule IDs the example is meant to exercise
|
||
* `before` (required): inline Markdown or a reference to a fixture file
|
||
* `after` (optional): expected Markdown after autofix (if autofix exists)
|
||
* `expected` (optional): expected diagnostics/gates
|
||
|
||
* `lint_errors`: array of rule IDs expected as errors
|
||
* `lint_warnings`: array of rule IDs expected as warnings
|
||
* `qa_failures`: array of gate keys expected to fail
|
||
* `notes` (optional): short human explanation (no book quotes)
|
||
|
||
Minimal example skeleton:
|
||
|
||
```yaml
- id: EX.PUNCTUATION.DASHES.001
  rules:
    - CMOS.PUNCTUATION.DASHES.EM_DASH
  before: |
    ...
  after: |
    ...
  expected:
    lint_errors: [CMOS.PUNCTUATION.DASHES.EM_DASH]
```
|
||
|
||
## Test corpus strategy
|
||
|
||
Maintain a small, curated corpus that triggers:
|
||
|
||
1. Lint-only issues (AST-level)
|
||
|
||
* punctuation spacing
|
||
* numeral formatting
|
||
* heading numbering patterns
|
||
* link normalization / unsafe URLs
|
||
* citation field completeness
|
||
|
||
2. Typeset-only issues (token/CSS decisions)
|
||
|
||
* paragraph indentation patterns
|
||
* code block wrapping rules
|
||
* table overflow strategies
|
||
|
||
3. Post-render QA issues (PDF/HTML layout)
|
||
|
||
* widows/orphans
|
||
* stranded headings (keep-with-next)
|
||
* overfull lines (especially monospace/code)
|
||
* table/caption overflow and clipping
|
||
|
||
Recommended corpus sizing:
|
||
|
||
* 30–80 fixtures total
|
||
* each fixture should target 3–10 rules max
|
||
* include “degraded mode” fixtures (intentionally malformed Markdown)
|
||
|
||
--- END FILE ---
|
||
|
||
--- FILE: app/ARCHITECTURE.md ---
|
||
|
||
# Runtime Architecture
|
||
|
||
This is a thin, deterministic runtime that:
|
||
|
||
A) ingests Markdown → normalizes a document AST → applies editorial lint (Chicago-derived)
|
||
B) applies typeset tokens/profile (Bringhurst-derived)
|
||
C) renders HTML and PDF deterministically
|
||
D) runs post-render QA gates (widows/orphans, heading keeps, overflow)
|
||
E) generates `layout-report.json` and fails builds when thresholds are exceeded
|
||
|
||
Primary reference PDFs provided to the system (for pointer-based rules and traceability only):
|
||
|
||
* The Chicago Manual of Style (18th ed).pdf
|
||
* Robert Bringhurst – The Elements of Typographic Style.pdf
|
||
|
||
No bulk transcription is performed; rules are paraphrases and cite sources only by pointer.
|
||
|
||
## Components
|
||
|
||
### 1) Registry Loader
|
||
|
||
Inputs:
|
||
|
||
* `spec/rules/**/*.ndjson` (Phase 2 output)
|
||
* `spec/schema/rule.schema.json`
|
||
* `spec/manifest.yaml`
|
||
* `spec/profiles/*.yaml`
|
||
* `spec/quality_gates.yaml`
|
||
|
||
Responsibilities:
|
||
|
||
* validate each rule against JSON Schema
|
||
* enforce ID uniqueness and stable sorting
|
||
* build or load indexes in `spec/indexes/*.json`
|
||
* compute coverage (implemented vs unimplemented; by enforcement)
|
||
|
||
Output (in-memory):
|
||
|
||
* `RuleStore` (rules + indexes + profile overrides + gate thresholds)
|
||
|
||
### 2) Markdown Ingest + AST Normalization
|
||
|
||
Steps:
|
||
|
||
1. Parse Markdown to an AST (mdast or equivalent).
|
||
2. Normalize to a stable internal schema:
|
||
|
||
* heading levels and numbering metadata
|
||
* lists and list tight/loose semantics
|
||
* code spans/blocks with language tags
|
||
* tables (GFM) to a consistent representation
|
||
* links normalized (url, title, text)
|
||
* citations normalized (if present as syntax/extensions)
|
||
3. Produce `normalized-doc.json` for debugging and reproducibility.
|
||
|
||
Degraded mode:
|
||
|
||
* If parsing fails or structure is missing, switch to a minimal node set and mark `structure_confidence: low`.
|
||
* Run the “degraded mode contract” from `spec/manifest.yaml`.
|
||
|
||
### 3) Editorial Lint Engine
|
||
|
||
What it does:
|
||
|
||
* Runs `lint`-enforced rules against normalized AST.
|
||
* Emits diagnostics:
|
||
|
||
* `severity` (must/should/warn after profile overrides)
|
||
* `rule_id`
|
||
* location (source span) and node path
|
||
* message (generated from rule metadata + implementation hints)
|
||
|
||
Autofix:
|
||
|
||
* If a rule’s `autofix` is `rewrite` or `suggest`, produce:
|
||
|
||
* patched Markdown (rewrite) OR
|
||
* suggestion blocks with exact spans to edit (suggest)
|
||
* Autofix must be deterministic and reversible (keep a patch log).
|
||
|
||
Artifacts:
|
||
|
||
* `lint-report.json`
|
||
* `lint-report.sarif` (optional for CI UIs)
|
||
* `lint-fixed.md` (optional, if autofix applied)
|
||
|
||
### 4) Typeset Profile Engine
|
||
|
||
Goal:
|
||
|
||
* Convert “typographic intent” into deterministic render inputs:
|
||
|
||
* CSS tokens (variables)
|
||
* layout policies (widows/orphans strategy, keeps, hyphenation params)
|
||
* code/table overflow strategies
|
||
|
||
Inputs:
|
||
|
||
* normalized AST
|
||
* profile tokens from `spec/profiles/<profile>.yaml`
|
||
|
||
Outputs:
|
||
|
||
* `render.css` (tokenized CSS + paged-media rules)
|
||
* `render.html` (deterministic HTML with stable classnames/data attributes)
|
||
* `typeset-report.json` (what tokens were used, resolved font stack, measure targets)
|
||
|
||
Design principle:
|
||
|
||
* “Soft rules” are tokens; “hard rules” are lint/QA gates.
|
||
|
||
### 5) Deterministic Rendering
|
||
|
||
The runtime should treat rendering as an adapter layer.
|
||
|
||
Minimum requirement:
|
||
|
||
* Deterministic HTML generation (stable DOM order, stable IDs, stable whitespace).
|
||
* Deterministic PDF generation with pinned renderer/version and embedded fonts when possible.
|
||
|
||
Adapter concept:
|
||
|
||
* `RendererHTML`: emits HTML+CSS.
|
||
* `RendererPDF`: converts HTML+CSS to PDF using a configured engine.
|
||
|
||
Recommended renderer capabilities:
|
||
|
||
* CSS Paged Media support (page size/margins, running headers, footnotes if used)
|
||
* hyphenation dictionaries
|
||
* font embedding/subsetting
|
||
|
||
Artifacts:
|
||
|
||
* `out/<doc>.html`
|
||
* `out/<doc>.pdf`
|
||
* `out/render-log.json` (versions, timings, warnings)
|
||
|
||
### 6) Post-render QA Analyzer
|
||
|
||
Runs on:
|
||
|
||
* PDF (preferred for final layout truth) and optionally HTML.
|
||
|
||
Detects:
|
||
|
||
* widows/orphans (by paragraph line runs across pages)
|
||
* stranded headings (heading at bottom violating keep-with-next)
|
||
* overfull lines (glyph boxes exceed text block)
|
||
* table overflow/clipping
|
||
* code overflow/clipping
|
||
* link wrap incidents (URLs/DOIs split against policy)
|
||
* heading numbering errors (cross-check against AST numbering)
|
||
* citation format errors (cross-check against configured citation style)
|
||
|
||
Artifacts:
|
||
|
||
* `layout-report.json` (the canonical QA report)
|
||
* `qa-report.json` (gate evaluation + failures + excerpts as coordinates, not text)
|
||
|
||
Fail behavior:
|
||
|
||
* Compare measured metrics to `spec/quality_gates.yaml` for the chosen profile.
|
||
* Exit non-zero if any MUST-equivalent gate fails; when `--strict` is chosen, gates are evaluated against the strict thresholds instead.
|
||
|
||
## Coverage Reporting and CI Guardrails
|
||
|
||
Coverage is computed from:
|
||
|
||
* total active rules
|
||
* rules with an implemented enforcement handler:
|
||
|
||
* lint implemented if rule_id has an evaluator in lint engine
|
||
* typeset implemented if token/policy exists and is applied deterministically
|
||
* postrender implemented if analyzer has a detector for that rule/tag
|
||
* manual implemented if checklist output includes it
|
||
|
||
Artifacts:
|
||
|
||
* `coverage-report.json` (counts by category, enforcement, severity, profile)
|
||
* `coverage-diff.json` (compares to baseline on main branch)
|
||
|
||
CI policy (from manifest):
|
||
|
||
* fail if MUST coverage drops
|
||
* fail if overall implemented coverage drops
|
||
* fail if rule IDs changed without deprecation mapping
|
||
|
||
## Assumptions (Phase 1 defaults)
|
||
|
||
1. The pipeline targets a CSS-based HTML→PDF engine (paged media capable).
|
||
2. Default language is `en` with optional `fr` fallback.
|
||
3. Citation style defaults to a Chicago-aligned style, but the registry will encode the exact variant in rules (Notes/Bibliography vs Author-Date) during extraction.
|
||
4. Fonts default to Noto/STIX families for broad coverage and consistent embedding, but can be overridden per profile.
|
||
|
||
No questions are strictly required to proceed with Phase 2 extraction; these assumptions can be adjusted via profiles and house rules.
|
||
|
||
--- END FILE ---
|
||
|
||
--- FILE: app/CLI_SPEC.md ---
|
||
|
||
# CLI Specification
|
||
|
||
The CLI is designed for CI use: deterministic outputs, stable exit codes, and JSON artifacts for tooling.
|
||
|
||
## Common flags (all commands)
|
||
|
||
* `--input <path>`: Markdown file or directory.
|
||
* `--out <dir>`: Output directory (default: `out/`).
|
||
* `--rules <dir>`: Rules root directory (default: `spec/rules/`).
|
||
* `--profile <name>`: One of: `web_pdf`, `print_pdf`, `dense_tech`, `memo`, `slide_deck`.
|
||
* `--strict`: Use strict thresholds in `spec/quality_gates.yaml`.
|
||
* `--format <json|sarif|text>`: Diagnostic output format (where applicable).
|
||
* `--fail-on <must|should|warn>`: Lowest severity that fails the command (default: `must`).
|
||
* `--degraded-ok`: Allow degraded mode without failing (still emits degraded-mode report).
|
||
* `--version`: Print tool + renderer versions.
|
||
|
||
## Command: `lint`
|
||
|
||
Purpose:
|
||
|
||
* Parse Markdown → normalize AST → run lint rules.
|
||
* Optionally apply autofixes.
|
||
|
||
Args:
|
||
|
||
* `--fix`: Apply autofixes for rules where `autofix != none` and the fix is safe to apply.
|
||
* `--fix-mode <rewrite|suggest>`: Whether to rewrite output Markdown or emit suggestions only.
|
||
* `--baseline <path>`: Compare diagnostics to an existing lint report and show diff.
|
||
|
||
Outputs:
|
||
|
||
* `out/lint-report.json`
|
||
* `out/lint-report.sarif` (if `--format sarif`)
|
||
* `out/lint-fixed.md` (if `--fix` and `--fix-mode rewrite`)
|
||
* `out/manual-checklist.md` (includes manual rules tagged `manual_checklist=true`)
|
||
|
||
Exit codes:
|
||
|
||
* `0`: no failing diagnostics
|
||
* `1`: lint failures at or above `--fail-on`
|
||
* `4`: config/schema error
|
||
* `5`: internal error
|
||
|
||
## Command: `render-html`
|
||
|
||
Purpose:
|
||
|
||
* Generate deterministic HTML + CSS from normalized AST + profile tokens.
|
||
|
||
Args:
|
||
|
||
* `--emit-normalized`: also write `normalized-doc.json`
|
||
* `--assets <dir>`: static assets dir (images, fonts, etc.)
|
||
* `--self-contained`: embed assets in HTML where possible
|
||
|
||
Outputs:
|
||
|
||
* `out/render.html`
|
||
* `out/render.css`
|
||
* `out/typeset-report.json`
|
||
* `out/normalized-doc.json` (optional)
|
||
|
||
Exit codes:
|
||
|
||
* `0`: success
|
||
* `3`: render error
|
||
* `4`: config/schema error
|
||
* `5`: internal error
|
||
|
||
## Command: `render-pdf`
|
||
|
||
Purpose:
|
||
|
||
* Render PDF deterministically from HTML + CSS + assets.
|
||
|
||
Args:
|
||
|
||
* `--engine <name>`: renderer adapter selection (implementation-defined)
|
||
* `--engine-opts <json>`: pass-through engine options
|
||
* `--keep-html`: keep intermediate HTML/CSS even if PDF fails
|
||
|
||
Outputs:
|
||
|
||
* `out/render.pdf`
|
||
* `out/render.html` + `out/render.css` (always on success; retained after a PDF failure only if `--keep-html`)
|
||
* `out/render-log.json`
|
||
|
||
Exit codes:
|
||
|
||
* `0`: success
|
||
* `3`: render error
|
||
* `4`: config/schema error
|
||
* `5`: internal error
|
||
|
||
## Command: `qa`
|
||
|
||
Purpose:
|
||
|
||
* Run post-render QA analysis and evaluate quality gates.
|
||
|
||
Args:
|
||
|
||
* `--pdf <path>`: PDF to analyze (default: `out/render.pdf`)
|
||
* `--html <path>`: optional HTML for cross-checks
|
||
* `--gates <path>`: override gates file (default: `spec/quality_gates.yaml`)
|
||
|
||
Outputs:
|
||
|
||
* `out/layout-report.json`
|
||
* `out/qa-report.json`
|
||
|
||
Exit codes:
|
||
|
||
* `0`: all gates pass
|
||
* `2`: one or more gates failed at or above the `--fail-on` severity (evaluated against strict thresholds when `--strict` is set)
|
||
* `4`: config/schema error
|
||
* `5`: internal error
|
||
|
||
## Command: `report`
|
||
|
||
Purpose:
|
||
|
||
* Produce a consolidated report:
|
||
|
||
* coverage (implemented vs unimplemented)
|
||
* diffs vs baseline
|
||
* per-category enforcement breakdown
|
||
|
||
Args:
|
||
|
||
* `--baseline <path>`: baseline coverage report to diff against
|
||
* `--since <gitref>`: optionally compute diffs since a git ref (implementation-defined)
|
||
|
||
Outputs:
|
||
|
||
* `out/coverage-report.json`
|
||
* `out/coverage-diff.json` (if baseline provided)
|
||
* `out/coverage-summary.md`
|
||
|
||
Exit codes:
|
||
|
||
* `0`: report built and coverage passes configured floors
|
||
* `2`: coverage floor violated
|
||
* `4`: config/schema error
|
||
* `5`: internal error
|
||
|
||
--- END FILE ---
|
||
|
||
--- FILE: spec/extraction_plan.md ---
|
||
|
||
# Phase 2 Extraction Plan
|
||
|
||
This plan defines how rules will be produced in controlled batches without reproducing the books.
|
||
|
||
## Non-negotiables (carried into Phase 2)
|
||
|
||
* No full-book OCR/transcription.
|
||
* No long verbatim passages.
|
||
* Rules are paraphrased and capped (`rule_text` ≤ 800 chars).
|
||
* Every rule includes at least one source pointer in `source_refs`.
|
||
* If a rule depends on exact wording, the rule still paraphrases but must include:
|
||
|
||
* `rule_text`: “Exact wording required—refer to pointer”
|
||
* plus a usable pointer.
|
||
|
||
Primary reference PDFs for pointer extraction:
|
||
|
||
* The Chicago Manual of Style (18th ed).pdf
|
||
* Robert Bringhurst – The Elements of Typographic Style.pdf
|
||
|
||
## Output batching format
|
||
|
||
When you say: `EXTRACT <CATEGORY> [<SCOPE>]`
|
||
|
||
I will output a bundle that includes:
|
||
|
||
1. **Rules NDJSON** (150–250 rule records)
|
||
|
||
* Path: `spec/rules/<category>/<batch_id>.ndjson`
|
||
* One JSON object per line, validated against `spec/schema/rule.schema.json`.
|
||
|
||
2. **Index deltas** for that category
|
||
|
||
* `spec/indexes/keywords_<category>.json`
|
||
* `spec/indexes/source_refs_<category>.json`
|
||
* `spec/indexes/coverage_delta_<category>.json`
|
||
|
||
3. **Coverage notes** report
|
||
|
||
* A short Markdown report describing enforcement split:
|
||
|
||
* lint vs typeset vs postrender vs manual
|
||
* plus any known gaps or “manual-only” areas
|
||
|
||
## Batch naming
|
||
|
||
`<batch_id>` format:
|
||
|
||
* `v1_<category>_<nnn>`
|
||
|
||
* e.g., `v1_punctuation_001`
|
||
|
||
Batches are append-only:
|
||
|
||
* If rules need revision, mark old rule `deprecated`, add a new rule ID (or new version segment) and keep both records.
|
||
|
||
## Pointer scheme details
|
||
|
||
Pointer strings live in `source_refs[]` and are **not** quotes.
|
||
|
||
Preferred pointer format:
|
||
|
||
* `CMOS18 §<section> p<book_page>`
|
||
* `BRING §<section> p<book_page>`
|
||
* Optional disambiguation: `(scan p<pdf_page_index>)`
|
||
|
||
Example pattern (not a quote):
|
||
|
||
* `CMOS18 §6.1 p377 (scan p10)`
|
||
|
||
Notes:
|
||
|
||
* “book_page” uses the printed page number in the book when present (arabic or roman).
|
||
* “scan p” uses the PDF page index when printed page numbers are ambiguous.
|
||
|
||
## Recommended extraction order (high-impact first)
|
||
|
||
1. numbers
|
||
2. punctuation
|
||
3. citations
|
||
4. headings
|
||
5. tables
|
||
6. figures
|
||
7. links
|
||
8. code
|
||
9. layout (widows/orphans, keeps, overflow)
|
||
10. front/back matter
|
||
11. accessibility
|
||
12. i18n
|
||
|
||
Rationale:
|
||
|
||
* Numbers/punctuation/citations most directly affect correctness, consistency, and auditability.
|
||
* Layout rules benefit from having structure and tokens in place.
|
||
|
||
## Scope parameter
|
||
|
||
`[<SCOPE>]` can constrain extraction, e.g.:
|
||
|
||
* `EXTRACT punctuation basic`
|
||
* `EXTRACT citations notes_bibliography`
|
||
* `EXTRACT numbers en_only`
|
||
* `EXTRACT layout widows_orphans`
|
||
|
||
If scope is omitted:
|
||
|
||
* extract the most generally applicable rules for that category first.
|
||
|
||
## Enforcement mapping guidelines (honest labeling)
|
||
|
||
* `lint`: detectable from AST or text normalization (e.g., spacing, punctuation patterns, citation fields).
|
||
* `typeset`: enforced via CSS/tokens/paged-media decisions (e.g., indent policy, measure targets, hyphenation params).
|
||
* `postrender`: requires layout inspection after rendering (e.g., widows/orphans, overfull lines, table clipping).
|
||
* `manual`: cannot be reliably automated; must include `tags: ["manual_checklist=true"]` and be emitted into checklist outputs.
|
||
|
||
If a concept spans multiple enforcement layers:
|
||
|
||
* Prefer splitting into two rules:
|
||
|
||
* one lint rule (source cleanliness)
|
||
* one postrender rule (layout outcome)
|
||
* Use `dependencies` to link them.
|
||
|
||
## Extraction workflow per category (repeatable)
|
||
|
||
For each category:
|
||
|
||
1. Build a topic map (subtopics, recurring failure modes).
|
||
2. Extract rules in clusters:
|
||
|
||
* MUST rules first (enforceable or checklist)
|
||
* SHOULD rules next
|
||
* WARN rules last
|
||
3. For each rule:
|
||
|
||
* add `keywords` for searchability
|
||
* add tags for overrides/routing
|
||
* add minimal `exceptions` when needed (avoid overfitting)
|
||
|
||
## “Degraded mode” considerations during extraction
|
||
|
||
For each category batch, include some rules that specifically target degraded inputs:
|
||
|
||
* hard-wrap repair suggestions
|
||
* heading inference warnings
|
||
* link sanitation and encoding fixes
|
||
* Unicode normalization notes
|
||
|
||
These rules should generally be `warn` or `should`, unless they prevent corruption (then `must`).
|
||
|
||
--- END FILE ---
|