render: keep headings with next blocks
Some checks are pending
ci / ci (push) Waiting to run

This commit is contained in:
codex 2026-01-08 22:50:34 +00:00
parent f8808cecd1
commit 01d082b038
3 changed files with 194 additions and 63 deletions

View file

@ -261,6 +261,12 @@ def generate_profile_css(profile: dict[str, Any]) -> CssOutput:
css.append("}")
css.append("")
css.append(".if-keep-with-next {")
css.append(" break-inside: avoid;")
css.append(" page-break-inside: avoid;")
css.append("}")
css.append("")
css.append(f"h1 {{ font-size: {h1_size}; }}")
css.append(f"h2 {{ font-size: {h2_size}; }}")
css.append(f"h3 {{ font-size: {h3_size}; }}")

View file

@ -290,8 +290,12 @@ def _render_doc(
body_lines: list[str] = []
section_open = False
current_section_slug: str | None = None
keep_headings = _headings_keep_enabled(profile)
body_lines.append("<main>")
for block in doc.blocks:
blocks = doc.blocks
i = 0
while i < len(blocks):
block = blocks[i]
if block.type == "heading":
level = block.level or 1
slug = _unique_slug(block.text, heading_ids)
@ -301,71 +305,43 @@ def _render_doc(
body_lines.append(f"<section class=\"if-section if-section-{slug}\">")
section_open = True
current_section_slug = slug
body_lines.append(
heading_html = (
f"<h{level} id=\"{slug}\">{_render_inline(block.text, base_path, self_contained, warnings, inline_opts)}</h{level}>"
)
if keep_headings and level <= 3:
keep_indices = _keep_with_next_indices(blocks, i)
if keep_indices:
body_lines.append("<div class=\"if-keep-with-next\">")
body_lines.append(heading_html)
for j in keep_indices:
_render_non_heading_block(
blocks[j],
body_lines=body_lines,
profile=profile,
base_path=base_path,
self_contained=self_contained,
warnings=warnings,
inline_opts=inline_opts,
current_section_slug=current_section_slug,
)
body_lines.append("</div>")
i = max(keep_indices) + 1
continue
body_lines.append(heading_html)
i += 1
continue
if block.type == "paragraph":
img_only = _image_only_paragraph(block.text)
if img_only:
alt, url = img_only
src = _resolve_image_src(url, base_path, self_contained, warnings)
body_lines.append("<figure class=\"if-figure\">")
body_lines.append(f" <img src=\"{html.escape(src)}\" alt=\"{html.escape(alt)}\">")
body_lines.append("</figure>")
else:
ref_id = _leading_citation_id(block.text)
attr = f" id=\"{html.escape(ref_id)}\"" if ref_id else ""
body_lines.append(
f"<p{attr}>{_render_inline(block.text, base_path, self_contained, warnings, inline_opts)}</p>"
)
continue
if block.type == "list":
tag = "ol" if block.ordered else "ul"
body_lines.append(f"<{tag}>")
for item in block.items:
groomed = _groom_list_item_text(item, profile, current_section_slug)
rendered = _render_inline(groomed, base_path, self_contained, warnings, inline_opts)
rendered = rendered.replace(_NBSP_TOKEN, "&nbsp;")
body_lines.append(
f" <li>{rendered}</li>"
)
body_lines.append(f"</{tag}>")
continue
if block.type == "code":
lang = block.info.strip()
class_attr = f" class=\"language-{html.escape(lang)}\"" if lang else ""
body_lines.append(f"<pre><code{class_attr}>{html.escape(block.text)}</code></pre>")
continue
if block.type == "blockquote":
body_lines.append(
f"<blockquote><p>{_render_inline(block.text, base_path, self_contained, warnings, inline_opts)}</p></blockquote>"
)
continue
if block.type == "table":
numeric_cols, bool_cols = _table_column_classes(block.headers, block.rows)
body_lines.append("<table>")
body_lines.append(" <thead>")
body_lines.append(" <tr>")
for idx, h in enumerate(block.headers):
class_attr = _table_class_attr(idx, numeric_cols, bool_cols)
body_lines.append(
f" <th{class_attr}>{_render_inline(h, base_path, self_contained, warnings, inline_opts)}</th>"
)
body_lines.append(" </tr>")
body_lines.append(" </thead>")
body_lines.append(" <tbody>")
for row in block.rows:
body_lines.append(" <tr>")
for idx, cell in enumerate(row):
class_attr = _table_class_attr(idx, numeric_cols, bool_cols)
body_lines.append(
f" <td{class_attr}>{_render_inline(cell, base_path, self_contained, warnings, inline_opts)}</td>"
)
body_lines.append(" </tr>")
body_lines.append(" </tbody>")
body_lines.append("</table>")
continue
_render_non_heading_block(
block,
body_lines=body_lines,
profile=profile,
base_path=base_path,
self_contained=self_contained,
warnings=warnings,
inline_opts=inline_opts,
current_section_slug=current_section_slug,
)
i += 1
if section_open:
body_lines.append("</section>")
body_lines.append("</main>")
@ -386,6 +362,125 @@ def _render_doc(
return "\n".join(html_lines).rstrip() + "\n"
def _headings_keep_enabled(profile: dict[str, Any]) -> bool:
    """Return True when the profile asks for headings to be kept with what follows.

    The feature is considered enabled when ``profile["headings"]`` is a dict and
    either ``keep_with_next_lines`` converts to a positive integer or
    ``avoid_stranded_headings`` is truthy. A missing, empty, or non-dict
    ``headings`` entry disables it.

    A ``keep_with_next_lines`` value that cannot be converted to an integer
    (for example ``"two"`` or a list) is treated as 0 rather than raising, in
    line with the tolerant handling of a non-dict ``headings`` entry.
    """
    headings = profile.get("headings") or {}
    if not isinstance(headings, dict):
        return False
    try:
        keep_lines = int(headings.get("keep_with_next_lines") or 0)
    except (TypeError, ValueError, OverflowError):
        # Malformed setting: fall back to "not requested" instead of aborting the render.
        keep_lines = 0
    avoid_stranded = bool(headings.get("avoid_stranded_headings"))
    return keep_lines > 0 or avoid_stranded
def _keep_with_next_indices(blocks: list[Any], heading_index: int) -> list[int]:
    """
    Pick the indices of the blocks that should stay on the same page as a heading.

    Best-effort pagination guard, deliberately conservative and free of any
    line-count arithmetic. Given the position of a heading in ``blocks``:

    - returns ``[]`` when nothing follows the heading or the next block is itself a heading;
    - collects up to two leading paragraphs whose stripped text is at most 220 characters,
      then also takes the block right after them when it is a table, list, code block
      or blockquote, so the heading travels with the real content;
    - otherwise (the next block is not a short paragraph) returns just the next block's index.
    """

    def kind_at(position: int) -> Any:
        return getattr(blocks[position], "type", None)

    total = len(blocks)
    start = heading_index + 1
    if start >= total or kind_at(start) == "heading":
        return []

    paragraph_limit = 2
    char_limit = 220

    # Gather the short "meta" paragraphs that sit directly under the heading.
    kept: list[int] = []
    for position in range(start, total):
        if kind_at(position) != "paragraph" or len(kept) >= paragraph_limit:
            break
        body = getattr(blocks[position], "text", "") or ""
        if len(body.strip()) > char_limit:
            break
        kept.append(position)

    if not kept:
        # No short paragraph directly after the heading: keep whatever block comes next.
        return [start]

    # The collected paragraphs are contiguous, so this is the first block after them.
    following = start + len(kept)
    if following < total and kind_at(following) in {"table", "list", "code", "blockquote"}:
        kept.append(following)
    return kept
def _render_non_heading_block(
    block: Any,
    *,
    body_lines: list[str],
    profile: dict[str, Any],
    base_path: Path,
    self_contained: bool,
    warnings: list[str],
    inline_opts: InlineOptions,
    current_section_slug: str | None,
) -> None:
    """Append the HTML for a single non-heading block to ``body_lines``.

    Dispatches on ``block.type``:

    - ``paragraph``: an image-only paragraph becomes a ``<figure class="if-figure">``;
      any other paragraph becomes a ``<p>``, with an ``id`` attribute when
      ``_leading_citation_id`` yields one.
    - ``list``: an ``<ol>`` or ``<ul>`` (per ``block.ordered``) whose items are groomed,
      inline-rendered, and have the NBSP token replaced by ``&nbsp;``.
    - ``code``: an escaped ``<pre><code>`` with a ``language-…`` class when ``block.info``
      is non-blank.
    - ``blockquote``: a ``<blockquote>`` wrapping one inline-rendered ``<p>``.
    - ``table``: a ``<table>`` with a header row and one body row per entry in ``block.rows``,
      each cell carrying the class attribute computed for its column.

    Any other block type appends nothing. The function returns None; its only output
    is the lines appended to ``body_lines`` (the inline renderer also receives
    ``warnings``).
    """
    emit = body_lines.append

    def inline(text):
        # Every inline fragment of this block is rendered with the same context.
        return _render_inline(text, base_path, self_contained, warnings, inline_opts)

    kind = block.type

    if kind == "paragraph":
        image = _image_only_paragraph(block.text)
        if not image:
            citation_id = _leading_citation_id(block.text)
            id_attr = f' id="{html.escape(citation_id)}"' if citation_id else ""
            emit(f"<p{id_attr}>{inline(block.text)}</p>")
            return
        alt_text, image_url = image
        image_src = _resolve_image_src(image_url, base_path, self_contained, warnings)
        emit('<figure class="if-figure">')
        emit(f' <img src="{html.escape(image_src)}" alt="{html.escape(alt_text)}">')
        emit("</figure>")

    elif kind == "list":
        list_tag = "ol" if block.ordered else "ul"
        emit(f"<{list_tag}>")
        for entry in block.items:
            prepared = _groom_list_item_text(entry, profile, current_section_slug)
            item_html = inline(prepared).replace(_NBSP_TOKEN, "&nbsp;")
            emit(f" <li>{item_html}</li>")
        emit(f"</{list_tag}>")

    elif kind == "code":
        language = (block.info or "").strip()
        language_attr = f' class="language-{html.escape(language)}"' if language else ""
        emit(f"<pre><code{language_attr}>{html.escape(block.text)}</code></pre>")

    elif kind == "blockquote":
        emit(f"<blockquote><p>{inline(block.text)}</p></blockquote>")

    elif kind == "table":
        numeric_cols, bool_cols = _table_column_classes(block.headers, block.rows)
        emit("<table>")
        emit(" <thead>")
        emit(" <tr>")
        body_lines.extend(
            f" <th{_table_class_attr(col, numeric_cols, bool_cols)}>{inline(title)}</th>"
            for col, title in enumerate(block.headers)
        )
        emit(" </tr>")
        emit(" </thead>")
        emit(" <tbody>")
        for cells in block.rows:
            emit(" <tr>")
            body_lines.extend(
                f" <td{_table_class_attr(col, numeric_cols, bool_cols)}>{inline(value)}</td>"
                for col, value in enumerate(cells)
            )
            emit(" </tr>")
        emit(" </tbody>")
        emit("</table>")
def _doc_title(doc: MdDocument) -> str:
for block in doc.blocks:
if block.type == "heading" and block.level == 1:

View file

@ -2,6 +2,7 @@ import sys
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
import re
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
@ -28,6 +29,35 @@ class RenderingInlineTests(unittest.TestCase):
self.assertIn("<strong>Summary:</strong>", html)
self.assertIn("Line one<br>", html)
def test_keep_with_next_wraps_heading_and_table(self) -> None:
    """A heading, its short source note and the table after it share one keep-with-next wrapper."""
    table_rows = (
        "| Property | Room type |",
        "| --- | --- |",
        "| HO36 | Chambre |",
        "| Le Flaneur | Dortoir |",
    )
    markdown = (
        "## 3) Tableau des chambres\n\n"
        + "Source: capture instantanée.\n\n"
        + "".join(f"{row}\n" for row in table_rows)
    )
    profile = {
        "profile_id": "test",
        "headings": {"avoid_stranded_headings": True, "keep_with_next_lines": 1},
    }
    # The wrapper must open immediately before the <h2> and the table must follow inside the match.
    wrapper_pattern = re.compile(
        r'<div class="if-keep-with-next">\s*<h2[^>]*>3\) Tableau des chambres</h2>.*?<table>.*?</table>',
        re.S,
    )

    with TemporaryDirectory() as workdir:
        source_file = Path(workdir) / "table.md"
        source_file.write_text(markdown, encoding="utf-8")
        rendered = render_html(source_file, profile).html

    found = wrapper_pattern.search(rendered)
    self.assertIsNotNone(found)
    wrapped = "" if found is None else found.group(0)
    # One header row plus two data rows, each opened and closed exactly once.
    self.assertEqual(wrapped.count("<tr>"), 3)  # header + 2 rows
    self.assertEqual(wrapped.count("</tr>"), 3)
if __name__ == "__main__":
unittest.main()