139 lines
4.8 KiB
Python
139 lines
4.8 KiB
Python
#!/usr/bin/env python3
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import re
|
||
from collections import Counter
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
|
||
# Matches a heading line that starts with 1-6 '#' characters followed by one of the
# scaffold labels (HOOK, FLAW, SETUP, TENSION, TWIST, PUNCH) and a colon.
# Groups: (1) the '#' run, (2) the label, (3) the rest of the heading text.
# Note: group 3 is greedy, so trailing non-newline whitespace ends up inside it.
SCAFFOLD_HEADING_RE = re.compile(r"^(#{1,6})\s+(HOOK|FLAW|SETUP|TENSION|TWIST|PUNCH):\s+(.*)\s*$")
|
||
# Matches a parenthesised annotation of the form `(legacy: ...)`: the literal "(legacy:"
# followed by one or more characters that are not ')' and then the closing ')'.
# Because the body excludes ')', a match always ends at the first closing parenthesis.
LEGACY_BLOCK_RE = re.compile(r"\(legacy:[^)]+\)")
|
||
|
||
|
||
def _shield_legacy_blocks(text: str) -> tuple[str, list[str]]:
    """
    Swap every `(legacy: ...)` annotation in *text* for an indexed placeholder.

    The annotations record legacy-name mappings for traceability, so they must survive the
    legacy -> canonical rewrite untouched. Hiding them behind placeholders keeps the token
    replacement from editing their contents.

    Returns a pair `(masked_text, annotations)`: the text with each annotation replaced by
    `__IF_LEGACY_BLOCK_<i>__`, and the list of original annotation strings where entry `i`
    is the text that placeholder `i` stands for.
    """
    shielded: list[str] = []

    def _stash(found: re.Match[str]) -> str:
        # The placeholder number is the position the annotation takes in `shielded`.
        slot = len(shielded)
        shielded.append(found.group(0))
        return f"__IF_LEGACY_BLOCK_{slot}__"

    masked = LEGACY_BLOCK_RE.sub(_stash, text)
    return masked, shielded
|
||
|
||
|
||
def _unshield_legacy_blocks(text: str, blocks: list[str]) -> str:
|
||
for index, block in enumerate(blocks):
|
||
text = text.replace(f"__IF_LEGACY_BLOCK_{index}__", block)
|
||
return text
|
||
|
||
|
||
def _build_legacy_map(registry: dict) -> dict[str, str]:
|
||
names = registry.get("names", {}) or {}
|
||
legacy_to_canonical: dict[str, str] = {}
|
||
for canonical, meta in names.items():
|
||
legacy = (meta or {}).get("legacy", []) or []
|
||
for legacy_token in legacy:
|
||
if legacy_token in legacy_to_canonical and legacy_to_canonical[legacy_token] != canonical:
|
||
raise ValueError(
|
||
f"legacy token {legacy_token!r} maps to multiple canonicals: "
|
||
f"{legacy_to_canonical[legacy_token]!r} and {canonical!r}"
|
||
)
|
||
legacy_to_canonical[legacy_token] = canonical
|
||
return legacy_to_canonical
|
||
|
||
|
||
def _safe_token_regex(token: str) -> re.Pattern[str]:
|
||
# Replace token only when it’s not inside a larger identifier/path.
|
||
# - Disallow alnum/_ on the left (word char) to avoid mid-word matches.
|
||
# - Disallow alnum/_ or '/' or '-' on the right to avoid touching URLs, paths, file names,
|
||
# or hyphenated “subtokens” like IF.guard-POC.
|
||
return re.compile(rf"(?<![A-Za-z0-9_]){re.escape(token)}(?![A-Za-z0-9_/-])")
|
||
|
||
|
||
def refactor_text(text: str, *, legacy_map: dict[str, str], skip_canonicals: set[str]) -> tuple[str, Counter[str]]:
    """
    Strip scaffold labels from headings and rewrite legacy tokens to canonical names.

    Args:
        text: The document to refactor.
        legacy_map: Mapping of legacy token -> canonical name.
        skip_canonicals: Canonical names whose legacy tokens must be left as they are.

    Returns:
        A pair `(new_text, stats)`. `stats` counts replacements per legacy token, plus the
        key `__scaffold_heading_renames__` for the number of headings that were rewritten.

    `(legacy: ...)` annotations are shielded before the token pass and restored afterwards,
    so their contents are never rewritten.
    """
    stats: Counter[str] = Counter()

    # 1) Remove visible scaffolding labels from headings.
    out_lines: list[str] = []
    for raw in text.splitlines(True):
        # Separate the content from its terminator so the terminator can be re-attached
        # unchanged; a final line without a newline must not gain one.
        line = raw.rstrip("\r\n")
        ending = raw[len(line):]
        match = SCAFFOLD_HEADING_RE.match(line)
        if match is None:
            out_lines.append(raw)
            continue
        hashes, _, title = match.groups()
        out_lines.append(f"{hashes} {title}{ending}")
        stats["__scaffold_heading_renames__"] += 1
    out = "".join(out_lines)

    # Shield `(legacy: ...)` blocks so they don't become nonsense like `(legacy: IF.GOV.PANEL)`.
    out, legacy_blocks = _shield_legacy_blocks(out)

    # 2) Apply legacy -> canonical replacements.
    # Sort by length so longer tokens are replaced first (defensive; avoids any weird overlaps).
    legacy_items = sorted(legacy_map.items(), key=lambda kv: len(kv[0]), reverse=True)
    for legacy_token, canonical in legacy_items:
        # An empty token would match at every eligible position in the text; ignore it.
        if not legacy_token or canonical in skip_canonicals:
            continue
        pattern = _safe_token_regex(legacy_token)
        # Use a callable so the canonical name is inserted literally; a plain replacement
        # string would have backslash escapes and group references interpreted by `re`.
        out, n = pattern.subn(lambda _match, replacement=canonical: replacement, out)
        if n:
            stats[legacy_token] += n

    out = _unshield_legacy_blocks(out, legacy_blocks)
    return out, stats
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point: refactor one Markdown file in place.

    Loads the naming registry, builds the legacy -> canonical map, runs `refactor_text` on
    the target file and writes the result back only when something changed. A summary of
    the replacements is printed to stdout.

    Args:
        argv: Argument list to parse; `None` (the default) makes argparse read `sys.argv`.

    Returns:
        0, both when the file was updated and when there was nothing to change.
    """
    parser = argparse.ArgumentParser(description="Apply IF naming refactor + remove scaffold headings.")
    # Default paths are resolved against the directory that contains this script's directory.
    base_dir = Path(__file__).resolve().parents[1]
    parser.add_argument(
        "--registry",
        type=Path,
        default=base_dir / "IF_NAMING_REGISTRY.yaml",
        help="Path to IF_NAMING_REGISTRY.yaml",
    )
    parser.add_argument(
        "--file",
        type=Path,
        default=base_dir / "DANNY_STOCKER_INFRAFABRIC_DOSSIER.md",
        help="Markdown file to refactor in-place",
    )
    # NOTE: with action="append" and a non-empty default, argparse appends values given on
    # the command line to the default list, so "IF.AUDIT.TRAIL" is always among the skips.
    parser.add_argument(
        "--skip-canonical",
        action="append",
        default=["IF.AUDIT.TRAIL"],
        help="Canonical name(s) to skip when applying legacy replacements (repeatable).",
    )
    args = parser.parse_args(argv)

    # safe_load returns None for an empty document; treat that as an empty registry.
    registry = yaml.safe_load(args.registry.read_text(encoding="utf-8")) or {}
    legacy_map = _build_legacy_map(registry)

    # newline="" disables newline translation in both directions, so the file keeps the
    # line endings it already has instead of being normalised by the rewrite.
    with args.file.open("r", encoding="utf-8", newline="") as handle:
        content = handle.read()

    updated, stats = refactor_text(
        content, legacy_map=legacy_map, skip_canonicals=set(args.skip_canonical or [])
    )
    if updated == content:
        print("no changes")
        return 0

    with args.file.open("w", encoding="utf-8", newline="") as handle:
        handle.write(updated)

    print(f"updated: {args.file}")
    if stats:
        print("changes:")
        for key, value in stats.most_common():
            print(f"  {key}: {value}")
    return 0
|
||
|
||
|
||
if __name__ == "__main__":
|
||
raise SystemExit(main())
|