Infrafabric-POC-docs/tools/apply_naming_refactor.py

139 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import re
from collections import Counter
from pathlib import Path
import yaml
# Matches a Markdown heading (1-6 leading '#') whose text starts with one of the scaffold
# labels followed by a colon and whitespace, e.g. "## HOOK: Title".
# Groups: (1) the '#' run, (2) the label, (3) the remaining title text.
# NOTE(review): group 3 is greedy, so trailing spaces/tabs on the line end up inside the
# title rather than being consumed by the final `\s*`.
SCAFFOLD_HEADING_RE = re.compile(r"^(#{1,6})\s+(HOOK|FLAW|SETUP|TENSION|TWIST|PUNCH):\s+(.*)\s*$")
# Matches a parenthesised legacy annotation such as "(legacy: IF.guard)". The match ends at
# the first ')' after "(legacy:", so the annotation body cannot itself contain ')'.
LEGACY_BLOCK_RE = re.compile(r"\(legacy:[^)]+\)")
def _shield_legacy_blocks(text: str) -> tuple[str, list[str]]:
    """
    Swap every `(legacy: ...)` annotation in `text` for a numbered placeholder.

    The annotations record which old names map to which canonical ones, so they must not be
    touched by the rename pass that runs over the rest of the text. Each annotation is
    replaced by `__IF_LEGACY_BLOCK_<n>__`, where `<n>` is its position in the returned list.

    Returns a tuple of (text with placeholders, list of the original annotation strings in
    order of appearance).
    """
    captured: list[str] = []
    pieces: list[str] = []
    cursor = 0
    for found in LEGACY_BLOCK_RE.finditer(text):
        # Keep the untouched text before this annotation, then the placeholder for it.
        pieces.append(text[cursor:found.start()])
        pieces.append(f"__IF_LEGACY_BLOCK_{len(captured)}__")
        captured.append(found.group(0))
        cursor = found.end()
    # Whatever follows the last annotation (or the whole text when there were none).
    pieces.append(text[cursor:])
    return "".join(pieces), captured
def _unshield_legacy_blocks(text: str, blocks: list[str]) -> str:
for index, block in enumerate(blocks):
text = text.replace(f"__IF_LEGACY_BLOCK_{index}__", block)
return text
def _build_legacy_map(registry: dict) -> dict[str, str]:
names = registry.get("names", {}) or {}
legacy_to_canonical: dict[str, str] = {}
for canonical, meta in names.items():
legacy = (meta or {}).get("legacy", []) or []
for legacy_token in legacy:
if legacy_token in legacy_to_canonical and legacy_to_canonical[legacy_token] != canonical:
raise ValueError(
f"legacy token {legacy_token!r} maps to multiple canonicals: "
f"{legacy_to_canonical[legacy_token]!r} and {canonical!r}"
)
legacy_to_canonical[legacy_token] = canonical
return legacy_to_canonical
def _safe_token_regex(token: str) -> re.Pattern[str]:
# Replace token only when its not inside a larger identifier/path.
# - Disallow alnum/_ on the left (word char) to avoid mid-word matches.
# - Disallow alnum/_ or '/' or '-' on the right to avoid touching URLs, paths, file names,
# or hyphenated “subtokens” like IF.guard-POC.
return re.compile(rf"(?<![A-Za-z0-9_]){re.escape(token)}(?![A-Za-z0-9_/-])")
def refactor_text(
    text: str, *, legacy_map: dict[str, str], skip_canonicals: set[str]
) -> tuple[str, Counter[str]]:
    """
    Strip scaffold labels from headings and rewrite legacy names to their canonical form.

    Args:
        text: The document to refactor.
        legacy_map: Mapping of legacy token -> canonical name.
        skip_canonicals: Canonical names whose legacy tokens must be left untouched.

    Returns:
        A tuple of (refactored text, counter). The counter holds the number of replacements
        per legacy token, plus the number of rewritten headings under the key
        "__scaffold_heading_renames__". Keys with no hits are absent.
    """
    stats: Counter[str] = Counter()

    # 1) Remove visible scaffolding labels from headings.
    out_lines: list[str] = []
    for raw in text.splitlines(True):
        line = raw.rstrip("\r\n")
        match = SCAFFOLD_HEADING_RE.match(line)
        if match is None:
            out_lines.append(raw)
            continue
        hashes, _, title = match.groups()
        # Re-attach exactly the terminator this line arrived with. Hard-coding "\n" would
        # add a newline to a heading on the final, unterminated line and would turn a bare
        # "\r" terminator into "\r\n".
        line_ending = raw[len(line):]
        out_lines.append(f"{hashes} {title}{line_ending}")
        stats["__scaffold_heading_renames__"] += 1
    out = "".join(out_lines)

    # Shield `(legacy: ...)` blocks so they don't become nonsense like `(legacy: IF.GOV.PANEL)`.
    out, legacy_blocks = _shield_legacy_blocks(out)

    # 2) Apply legacy -> canonical replacements.
    # Sort by length so longer tokens are replaced first (defensive; avoids any weird overlaps).
    legacy_items = sorted(legacy_map.items(), key=lambda kv: len(kv[0]), reverse=True)
    for legacy_token, canonical in legacy_items:
        if canonical in skip_canonicals:
            continue
        pattern = _safe_token_regex(legacy_token)
        out, n = pattern.subn(canonical, out)
        if n:
            stats[legacy_token] += n

    out = _unshield_legacy_blocks(out, legacy_blocks)
    return out, stats
def main() -> int:
    """
    Command-line entry point: refactor one Markdown file in place using the naming registry.

    Reads the registry YAML and the target file, runs `refactor_text` over the file's
    content, writes the result back only when it differs, and prints a summary of what
    changed. Returns 0 in both the "no changes" and the "updated" case.

    NOTE(review): `--skip-canonical` combines `action="append"` with a non-empty default, so
    values given on the command line are added to the default rather than replacing it.
    Confirm that is the intended behaviour.
    """
    # Default inputs live one directory above the one containing this script.
    docs_root = Path(__file__).resolve().parents[1]

    cli = argparse.ArgumentParser(description="Apply IF naming refactor + remove scaffold headings.")
    cli.add_argument(
        "--registry",
        type=Path,
        default=docs_root / "IF_NAMING_REGISTRY.yaml",
        help="Path to IF_NAMING_REGISTRY.yaml",
    )
    cli.add_argument(
        "--file",
        type=Path,
        default=docs_root / "DANNY_STOCKER_INFRAFABRIC_DOSSIER.md",
        help="Markdown file to refactor in-place",
    )
    cli.add_argument(
        "--skip-canonical",
        action="append",
        default=["IF.AUDIT.TRAIL"],
        help="Canonical name(s) to skip when applying legacy replacements (repeatable).",
    )
    options = cli.parse_args()

    registry_text = options.registry.read_text(encoding="utf-8")
    legacy_map = _build_legacy_map(yaml.safe_load(registry_text))

    original = options.file.read_text(encoding="utf-8")
    skipped = set(options.skip_canonical or [])
    rewritten, counts = refactor_text(original, legacy_map=legacy_map, skip_canonicals=skipped)

    # Leave the file alone when the refactor is a no-op.
    if rewritten == original:
        print("no changes")
        return 0

    options.file.write_text(rewritten, encoding="utf-8")
    print(f"updated: {options.file}")
    if counts:
        print("changes:")
        for label, total in counts.most_common():
            print(f" {label}: {total}")
    return 0
if __name__ == "__main__":
    # Run the CLI only when executed as a script and use main()'s return value as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)