Infrafabric-POC-docs/tools/apply_naming_refactor.py

113 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import re
from collections import Counter
from pathlib import Path
import yaml
# Matches a heading line made of 1-6 '#' characters whose text starts with one
# of the scaffold labels followed by a colon, e.g. "## HOOK: Some title".
# Groups: (1) the '#' run, (2) the label, (3) the rest of the heading text.
# NOTE: group 3 is greedy, so trailing spaces or tabs on the line are captured
# as part of the title.
SCAFFOLD_HEADING_RE = re.compile(r"^(#{1,6})\s+(HOOK|FLAW|SETUP|TENSION|TWIST|PUNCH):\s+(.*)\s*$")
def _build_legacy_map(registry: dict) -> dict[str, str]:
names = registry.get("names", {}) or {}
legacy_to_canonical: dict[str, str] = {}
for canonical, meta in names.items():
legacy = (meta or {}).get("legacy", []) or []
for legacy_token in legacy:
if legacy_token in legacy_to_canonical and legacy_to_canonical[legacy_token] != canonical:
raise ValueError(
f"legacy token {legacy_token!r} maps to multiple canonicals: "
f"{legacy_to_canonical[legacy_token]!r} and {canonical!r}"
)
legacy_to_canonical[legacy_token] = canonical
return legacy_to_canonical
def _safe_token_regex(token: str) -> re.Pattern[str]:
# Replace token only when its not inside a larger identifier/path.
# - Disallow alnum/_ on the left (word char) to avoid mid-word matches.
# - Disallow alnum/_ or '/' or '-' on the right to avoid touching URLs, paths, file names,
# or hyphenated “subtokens” like IF.guard-POC.
return re.compile(rf"(?<![A-Za-z0-9_]){re.escape(token)}(?![A-Za-z0-9_/-])")
def refactor_text(text: str, *, legacy_map: dict[str, str], skip_canonicals: set[str]) -> tuple[str, Counter[str]]:
    """Strip scaffold labels from headings and rename legacy tokens.

    Two passes are applied in order:

    1. Every line matching ``SCAFFOLD_HEADING_RE`` is rewritten to keep only
       its '#' run and title, dropping the ``LABEL:`` prefix. All other lines
       are copied through unchanged.
    2. Every legacy token in *legacy_map* is replaced by its canonical name
       wherever ``_safe_token_regex`` finds it, except for entries whose
       canonical name is in *skip_canonicals*.

    Args:
        text: Document content to rewrite.
        legacy_map: Mapping of legacy token to canonical name.
        skip_canonicals: Canonical names whose legacy tokens are left alone.

    Returns:
        A ``(new_text, stats)`` pair. ``stats`` counts replacements per legacy
        token, plus heading rewrites under ``"__scaffold_heading_renames__"``.
    """
    stats: Counter[str] = Counter()

    # 1) Remove visible scaffolding labels from headings.
    out_lines: list[str] = []
    for raw in text.splitlines(keepends=True):
        line = raw.rstrip("\n")
        match = SCAFFOLD_HEADING_RE.match(line)
        if match is None:
            out_lines.append(raw)
            continue
        hashes, _label, title = match.groups()
        # Put back exactly what was stripped (a "\n" or nothing) instead of
        # always appending "\n": a final heading without a trailing newline,
        # or a line ended by some other separator that splitlines recognises,
        # must not gain an extra newline.
        newline = raw[len(line):]
        out_lines.append(f"{hashes} {title}{newline}")
        stats["__scaffold_heading_renames__"] += 1
    out = "".join(out_lines)

    # 2) Apply legacy -> canonical replacements.
    # Sort by length so longer tokens are replaced first (defensive; avoids a
    # short token consuming part of a longer one).
    # NOTE: passes run one after another over the already-rewritten text, so a
    # canonical name inserted by one pass is visible to the following passes.
    legacy_items = sorted(legacy_map.items(), key=lambda kv: len(kv[0]), reverse=True)
    for legacy_token, canonical in legacy_items:
        if canonical in skip_canonicals:
            continue
        pattern = _safe_token_regex(legacy_token)
        # A callable replacement is inserted verbatim; a plain string would be
        # treated as a template in which backslashes and \g<...> are special.
        out, n = pattern.subn(lambda _match, _replacement=canonical: _replacement, out)
        if n:
            stats[legacy_token] += n
    return out, stats
def main() -> int:
    """Command-line entry point: refactor one Markdown file in place.

    Loads the naming registry, builds the legacy-to-canonical map, rewrites
    the target file through ``refactor_text`` and writes it back only when
    the content actually changed. A short summary is printed either way.

    Returns:
        Always 0.
    """
    # Both default paths live one directory above the one holding this script.
    base_dir = Path(__file__).resolve().parents[1]

    cli = argparse.ArgumentParser(description="Apply IF naming refactor + remove scaffold headings.")
    cli.add_argument(
        "--registry",
        type=Path,
        default=base_dir / "IF_NAMING_REGISTRY.yaml",
        help="Path to IF_NAMING_REGISTRY.yaml",
    )
    cli.add_argument(
        "--file",
        type=Path,
        default=base_dir / "DANNY_STOCKER_INFRAFABRIC_DOSSIER.md",
        help="Markdown file to refactor in-place",
    )
    # NOTE(review): with action="append" and a non-empty default, values given
    # on the command line are appended to the default list rather than
    # replacing it - confirm that is the intended behaviour.
    cli.add_argument(
        "--skip-canonical",
        action="append",
        default=["IF.AUDIT.TRAIL"],
        help="Canonical name(s) to skip when applying legacy replacements (repeatable).",
    )
    opts = cli.parse_args()

    registry_doc = yaml.safe_load(opts.registry.read_text(encoding="utf-8"))
    legacy_map = _build_legacy_map(registry_doc)
    original = opts.file.read_text(encoding="utf-8")
    skipped = set(opts.skip_canonical or [])
    rewritten, counts = refactor_text(original, legacy_map=legacy_map, skip_canonicals=skipped)

    if rewritten != original:
        opts.file.write_text(rewritten, encoding="utf-8")
        print(f"updated: {opts.file}")
        if counts:
            print("changes:")
            for label, total in counts.most_common():
                print(f" {label}: {total}")
    else:
        print("no changes")
    return 0
# Run the CLI only when this file is executed as a script; the integer
# returned by main() is handed to SystemExit as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())