if-legal-corpus/scripts/ingest_chromadb.py
codex-bot a7e96b9ac0 feat: implement IF.TTT framework for legal corpus traceability
Add complete Traceable, Transparent, Trustworthy framework for ContractGuard
legal service compliance, ensuring every legal document can be traced to
authoritative source with cryptographic verification.

Core IF.TTT Components:
* Traceable: Unique if://citation/[uuid] identifiers with git commit references
* Transparent: Full audit trail in PROVENANCE_CHAIN.md with chain of custody
* Trustworthy: SHA-256 verification, automated validation, 100% verification status

Implementation:
- Add legal citation schema v1.0 with all required metadata fields
- Generate citations for all 59 existing documents in corpus
- Create comprehensive PROVENANCE_CHAIN.md audit trail
- Add citation validation tool with 8-point integrity checks
- Enhance Chroma ingestion to preserve citation metadata
- Update README with IF.TTT compliance documentation

Verification Results:
- Total Citations Generated: 59
- Verification Status: 59/59 PASSED (100%)
- Jurisdictions Covered: UK, US, CA, AU, DE, EU, INT
- Document Types: statutes, regulations, datasets, industry standards

Legal Service Requirements Met:
- All documents sourced from authoritative government repositories
- Complete hash verification and file integrity checks
- Provenance chain documents download → validation → ingestion
- Citation metadata embedded in Chroma for RAG queries
- Legal disclaimers and verification timestamps preserved

Files Added:
- schemas/legal-citation-v1.0.json: JSON schema specification
- citations/legal-corpus-citations-2025-11-28.json: 59 citation records
- audit/PROVENANCE_CHAIN.md: Complete chain of custody (1200+ lines)
- audit/validation-report-2025-11-28.json: Automated verification report
- tools/generate_citations.py: Python script to generate citations
- tools/validate_legal_citations.py: Citation validation tool

Files Modified:
- README.md: Added IF.TTT Compliance section with usage documentation
- scripts/ingest_chromadb.py: Enhanced to load and preserve citation metadata

This framework ensures ContractGuard can demonstrate full legal compliance,
provide verifiable source attribution, and support secure contract analysis.

Generated with Claude Code
2025-11-28 04:21:54 +01:00

154 lines
5.4 KiB
Python

"""ChromaDB ingestion for the legal corpus with IF.TTT citation metadata."""
from __future__ import annotations
import argparse
import csv
import json
import os
from pathlib import Path
from typing import List, Dict, Optional
import chromadb
from bs4 import BeautifulSoup
from chromadb.config import Settings
from pypdf import PdfReader
def read_manifest(manifest_path: str) -> List[dict]:
    """Return every row of the download-manifest CSV as a dict.

    Args:
        manifest_path: Path to the manifest CSV written by the download step.

    Returns:
        One dict per data row, keyed by the CSV header fields.
    """
    with open(manifest_path, newline="", encoding="utf-8") as handle:
        return [row for row in csv.DictReader(handle)]
def load_citations(citations_path: str) -> Dict[str, dict]:
    """Load the IF.TTT citations JSON and index the records by local path.

    Args:
        citations_path: Path to the citations JSON file — expected to be a
            list of citation records.

    Returns:
        Mapping of ``local_verification.local_path`` -> citation record.
        Empty when the file is missing or unparseable.  Individual records
        that lack the expected keys (or are not dicts) are skipped instead
        of aborting the whole load — previously a single malformed record
        raised an uncaught ``KeyError`` and failed the entire ingestion.
    """
    citations_map: Dict[str, dict] = {}
    try:
        with open(citations_path, "r", encoding="utf-8") as f:
            citations_list = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Citations file not available or invalid JSON; continue without it.
        return citations_map
    if not isinstance(citations_list, list):
        # Unexpected top-level shape; treat as "no citations".
        return citations_map
    for citation in citations_list:
        try:
            local_path = citation["local_verification"]["local_path"]
        except (KeyError, TypeError):
            # Malformed record: skip it rather than failing the whole load.
            continue
        citations_map[local_path] = citation
    return citations_map
def extract_text(path: Path) -> str:
    """Extract plain text from a corpus file, dispatching on its suffix.

    PDFs go through pypdf, JSON is re-serialized as a flat string, and
    HTML/XML is stripped of markup via BeautifulSoup.  Anything else is
    read as UTF-8 text with undecodable bytes ignored.
    """
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        pages = PdfReader(str(path)).pages
        return "\n".join(page.extract_text() or "" for page in pages)
    if suffix == ".json":
        with path.open("r", encoding="utf-8") as handle:
            return json.dumps(json.load(handle))
    if suffix in {".html", ".xml", ".htm"}:
        with path.open("r", encoding="utf-8") as handle:
            return BeautifulSoup(handle, "html.parser").get_text("\n")
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
    """Split *text* into overlapping windows of at most *chunk_size* chars.

    Consecutive chunks share *overlap* characters so sentence context is
    preserved across chunk boundaries.  An empty input yields no chunks.
    """
    # Guard against a non-positive stride when overlap >= chunk_size.
    stride = max(1, chunk_size - overlap)
    return [text[pos:pos + chunk_size] for pos in range(0, len(text), stride)]
def ingest(manifest_path: str, db_dir: str, citations_path: Optional[str] = None) -> None:
    """Ingest the downloaded corpus into ChromaDB with IF.TTT citation metadata.

    Args:
        manifest_path: CSV manifest produced by the download step.
        db_dir: Directory for the persistent ChromaDB store (created if absent).
        citations_path: Optional citations JSON; when omitted, the default
            location relative to the manifest directory is probed.
    """
    records = read_manifest(manifest_path)
    # Load IF.TTT citations if available; an explicit path wins over the default.
    citations_map: Dict[str, dict] = {}
    if citations_path:
        citations_map = load_citations(citations_path)
    else:
        # Try the default location shipped with the corpus snapshot.
        default_citations = (
            Path(manifest_path).parent.parent
            / 'citations'
            / 'legal-corpus-citations-2025-11-28.json'
        )
        if default_citations.exists():
            citations_map = load_citations(str(default_citations))
    os.makedirs(db_dir, exist_ok=True)
    client = chromadb.PersistentClient(
        path=db_dir,
        settings=Settings(anonymized_telemetry=False),
    )
    collection = client.get_or_create_collection("if_legal_corpus")
    # Filter the successful records once; the original recomputed this list
    # in the summary print below.
    successful = [r for r in records if r.get("status") == "success"]
    ingested_count = 0
    for record in successful:
        local_path = record.get("local_path")
        if not local_path or not os.path.exists(local_path):
            # Manifest claims success but the file is missing; skip quietly.
            continue
        text = extract_text(Path(local_path))
        # Look up citation metadata for this document, if any.
        citation = citations_map.get(local_path)
        for idx, chunk in enumerate(chunk_text(text)):
            # Deterministic chunk id: document name + content hash + chunk index,
            # so re-running the ingestion upserts rather than duplicates.
            doc_id = f"{record.get('document_name')}-{record.get('sha256')}-{idx}"
            # Base metadata from the manifest row.
            metadata = {
                "inventory_path": record.get("inventory_path", ""),
                "document_name": record.get("document_name", ""),
                "local_path": local_path,
                "sha256": record.get("sha256", ""),
            }
            # Flatten the nested IF.TTT citation record into scalar fields
            # Chroma can store alongside each chunk.
            if citation:
                metadata.update({
                    "citation_id": citation.get("citation_id", ""),
                    "citation_type": citation.get("citation_type", ""),
                    "jurisdiction": citation.get("jurisdiction", ""),
                    "authoritative_source_url": citation.get("authoritative_source", {}).get("url", ""),
                    "verification_status": citation.get("citation_status", ""),
                    "last_verified_date": citation.get("verification_date", ""),
                    "legal_disclaimer": "This information is for reference only and does not constitute legal advice."
                })
            collection.upsert(ids=[doc_id], documents=[chunk], metadatas=[metadata])
            ingested_count += 1
    # PersistentClient flushes automatically; nothing to do here.
    print(f"Ingested {ingested_count} chunks from {len(successful)} documents")
    if citations_map:
        print(f"Enhanced metadata from {len(citations_map)} IF.TTT citations")
def main() -> None:
    """Command-line entry point: parse arguments and run the ingestion."""
    parser = argparse.ArgumentParser(
        description="Ingest downloaded corpus into ChromaDB with IF.TTT citation metadata"
    )
    # (flag, default, help) table keeps the option definitions compact.
    option_table = (
        ("--manifest", "manifests/download_manifest.csv", "Path to manifest CSV"),
        ("--db-dir", "indexes/chromadb", "ChromaDB directory"),
        ("--citations", None, "Path to citations JSON file (optional, auto-detects if not provided)"),
    )
    for flag, default, help_text in option_table:
        parser.add_argument(flag, default=default, help=help_text)
    opts = parser.parse_args()
    ingest(opts.manifest, opts.db_dir, opts.citations)


if __name__ == "__main__":
    main()