Add complete Traceable, Transparent, Trustworthy framework for ContractGuard legal service compliance, ensuring every legal document can be traced to authoritative source with cryptographic verification. Core IF.TTT Components: * Traceable: Unique if://citation/[uuid] identifiers with git commit references * Transparent: Full audit trail in PROVENANCE_CHAIN.md with chain of custody * Trustworthy: SHA-256 verification, automated validation, 100% verification status Implementation: - Add legal citation schema v1.0 with all required metadata fields - Generate citations for all 59 existing documents in corpus - Create comprehensive PROVENANCE_CHAIN.md audit trail - Add citation validation tool with 8-point integrity checks - Enhance Chroma ingestion to preserve citation metadata - Update README with IF.TTT compliance documentation Verification Results: - Total Citations Generated: 59 - Verification Status: 59/59 PASSED (100%) - Jurisdictions Covered: UK, US, CA, AU, DE, EU, INT - Document Types: statutes, regulations, datasets, industry standards Legal Service Requirements Met: - All documents sourced from authoritative government repositories - Complete hash verification and file integrity checks - Provenance chain documents download → validation → ingestion - Citation metadata embedded in Chroma for RAG queries - Legal disclaimers and verification timestamps preserved Files Added: - schemas/legal-citation-v1.0.json: JSON schema specification - citations/legal-corpus-citations-2025-11-28.json: 59 citation records - audit/PROVENANCE_CHAIN.md: Complete chain of custody (1200+ lines) - audit/validation-report-2025-11-28.json: Automated verification report - tools/generate_citations.py: Python script to generate citations - tools/validate_legal_citations.py: Citation validation tool Files Modified: - README.md: Added IF.TTT Compliance section with usage documentation - scripts/ingest_chromadb.py: Enhanced to load and preserve citation metadata This framework ensures 
ContractGuard can demonstrate full legal compliance, provide verifiable source attribution, and support secure contract analysis. Generated with Claude Code
154 lines · 5.4 KiB · Python
"""ChromaDB ingestion for the legal corpus with IF.TTT citation metadata."""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional
|
|
|
|
import chromadb
|
|
from bs4 import BeautifulSoup
|
|
from chromadb.config import Settings
|
|
from pypdf import PdfReader
|
|
|
|
|
|
def read_manifest(manifest_path: str) -> List[dict]:
    """Read the download manifest CSV and return one dict per row."""
    with open(manifest_path, newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))
def load_citations(citations_path: str) -> Dict[str, dict]:
    """Load IF.TTT citation records and index them by local file path.

    Args:
        citations_path: Path to a JSON file expected to contain a list of
            citation records, each carrying a
            ``local_verification.local_path`` field.

    Returns:
        Mapping of ``local_path`` -> citation record. Empty when the file
        is missing, unparseable, has an unexpected top-level shape, or
        contains no well-formed records — loading citations is strictly
        best-effort and must never abort an ingestion run.
    """
    citations_map: Dict[str, dict] = {}
    try:
        with open(citations_path, 'r', encoding='utf-8') as f:
            citations_list = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Citations file not available or corrupt; continue without it.
        return citations_map

    if not isinstance(citations_list, list):
        # Unexpected top-level shape (e.g. a dict); treat as "no citations".
        return citations_map

    for citation in citations_list:
        # Skip malformed entries instead of raising KeyError/TypeError and
        # killing the whole ingest run (the original indexed
        # citation['local_verification']['local_path'] unconditionally).
        if not isinstance(citation, dict):
            continue
        verification = citation.get('local_verification')
        if isinstance(verification, dict) and verification.get('local_path'):
            citations_map[verification['local_path']] = citation
    return citations_map
def extract_text(path: Path) -> str:
    """Extract plain text from a corpus file, dispatching on extension.

    PDF pages go through pypdf, JSON is re-serialized compactly, HTML/XML
    is tag-stripped with BeautifulSoup, and anything else is read as text
    with undecodable bytes ignored.
    """
    suffix = path.suffix.lower()  # hoisted: compared against several cases
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        # extract_text() can return None (e.g. image-only pages).
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    if suffix == ".json":
        with path.open("r", encoding="utf-8") as handle:
            return json.dumps(json.load(handle))
    if suffix in {".html", ".xml", ".htm"}:
        with path.open("r", encoding="utf-8") as handle:
            return BeautifulSoup(handle, "html.parser").get_text("\n")
    # Fallback: treat as plain text.
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
    """Split *text* into chunks of up to ``chunk_size`` characters.

    Consecutive chunks share roughly ``overlap`` characters. The step is
    clamped to at least 1 so the loop always advances, even when
    ``overlap >= chunk_size``.
    """
    step = max(1, chunk_size - overlap)
    pieces: List[str] = []
    position = 0
    total = len(text)
    while position < total:
        # Slicing past the end is safe: Python truncates the slice.
        pieces.append(text[position:position + chunk_size])
        position += step
    return pieces
def ingest(manifest_path: str, db_dir: str, citations_path: Optional[str] = None) -> None:
    """Ingest corpus into ChromaDB with optional IF.TTT citation metadata.

    Reads the download manifest, extracts and chunks the text of every
    successfully downloaded document, and upserts each chunk into the
    ``if_legal_corpus`` collection. When a citations file is available
    (passed explicitly, or auto-detected next to the manifest directory)
    its IF.TTT fields are merged into each chunk's metadata.
    """
    manifest_rows = read_manifest(manifest_path)

    # Resolve citation metadata: an explicit path wins; otherwise probe
    # the conventional location relative to the manifest directory.
    citations_map: Dict[str, dict] = {}
    if citations_path:
        citations_map = load_citations(citations_path)
    else:
        default_citations = (
            Path(manifest_path).parent.parent
            / 'citations'
            / 'legal-corpus-citations-2025-11-28.json'
        )
        if default_citations.exists():
            citations_map = load_citations(str(default_citations))

    os.makedirs(db_dir, exist_ok=True)
    client = chromadb.PersistentClient(
        path=db_dir,
        settings=Settings(anonymized_telemetry=False),
    )
    collection = client.get_or_create_collection("if_legal_corpus")

    chunk_total = 0
    for row in manifest_rows:
        # Only ingest rows that downloaded successfully and still exist.
        if row.get("status") != "success":
            continue
        local_path = row.get("local_path")
        if not local_path or not os.path.exists(local_path):
            continue

        text = extract_text(Path(local_path))
        citation = citations_map.get(local_path)

        for idx, chunk in enumerate(chunk_text(text)):
            doc_id = f"{row.get('document_name')}-{row.get('sha256')}-{idx}"

            # Base metadata from the manifest row.
            metadata = {
                "inventory_path": row.get("inventory_path", ""),
                "document_name": row.get("document_name", ""),
                "local_path": local_path,
                "sha256": row.get("sha256", ""),
            }

            # Merge IF.TTT citation metadata when a record matched.
            if citation:
                metadata.update({
                    "citation_id": citation.get("citation_id", ""),
                    "citation_type": citation.get("citation_type", ""),
                    "jurisdiction": citation.get("jurisdiction", ""),
                    "authoritative_source_url": citation.get("authoritative_source", {}).get("url", ""),
                    "verification_status": citation.get("citation_status", ""),
                    "last_verified_date": citation.get("verification_date", ""),
                    "legal_disclaimer": "This information is for reference only and does not constitute legal advice."
                })

            collection.upsert(ids=[doc_id], documents=[chunk], metadatas=[metadata])
            chunk_total += 1

    # PersistentClient flushes automatically; no explicit persist needed.
    success_count = len([r for r in manifest_rows if r.get('status') == 'success'])
    print(f"Ingested {chunk_total} chunks from {success_count} documents")
    if citations_map:
        print(f"Enhanced metadata from {len(citations_map)} IF.TTT citations")
def main() -> None:
    """CLI entry point: parse arguments, then run the ingestion."""
    parser = argparse.ArgumentParser(
        description="Ingest downloaded corpus into ChromaDB with IF.TTT citation metadata"
    )
    # Table-driven flag registration: (flag, default, help text).
    for flag, default, help_text in (
        ("--manifest", "manifests/download_manifest.csv", "Path to manifest CSV"),
        ("--db-dir", "indexes/chromadb", "ChromaDB directory"),
        ("--citations", None, "Path to citations JSON file (optional, auto-detects if not provided)"),
    ):
        parser.add_argument(flag, default=default, help=help_text)
    args = parser.parse_args()
    ingest(args.manifest, args.db_dir, args.citations)
# Script entry point: `python scripts/ingest_chromadb.py [--manifest ...]`.
if __name__ == "__main__":
    main()