if-legal-corpus/scripts/ingest_chromadb.py
codex-bot a7e96b9ac0 feat: implement IF.TTT framework for legal corpus traceability
Add complete Traceable, Transparent, Trustworthy framework for ContractGuard
legal service compliance, ensuring every legal document can be traced to
authoritative source with cryptographic verification.

Core IF.TTT Components:
* Traceable: Unique if://citation/[uuid] identifiers with git commit references
* Transparent: Full audit trail in PROVENANCE_CHAIN.md with chain of custody
* Trustworthy: SHA-256 verification, automated validation, 100% verification status

Implementation:
- Add legal citation schema v1.0 with all required metadata fields
- Generate citations for all 59 existing documents in corpus
- Create comprehensive PROVENANCE_CHAIN.md audit trail
- Add citation validation tool with 8-point integrity checks
- Enhance Chroma ingestion to preserve citation metadata
- Update README with IF.TTT compliance documentation

Verification Results:
- Total Citations Generated: 59
- Verification Status: 59/59 PASSED (100%)
- Jurisdictions Covered: UK, US, CA, AU, DE, EU, INT
- Document Types: statutes, regulations, datasets, industry standards

Legal Service Requirements Met:
- All documents sourced from authoritative government repositories
- Complete hash verification and file integrity checks
- Provenance chain documents download → validation → ingestion
- Citation metadata embedded in Chroma for RAG queries
- Legal disclaimers and verification timestamps preserved

Files Added:
- schemas/legal-citation-v1.0.json: JSON schema specification
- citations/legal-corpus-citations-2025-11-28.json: 59 citation records
- audit/PROVENANCE_CHAIN.md: Complete chain of custody (1200+ lines)
- audit/validation-report-2025-11-28.json: Automated verification report
- tools/generate_citations.py: Python script to generate citations
- tools/validate_legal_citations.py: Citation validation tool

Files Modified:
- README.md: Added IF.TTT Compliance section with usage documentation
- scripts/ingest_chromadb.py: Enhanced to load and preserve citation metadata

This framework ensures ContractGuard can demonstrate full legal compliance,
provide verifiable source attribution, and support secure contract analysis.

Generated with Claude Code
2025-11-28 04:21:54 +01:00

154 lines
5.4 KiB
Python

"""ChromaDB ingestion for the legal corpus with IF.TTT citation metadata."""
from __future__ import annotations
import argparse
import csv
import json
import os
from pathlib import Path
from typing import List, Dict, Optional
import chromadb
from bs4 import BeautifulSoup
from chromadb.config import Settings
from pypdf import PdfReader
def read_manifest(manifest_path: str) -> List[dict]:
    """Return every row of the download-manifest CSV as a dict.

    Args:
        manifest_path: Path to the manifest CSV written by the download step.

    Returns:
        One dict per data row, keyed by the CSV header fields.
    """
    with open(manifest_path, newline="", encoding="utf-8") as handle:
        return [row for row in csv.DictReader(handle)]
def load_citations(citations_path: str) -> Dict[str, dict]:
    """Load the IF.TTT citations JSON and index the records by local path.

    Args:
        citations_path: Path to the citations JSON file — expected to be a
            list of citation records.

    Returns:
        Mapping of ``local_verification.local_path`` -> citation record.
        Empty when the file is missing or unparseable.  Individual records
        that lack the expected keys (or are not dicts) are skipped instead
        of aborting the whole load — previously a single malformed record
        raised an uncaught ``KeyError`` and failed the entire ingestion.
    """
    citations_map: Dict[str, dict] = {}
    try:
        with open(citations_path, "r", encoding="utf-8") as f:
            citations_list = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Citations file not available or invalid JSON; continue without it.
        return citations_map
    if not isinstance(citations_list, list):
        # Unexpected top-level shape; treat as "no citations".
        return citations_map
    for citation in citations_list:
        try:
            local_path = citation["local_verification"]["local_path"]
        except (KeyError, TypeError):
            # Malformed record: skip it rather than failing the whole load.
            continue
        citations_map[local_path] = citation
    return citations_map
def extract_text(path: Path) -> str:
    """Extract plain text from a corpus file, dispatching on its suffix.

    PDFs go through pypdf, JSON is re-serialized as a flat string, and
    HTML/XML is stripped of markup via BeautifulSoup.  Anything else is
    read as UTF-8 text with undecodable bytes ignored.
    """
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        pages = PdfReader(str(path)).pages
        return "\n".join(page.extract_text() or "" for page in pages)
    if suffix == ".json":
        with path.open("r", encoding="utf-8") as handle:
            return json.dumps(json.load(handle))
    if suffix in {".html", ".xml", ".htm"}:
        with path.open("r", encoding="utf-8") as handle:
            return BeautifulSoup(handle, "html.parser").get_text("\n")
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
    """Split *text* into overlapping windows of at most *chunk_size* chars.

    Consecutive chunks share *overlap* characters so sentence context is
    preserved across chunk boundaries.  An empty input yields no chunks.
    """
    # Guard against a non-positive stride when overlap >= chunk_size.
    stride = max(1, chunk_size - overlap)
    return [text[pos:pos + chunk_size] for pos in range(0, len(text), stride)]
def ingest(manifest_path: str, db_dir: str, citations_path: Optional[str] = None) -> None:
    """Ingest the downloaded corpus into ChromaDB with IF.TTT citation metadata.

    Args:
        manifest_path: CSV manifest produced by the download step.
        db_dir: Directory for the persistent ChromaDB store (created if absent).
        citations_path: Optional citations JSON; when omitted, the default
            location relative to the manifest directory is probed.
    """
    records = read_manifest(manifest_path)
    # Load IF.TTT citations if available; an explicit path wins over the default.
    citations_map: Dict[str, dict] = {}
    if citations_path:
        citations_map = load_citations(citations_path)
    else:
        # Try the default location shipped with the corpus snapshot.
        default_citations = (
            Path(manifest_path).parent.parent
            / 'citations'
            / 'legal-corpus-citations-2025-11-28.json'
        )
        if default_citations.exists():
            citations_map = load_citations(str(default_citations))
    os.makedirs(db_dir, exist_ok=True)
    client = chromadb.PersistentClient(
        path=db_dir,
        settings=Settings(anonymized_telemetry=False),
    )
    collection = client.get_or_create_collection("if_legal_corpus")
    # Filter the successful records once; the original recomputed this list
    # in the summary print below.
    successful = [r for r in records if r.get("status") == "success"]
    ingested_count = 0
    for record in successful:
        local_path = record.get("local_path")
        if not local_path or not os.path.exists(local_path):
            # Manifest claims success but the file is missing; skip quietly.
            continue
        text = extract_text(Path(local_path))
        # Look up citation metadata for this document, if any.
        citation = citations_map.get(local_path)
        for idx, chunk in enumerate(chunk_text(text)):
            # Deterministic chunk id: document name + content hash + chunk index,
            # so re-running the ingestion upserts rather than duplicates.
            doc_id = f"{record.get('document_name')}-{record.get('sha256')}-{idx}"
            # Base metadata from the manifest row.
            metadata = {
                "inventory_path": record.get("inventory_path", ""),
                "document_name": record.get("document_name", ""),
                "local_path": local_path,
                "sha256": record.get("sha256", ""),
            }
            # Flatten the nested IF.TTT citation record into scalar fields
            # Chroma can store alongside each chunk.
            if citation:
                metadata.update({
                    "citation_id": citation.get("citation_id", ""),
                    "citation_type": citation.get("citation_type", ""),
                    "jurisdiction": citation.get("jurisdiction", ""),
                    "authoritative_source_url": citation.get("authoritative_source", {}).get("url", ""),
                    "verification_status": citation.get("citation_status", ""),
                    "last_verified_date": citation.get("verification_date", ""),
                    "legal_disclaimer": "This information is for reference only and does not constitute legal advice."
                })
            collection.upsert(ids=[doc_id], documents=[chunk], metadatas=[metadata])
            ingested_count += 1
    # PersistentClient flushes automatically; nothing to do here.
    print(f"Ingested {ingested_count} chunks from {len(successful)} documents")
    if citations_map:
        print(f"Enhanced metadata from {len(citations_map)} IF.TTT citations")
def main() -> None:
    """Command-line entry point: parse arguments and run the ingestion."""
    parser = argparse.ArgumentParser(
        description="Ingest downloaded corpus into ChromaDB with IF.TTT citation metadata"
    )
    # (flag, default, help) table keeps the option definitions compact.
    option_table = (
        ("--manifest", "manifests/download_manifest.csv", "Path to manifest CSV"),
        ("--db-dir", "indexes/chromadb", "ChromaDB directory"),
        ("--citations", None, "Path to citations JSON file (optional, auto-detects if not provided)"),
    )
    for flag, default, help_text in option_table:
        parser.add_argument(flag, default=default, help=help_text)
    opts = parser.parse_args()
    ingest(opts.manifest, opts.db_dir, opts.citations)


if __name__ == "__main__":
    main()