"""Orchestrator for downloading items listed in LEGAL_CORPUS_IMPORT_LIST.md."""

from __future__ import annotations

import argparse
import csv
import datetime as dt
import os
import re
from typing import Dict, List

from slugify import slugify

import download_http
from download_austlii import download_austlii
from download_canlii import download_canlii
from download_courtlistener import download_courtlistener
from download_datasets import download_dataset
from download_ecfr import download_ecfr
from download_eurlex import download_eurlex
from download_govinfo import download_govinfo


def _infer_subdir(inventory_path: str) -> str:
    """Infer a raw/ subdirectory from the inventory section path."""
    inventory_path = inventory_path.lower()
    if inventory_path.startswith("1. us federal law"):
        return "us_federal"
    if inventory_path.startswith("2. us state law"):
        return "us_state"
    if inventory_path.startswith("3. european union"):
        return "eu"
    if inventory_path.startswith("4. germany"):
        return "germany"
    if inventory_path.startswith("5. france"):
        return "france"
    if inventory_path.startswith("6. canada"):
        return "canada"
    if inventory_path.startswith("7. australia"):
        return "australia"
    if inventory_path.startswith("8. united kingdom"):
        return "uk"
    if inventory_path.startswith("9. contract datasets"):
        return "datasets"
    if inventory_path.startswith("10. landmark case law"):
        return "caselaw"
    if inventory_path.startswith("11. industry standards"):
        return "industry"
    return "misc"


def parse_inventory(path: str) -> List[Dict]:
    """Parse the markdown inventory into a list of items.

    The parser walks headings and markdown tables, extracting any row with an
    HTTP(S) URL.
    """
    items: List[Dict] = []
    if not os.path.exists(path):
        return items

    current_section = ""
    current_subsection = ""
    headers: List[str] = []

    def parse_row(line: str) -> List[str]:
        return [cell.strip() for cell in line.strip().strip("|").split("|")]

    def clean(value: str) -> str:
        return re.sub(r"\*\*", "", value).strip()

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            stripped = raw_line.strip()

            if stripped.startswith("## "):
                current_section = stripped[3:].strip()
                current_subsection = ""
                headers = []
                continue
            if stripped.startswith("### "):
                current_subsection = stripped[4:].strip()
                headers = []
                continue

            if not stripped.startswith("|"):
                continue

            # Skip separator rows like |-----|----| (alignment colons included).
            if not set(stripped) - set("|-: "):
                continue

            cells = parse_row(stripped)
            # The first table row after a heading is treated as the header row.
            if not headers:
                headers = cells
                continue

            # Data row: map header names to cleaned cell values.
            row = {k: clean(v) for k, v in zip(headers, cells)}

            # Take the first URL-like token found in any cell.
            url = ""
            for value in row.values():
                match = re.search(r"https?://\S+", value)
                if match:
                    url = match.group(0).rstrip(")")
                    break

            # Build a document name from common column names.
            doc_name = (
                row.get("Document")
                or row.get("Dataset")
                or row.get("Case")
                or row.get("Article")
                or row.get("Section")
                or row.get(headers[0], "document")
            )
            subject = row.get("Subject") or row.get("Key Issue") or row.get("Standard")
            if subject:
                doc_name = f"{doc_name} - {subject}"

            inventory_path = " / ".join(p for p in (current_section, current_subsection) if p)
            subdir = _infer_subdir(current_section or "")

            item: Dict = {
                "inventory_path": inventory_path or "unspecified",
                "document_name": doc_name,
                "url": url,
                "priority": row.get("Priority", ""),
                "subdir": subdir,
            }

            if "CELEX ID" in row:
                item["celex_id"] = row["CELEX ID"]

            # Flag datasets for special handling.
            if current_section.lower().startswith("9. contract datasets"):
                item["type"] = "dataset"

            # Rows without URLs (for example, many case law entries) are kept so
            # they still appear in the manifest with status `no_direct_link`.
            if not url:
                item["no_direct_link"] = True

            items.append(item)

    return items


def write_manifest_row(manifest_path: str, result: Dict[str, str]) -> None:
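    """Append one result row to the manifest CSV, writing the header on first use."""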
    exists = os.path.exists(manifest_path)
    with open(manifest_path, "a", newline="", encoding="utf-8") as f:
        fieldnames = [
            "inventory_path",
            "document_name",
            "url_used",
            "local_path",
            "status",
            "bytes",
            "sha256",
            "notes",
        ]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not exists:
            writer.writeheader()
        writer.writerow({k: result.get(k, "") for k in fieldnames})


def log_download(log_path: str, result: Dict[str, str]) -> None:
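    """Append a timestamped copy of the result row to the download log CSV."""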
    exists = os.path.exists(log_path)
    with open(log_path, "a", newline="", encoding="utf-8") as f:
        fieldnames = [
            "timestamp",
            "inventory_path",
            "document_name",
            "url_used",
            "local_path",
            "status",
            "bytes",
            "sha256",
            "notes",
        ]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not exists:
            writer.writeheader()
        row = {k: result.get(k, "") for k in fieldnames if k != "timestamp"}
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
        row["timestamp"] = dt.datetime.now(dt.timezone.utc).isoformat()
        writer.writerow(row)


def choose_downloader(item: Dict) -> str:
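    """Pick a downloader key for an item, dispatching on URL substrings.

    Dataset rows (flagged by parse_inventory) always use the dataset
    downloader; anything unrecognized falls back to plain HTTP.
    """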
    url = item.get("url", "") or ""
    if item.get("type") == "dataset":
        return "dataset"
    if "courtlistener" in url:
        return "courtlistener"
    if "govinfo.gov" in url:
        return "govinfo"
    if "ecfr.gov" in url:
        return "ecfr"
    if "eur-lex.europa.eu" in url:
        return "eurlex"
    if "canlii.org" in url:
        return "canlii"
    if "austlii" in url:
        return "austlii"
    return "http"


def process_item(item: Dict, args) -> Dict[str, str]:
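    """Download a single inventory item and return its manifest row as a dict.

    Items without a direct URL are recorded with status `no_direct_link`
    instead of being downloaded.
    """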
    inventory_path = item.get("inventory_path", "unspecified")
    document_name = item.get("document_name", "document")
    url = item.get("url", "")

    # Items like many case law entries have no direct link yet.
    if item.get("no_direct_link") and not url:
        return download_http.DownloadResult(
            inventory_path=inventory_path,
            document_name=document_name,
            url="",
            local_path="",
            status="no_direct_link",
            notes="No direct URL in inventory; extend downloader to handle by citation or identifier.",
        ).to_dict()

    downloader = choose_downloader(item)
    safe_name = slugify(document_name) or "document"
    subdir = item.get("subdir") or _infer_subdir(inventory_path)
    base_dir = os.path.join("raw", subdir)
    os.makedirs(base_dir, exist_ok=True)

    # Preserve the file extension when the URL ends with an obvious one.
    filename = safe_name
    for ext in (".pdf", ".html", ".htm", ".xml", ".json"):
        if url.lower().endswith(ext):
            filename = f"{safe_name}{ext}"
            break
    local_path = os.path.join(base_dir, filename)
    item["local_path"] = local_path

    if downloader == "http":
        result = download_http.safe_http_download(url, local_path, inventory_path, document_name)
    elif downloader == "govinfo":
        result = download_govinfo(item, api_key=args.govinfo_api_key)
    elif downloader == "ecfr":
        result = download_ecfr(item)
    elif downloader == "eurlex":
        # Prefer a CELEX-based download when a CELEX ID is present.
        if item.get("celex_id"):
            result = download_eurlex(item)
        else:
            result = download_http.safe_http_download(url, local_path, inventory_path, document_name)
    elif downloader == "canlii":
        result = download_canlii(item)
    elif downloader == "austlii":
        result = download_austlii(item)
    elif downloader == "courtlistener":
        result = download_courtlistener(item, token=args.courtlistener_token)
    elif downloader == "dataset":
        result = download_dataset(item)
    else:
        result = download_http.DownloadResult(
            inventory_path=inventory_path,
            document_name=document_name,
            url=url,
            local_path=local_path,
            status="error",
            notes="Unknown downloader",
        )
    return result.to_dict()


def main() -> None:
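    """Parse CLI arguments, walk the inventory, and record every download."""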
    parser = argparse.ArgumentParser(description="Download legal corpus inventory")
    parser.add_argument(
        "--inventory",
        default="LEGAL_CORPUS_IMPORT_LIST.md",
        help="Inventory markdown file",
    )
    parser.add_argument(
        "--concurrency",
        type=int,
        default=1,
        help="Not used yet; reserved for future parallelism",
    )
    parser.add_argument(
        "--retry-status",
        default=None,
        help="If set, only retry items with this status",
    )
    parser.add_argument(
        "--govinfo-api-key",
        dest="govinfo_api_key",
        default=os.environ.get("GOVINFO_API_KEY"),
    )
    parser.add_argument(
        "--courtlistener-token",
        dest="courtlistener_token",
        default=os.environ.get("COURTLISTENER_TOKEN"),
    )
    args = parser.parse_args()

    inventory_items = parse_inventory(args.inventory)
    manifest_path = "manifests/download_manifest.csv"
    log_path = "logs/download_log.csv"

    # Ensure output directories exist before any row is written, including the
    # placeholder row emitted when the inventory file is missing.
    os.makedirs(os.path.dirname(manifest_path), exist_ok=True)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    if not inventory_items:
        placeholder = {
            "inventory_path": "pending_inventory",
            "document_name": "pending_inventory",
            "url_used": "",
            "local_path": "",
            "status": "pending_inventory",
            "bytes": "",
            "sha256": "",
            "notes": "Inventory missing; provide LEGAL_CORPUS_IMPORT_LIST.md",
        }
        write_manifest_row(manifest_path, placeholder)
        log_download(log_path, placeholder)
        print("Inventory missing; wrote placeholder manifest row.")
        return

    for item in inventory_items:
        result = process_item(item, args)
        write_manifest_row(manifest_path, result)
        log_download(log_path, result)
        print(f"Processed {item.get('document_name')}: {result.get('status')}")


if __name__ == "__main__":
    main()