# if-legal-corpus/scripts/download_all.py
"""Orchestrator for downloading items listed in LEGAL_CORPUS_IMPORT_LIST.md."""
from __future__ import annotations

import argparse
import csv
import datetime as dt
import os
import re
from typing import Dict, List

from slugify import slugify

import download_http
from download_austlii import download_austlii
from download_canlii import download_canlii
from download_courtlistener import download_courtlistener
from download_datasets import download_dataset
from download_ecfr import download_ecfr
from download_eurlex import download_eurlex
from download_govinfo import download_govinfo
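
# Each download_* helper above is a sibling module in this repo. Based on how
# their results are consumed in process_item below, each is assumed to return
# a result object exposing .to_dict() with the manifest fields.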

# Maps the numbered inventory section prefixes to raw/ subdirectories.
_SECTION_SUBDIRS = [
    ("1. us federal law", "us_federal"),
    ("2. us state law", "us_state"),
    ("3. european union", "eu"),
    ("4. germany", "germany"),
    ("5. france", "france"),
    ("6. canada", "canada"),
    ("7. australia", "australia"),
    ("8. united kingdom", "uk"),
    ("9. contract datasets", "datasets"),
    ("10. landmark case law", "caselaw"),
    ("11. industry standards", "industry"),
]


def _infer_subdir(inventory_path: str) -> str:
    """Infer a raw/ subdirectory from the inventory section path."""
    inventory_path = inventory_path.lower()
    for prefix, subdir in _SECTION_SUBDIRS:
        if inventory_path.startswith(prefix):
            return subdir
    return "misc"

def parse_inventory(path: str) -> List[Dict]:
    """Parse the markdown inventory into a list of items.

    The parser walks headings and markdown tables, extracting any row with an
    HTTP(S) URL.
    """
    items: List[Dict] = []
    if not os.path.exists(path):
        return items
    current_section = ""
    current_subsection = ""
    headers: List[str] = []

    def parse_row(line: str) -> List[str]:
        return [cell.strip() for cell in line.strip().strip("|").split("|")]

    def clean(value: str) -> str:
        # Strip markdown bold markers and surrounding whitespace.
        return re.sub(r"\*\*", "", value).strip()

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped.startswith("## "):
                current_section = stripped[3:].strip()
                current_subsection = ""
                headers = []
                continue
            if stripped.startswith("### "):
                current_subsection = stripped[4:].strip()
                headers = []
                continue
            if not stripped.startswith("|"):
                continue
            # Separator row like |-----|----| (possibly with ':' alignment markers).
            if not stripped.replace("|", "").replace("-", "").replace(":", "").replace(" ", ""):
                continue
            cells = parse_row(stripped)
            # The first pipe row after a heading is treated as the header row.
            if not headers:
                headers = cells
                continue
            # Data row
            row = {k: clean(v) for k, v in zip(headers, cells)}
            # Extract the first URL-like token from any cell.
            url = ""
            for value in row.values():
                match = re.search(r"https?://\S+", value)
                if match:
                    url = match.group(0).rstrip(")")
                    break
            # Build a document name from common column names.
            doc_name = (
                row.get("Document")
                or row.get("Dataset")
                or row.get("Case")
                or row.get("Article")
                or row.get("Section")
                or row.get(headers[0], "document")
            )
            subject = row.get("Subject") or row.get("Key Issue") or row.get("Standard")
            if subject:
                doc_name = f"{doc_name} - {subject}"
            inventory_path = " / ".join(
                p for p in (current_section, current_subsection) if p
            )
            subdir = _infer_subdir(current_section or "")
            item: Dict = {
                "inventory_path": inventory_path or "unspecified",
                "document_name": doc_name,
                "url": url,
                "priority": row.get("Priority", ""),
                "subdir": subdir,
            }
            if "CELEX ID" in row:
                item["celex_id"] = row["CELEX ID"]
            # Flag datasets for special handling.
            if current_section.lower().startswith("9. contract datasets"):
                item["type"] = "dataset"
            # Rows without URLs (for example, many case law entries) are kept
            # so they appear in the manifest with `no_direct_link`.
            if not url:
                item["no_direct_link"] = True
            items.append(item)
    return items
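
# For orientation, a sketch of the kind of inventory row the parser expects
# (the exact column names in LEGAL_CORPUS_IMPORT_LIST.md may differ):
#
#   ## 3. European Union
#   | Document | CELEX ID | Priority | Source |
#   |----------|----------|----------|--------|
#   | **GDPR** | 32016R0679 | High | https://eur-lex.europa.eu/... |
#
# would yield an item roughly like:
#   {"inventory_path": "3. European Union", "document_name": "GDPR",
#    "url": "https://eur-lex.europa.eu/...", "priority": "High",
#    "subdir": "eu", "celex_id": "32016R0679"}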

def write_manifest_row(manifest_path: str, result: Dict[str, str]) -> None:
    """Append one result row to the manifest CSV, writing a header if new."""
    exists = os.path.exists(manifest_path)
    fieldnames = [
        "inventory_path",
        "document_name",
        "url_used",
        "local_path",
        "status",
        "bytes",
        "sha256",
        "notes",
    ]
    with open(manifest_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not exists:
            writer.writeheader()
        writer.writerow({k: result.get(k, "") for k in fieldnames})

def log_download(log_path: str, result: Dict[str, str]) -> None:
    """Append one timestamped result row to the download log CSV."""
    exists = os.path.exists(log_path)
    fieldnames = [
        "timestamp",
        "inventory_path",
        "document_name",
        "url_used",
        "local_path",
        "status",
        "bytes",
        "sha256",
        "notes",
    ]
    with open(log_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not exists:
            writer.writeheader()
        row = {k: result.get(k, "") for k in fieldnames if k != "timestamp"}
        # datetime.utcnow() is deprecated; use an explicit UTC timezone.
        row["timestamp"] = dt.datetime.now(dt.timezone.utc).isoformat()
        writer.writerow(row)

def choose_downloader(item: Dict) -> str:
    """Pick a downloader name from the item type and the URL host."""
    url = item.get("url", "") or ""
    if item.get("type") == "dataset":
        return "dataset"
    if "courtlistener" in url:
        return "courtlistener"
    if "govinfo.gov" in url:
        return "govinfo"
    if "ecfr.gov" in url:
        return "ecfr"
    if "eur-lex.europa.eu" in url:
        return "eurlex"
    if "canlii.org" in url:
        return "canlii"
    if "austlii" in url:  # covers austlii.edu.au and mirror hosts
        return "austlii"
    return "http"

def process_item(item: Dict, args) -> Dict[str, str]:
    """Download a single inventory item and return its manifest row dict."""
    inventory_path = item.get("inventory_path", "unspecified")
    document_name = item.get("document_name", "document")
    url = item.get("url", "")
    # Items like many case law entries have no direct link yet.
    if item.get("no_direct_link") and not url:
        result = download_http.DownloadResult(
            inventory_path=inventory_path,
            document_name=document_name,
            url="",
            local_path="",
            status="no_direct_link",
            notes="No direct URL in inventory; extend downloader to handle by citation or identifier.",
        )
        return result.to_dict()
    downloader = choose_downloader(item)
    safe_name = slugify(document_name) or "document"
    subdir = item.get("subdir") or _infer_subdir(inventory_path)
    base_dir = os.path.join("raw", subdir)
    os.makedirs(base_dir, exist_ok=True)
    filename = safe_name
    # Preserve the file extension when the URL ends with an obvious one.
    for ext in (".pdf", ".html", ".htm", ".xml", ".json"):
        if url.lower().endswith(ext):
            filename = f"{safe_name}{ext}"
            break
    local_path = os.path.join(base_dir, filename)
    item["local_path"] = local_path
    if downloader == "http":
        result = download_http.safe_http_download(url, local_path, inventory_path, document_name)
    elif downloader == "govinfo":
        result = download_govinfo(item, api_key=args.govinfo_api_key)
    elif downloader == "ecfr":
        result = download_ecfr(item)
    elif downloader == "eurlex":
        # Prefer a CELEX-based download when a CELEX ID is present.
        if item.get("celex_id"):
            result = download_eurlex(item)
        else:
            result = download_http.safe_http_download(url, local_path, inventory_path, document_name)
    elif downloader == "canlii":
        result = download_canlii(item)
    elif downloader == "austlii":
        result = download_austlii(item)
    elif downloader == "courtlistener":
        result = download_courtlistener(item, token=args.courtlistener_token)
    elif downloader == "dataset":
        result = download_dataset(item)
    else:
        result = download_http.DownloadResult(
            inventory_path,
            document_name,
            url,
            local_path,
            "error",
            notes="Unknown downloader",
        )
    return result.to_dict()
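
# The dict returned above is expected to carry the manifest columns, with
# illustrative values only:
#   {"inventory_path": "3. European Union", "document_name": "GDPR",
#    "url_used": "https://eur-lex.europa.eu/...", "local_path": "raw/eu/gdpr.html",
#    "status": "ok", "bytes": "123456", "sha256": "...", "notes": ""}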

def main() -> None:
    """Parse the inventory and download every item, recording each result."""
    parser = argparse.ArgumentParser(description="Download legal corpus inventory")
    parser.add_argument(
        "--inventory",
        default="LEGAL_CORPUS_IMPORT_LIST.md",
        help="Inventory markdown file",
    )
    parser.add_argument(
        "--concurrency",
        type=int,
        default=1,
        help="Not used yet; reserved for future parallelism",
    )
    parser.add_argument(
        "--retry-status",
        default=None,
        help="Reserved: filter retries by a previous status (not implemented yet)",
    )
    parser.add_argument(
        "--govinfo-api-key",
        dest="govinfo_api_key",
        default=os.environ.get("GOVINFO_API_KEY"),
    )
    parser.add_argument(
        "--courtlistener-token",
        dest="courtlistener_token",
        default=os.environ.get("COURTLISTENER_TOKEN"),
    )
    args = parser.parse_args()
    inventory_items = parse_inventory(args.inventory)
    manifest_path = "manifests/download_manifest.csv"
    log_path = "logs/download_log.csv"
    # Create output directories before writing any row, including the
    # placeholder row below; writing first would fail if they are missing.
    os.makedirs(os.path.dirname(manifest_path), exist_ok=True)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    if not inventory_items:
        placeholder = {
            "inventory_path": "pending_inventory",
            "document_name": "pending_inventory",
            "url_used": "",
            "local_path": "",
            "status": "pending_inventory",
            "bytes": "",
            "sha256": "",
            "notes": "Inventory missing; provide LEGAL_CORPUS_IMPORT_LIST.md",
        }
        write_manifest_row(manifest_path, placeholder)
        log_download(log_path, placeholder)
        print("Inventory missing; wrote placeholder manifest row.")
        return
    for item in inventory_items:
        result = process_item(item, args)
        write_manifest_row(manifest_path, result)
        log_download(log_path, result)
        print(f"Processed {item.get('document_name')}: {result.get('status')}")

if __name__ == "__main__":
    main()
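
# A typical invocation, assuming the helper modules sit alongside this script
# and any credentials are exported in the environment:
#
#   GOVINFO_API_KEY=... COURTLISTENER_TOKEN=... \
#       python download_all.py --inventory LEGAL_CORPUS_IMPORT_LIST.md
#
# Downloads land under raw/<subdir>/, with one row per item appended to
# manifests/download_manifest.csv and logs/download_log.csv.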