if-legal-corpus/scripts/download_ecfr.py

72 lines
2.3 KiB
Python

"""Downloader for the eCFR v1 API or HTML endpoints."""
from __future__ import annotations
from typing import Dict
import requests
from download_http import DownloadResult
def download_ecfr(item: Dict, api_base: str = "https://api.ecfr.gov/v1") -> DownloadResult:
    """Download one eCFR resource described by *item* and save it locally.

    Args:
        item: Inventory entry. Keys read here: ``inventory_path``,
            ``document_name``, ``path`` (with ``url`` as a fallback),
            ``local_path`` and ``params``. Every key is optional and has a
            default.
        api_base: Prefix joined to ``path`` when ``path`` is not already an
            absolute ``http://`` / ``https://`` URL. Used as the request URL
            itself when the item carries neither ``path`` nor ``url``.

    Returns:
        A ``DownloadResult`` with one of these statuses:

        * ``"success"`` - the body was written to ``local_path``; ``bytes``
          and ``sha256`` describe the raw response body.
        * ``"rate_limited"`` - the server answered HTTP 429; ``notes`` holds
          the ``Retry-After`` header, or ``"Rate limited"`` if it is absent.
        * ``"error"`` - a ``requests`` exception occurred or writing the
          files failed; ``notes`` holds the exception text.
    """
    import hashlib
    import json
    import os

    inventory_path = item.get("inventory_path", "us_federal")
    document_name = item.get("document_name", "unknown")
    path = item.get("path") or item.get("url")
    local_path = item.get("local_path", "raw/us_federal/ecfr_unknown.json")
    params = item.get("params") or {}

    if not path:
        url = api_base
    elif path.startswith(("http://", "https://")):
        url = path
    else:
        # Join with exactly one slash so that neither "v1" + "titles" nor
        # "v1/" + "/titles" produces a malformed URL.
        url = f"{api_base.rstrip('/')}/{path.lstrip('/')}"

    try:
        resp = requests.get(url, params=params, timeout=30)
        # Handle 429 before raise_for_status() so throttling is reported with
        # its own status instead of as a generic error.
        if resp.status_code == 429:
            return DownloadResult(
                inventory_path,
                document_name,
                url,
                local_path,
                "rate_limited",
                http_status=429,
                notes=resp.headers.get("Retry-After", "Rate limited"),
            )
        resp.raise_for_status()
    except requests.exceptions.RequestException as exc:  # type: ignore[attr-defined]
        return DownloadResult(inventory_path, document_name, url, local_path, "error", notes=str(exc))

    try:
        raw_bytes = resp.content

        # dirname() is "" for a bare file name, and makedirs("") raises.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Only the parse decides JSON vs. text; a failure while *writing*
        # JSON must surface as an error, not trigger the text fallback.
        try:
            payload = resp.json()
        except ValueError:
            with open(local_path, "w", encoding="utf-8") as fh:
                fh.write(resp.text)
        else:
            with open(local_path, "w", encoding="utf-8") as fh:
                json.dump(payload, fh)
            # Human-readable copy written next to the compact file.
            with open(f"{local_path}.pretty.json", "w", encoding="utf-8") as fh:
                json.dump(payload, fh, indent=2)
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: any local failure is reported through the
        # result object rather than raised to the caller.
        return DownloadResult(inventory_path, document_name, url, local_path, "error", notes=str(exc))

    # NOTE(review): size and digest are taken from the raw HTTP body, while
    # the file on disk is re-serialised JSON (or re-encoded text), so the
    # digest will generally not match a hash of ``local_path`` itself.
    # Confirm this is the intended meaning of ``sha256``.
    return DownloadResult(
        inventory_path,
        document_name,
        url,
        local_path,
        "success",
        bytes=len(raw_bytes),
        sha256=hashlib.sha256(raw_bytes).hexdigest(),
        http_status=resp.status_code,
    )
# Public names exported by a wildcard import of this module.
__all__ = ["download_ecfr"]