if-legal-corpus/scripts/download_govinfo.py

80 lines
2.5 KiB
Python

"""Downloader for GovInfo bulk/REST endpoints."""
from __future__ import annotations
from typing import Dict, Optional
import requests
from download_http import DownloadResult
def download_govinfo(item: Dict, api_key: Optional[str] = None) -> DownloadResult:
"""Attempt to download a GovInfo item.
``item`` should include ``inventory_path``, ``document_name``, ``url`` (or ``api_endpoint``) and ``local_path``.
"""
url = item.get("url") or item.get("api_endpoint")
inventory_path = item.get("inventory_path", "us_federal")
document_name = item.get("document_name", "unknown")
local_path = item.get("local_path", "raw/us_federal/unknown")
headers = {}
if api_key:
headers["X-Api-Key"] = api_key
try:
resp = requests.get(url, headers=headers, stream=True, timeout=30)
if resp.status_code in {401, 403}:
return DownloadResult(
inventory_path,
document_name,
url,
local_path,
"requires_login",
http_status=resp.status_code,
notes="GovInfo requires API key; set GOVINFO_API_KEY.",
)
if resp.status_code == 429:
return DownloadResult(
inventory_path,
document_name,
url,
local_path,
"rate_limited",
http_status=429,
notes=resp.headers.get("Retry-After", "Rate limited"),
)
resp.raise_for_status()
except requests.exceptions.RequestException as exc: # type: ignore[attr-defined]
return DownloadResult(inventory_path, document_name, url, local_path, "error", notes=str(exc))
# Stream to disk
bytes_written = 0
sha256 = ""
try:
import hashlib
import os
os.makedirs(os.path.dirname(local_path), exist_ok=True)
h = hashlib.sha256()
for chunk in resp.iter_content(32768):
if chunk:
with open(local_path, "ab") as f:
f.write(chunk)
h.update(chunk)
bytes_written += len(chunk)
sha256 = h.hexdigest()
except Exception as exc: # noqa: BLE001
return DownloadResult(inventory_path, document_name, url, local_path, "error", notes=str(exc))
return DownloadResult(
inventory_path,
document_name,
url,
local_path,
"success",
bytes=bytes_written,
sha256=sha256,
http_status=resp.status_code,
)
__all__ = ["download_govinfo"]