80 lines
2.5 KiB
Python
80 lines
2.5 KiB
Python
"""Downloader for GovInfo bulk/REST endpoints."""
|
|
from __future__ import annotations
|
|
|
|
from typing import Dict, Optional
|
|
|
|
import requests
|
|
|
|
from download_http import DownloadResult
|
|
|
|
|
|
def _stream_to_disk(resp, local_path) -> tuple[int, str]:
    """Write the body of ``resp`` to ``local_path`` and return ``(bytes_written, sha256_hex)``.

    The body is streamed into a ``<local_path>.part`` sibling and only moved
    over ``local_path`` once every chunk has been written, so a failed or
    interrupted transfer never leaves a truncated file at the final location.
    Any exception from the network read or the filesystem propagates to the
    caller after the partial file has been removed.
    """
    import hashlib
    import os

    parent = os.path.dirname(local_path)
    # ``dirname`` is "" for a bare file name; ``os.makedirs("")`` would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)

    part_path = os.fspath(local_path) + ".part"
    digest = hashlib.sha256()
    bytes_written = 0
    completed = False
    try:
        # Open once in "wb": the file is truncated, so a re-download replaces
        # the previous content instead of appending to it.
        with open(part_path, "wb") as fh:
            for chunk in resp.iter_content(32768):
                if chunk:
                    fh.write(chunk)
                    digest.update(chunk)
                    bytes_written += len(chunk)
        os.replace(part_path, local_path)
        completed = True
    finally:
        if not completed:
            # Best-effort cleanup; the original exception is what matters.
            try:
                os.remove(part_path)
            except OSError:
                pass
    return bytes_written, digest.hexdigest()


def download_govinfo(item: Dict, api_key: Optional[str] = None) -> DownloadResult:
    """Attempt to download a GovInfo item.

    ``item`` should include ``inventory_path``, ``document_name``, ``url`` (or ``api_endpoint``) and ``local_path``.
    Missing keys fall back to ``"us_federal"``, ``"unknown"`` and
    ``"raw/us_federal/unknown"`` respectively; ``url`` has no fallback.

    Args:
        item: Mapping describing the document to fetch.
        api_key: Optional key sent as the ``X-Api-Key`` request header.

    Returns:
        A ``DownloadResult`` whose status is one of:

        * ``"success"`` - body written to ``local_path``; ``bytes``, ``sha256``
          and ``http_status`` are filled in.
        * ``"requires_login"`` - the server answered 401 or 403.
        * ``"rate_limited"`` - the server answered 429; ``notes`` carries the
          ``Retry-After`` header when present.
        * ``"error"`` - no URL was supplied, the request failed, the server
          returned another error status, or writing to disk failed.

    This function does not raise for network or filesystem errors; they are
    reported through the returned ``DownloadResult``.
    """
    url = item.get("url") or item.get("api_endpoint")
    inventory_path = item.get("inventory_path", "us_federal")
    document_name = item.get("document_name", "unknown")
    local_path = item.get("local_path", "raw/us_federal/unknown")

    def _result(status, **fields):
        # Every outcome shares the same five positional fields.
        return DownloadResult(inventory_path, document_name, url, local_path, status, **fields)

    if not url:
        # Fail with a clear message instead of requests' "Invalid URL 'None'".
        return _result("error", notes="Item has neither 'url' nor 'api_endpoint'.")

    headers = {}
    if api_key:
        headers["X-Api-Key"] = api_key

    try:
        resp = requests.get(url, headers=headers, stream=True, timeout=30)
    except requests.exceptions.RequestException as exc:  # type: ignore[attr-defined]
        return _result("error", notes=str(exc))

    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed, so close it on every exit path.
    try:
        status_code = resp.status_code
        if status_code in {401, 403}:
            return _result(
                "requires_login",
                http_status=status_code,
                notes="GovInfo requires API key; set GOVINFO_API_KEY.",
            )
        if status_code == 429:
            return _result(
                "rate_limited",
                http_status=429,
                notes=resp.headers.get("Retry-After", "Rate limited"),
            )
        try:
            resp.raise_for_status()
        except requests.exceptions.RequestException as exc:  # type: ignore[attr-defined]
            # Keep the status code so callers can tell a 404 from a 500.
            return _result("error", http_status=status_code, notes=str(exc))

        # Stream to disk
        try:
            bytes_written, sha256 = _stream_to_disk(resp, local_path)
        except Exception as exc:  # noqa: BLE001
            return _result("error", notes=str(exc))

        return _result(
            "success",
            bytes=bytes_written,
            sha256=sha256,
            http_status=status_code,
        )
    finally:
        resp.close()
|
|
|
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = ["download_govinfo"]
|