# if-legal-corpus/scripts/download_http.py
#
# 104 lines
# 3 KiB
# Python

"""HTTP downloader helper."""
from __future__ import annotations
import hashlib
import os
from dataclasses import asdict, dataclass
from typing import Dict, Optional
import requests
@dataclass
class DownloadResult:
    """Outcome of a single download attempt, shaped for a manifest row.

    Only the first five fields are required; the remaining ones default to
    "nothing downloaded" values so failure results can omit them.
    """

    # Caller-supplied identifiers, stored as given.
    inventory_path: str
    document_name: str
    # URL that was requested; exported as ``url_used`` by ``to_dict``.
    url: str
    # Destination path on disk for the downloaded file.
    local_path: str
    # Outcome label; ``safe_http_download`` in this module uses "success",
    # "requires_login" and "error".
    status: str
    # Number of bytes written. The name shadows the builtin inside the class
    # body but is kept because it is part of the public field set.
    bytes: int = 0
    # Hex SHA-256 digest of the downloaded content, empty when not downloaded.
    sha256: str = ""
    # Free-form detail, e.g. the error message on failure.
    notes: str = ""
    # HTTP status code when one is known, otherwise None.
    http_status: Optional[int] = None

    def to_dict(self) -> Dict[str, object]:
        """Return the fields as a plain dict with ``url`` renamed to ``url_used``.

        Values keep their field types (``bytes`` stays an int and
        ``http_status`` may be None), so the mapping is not string-only.
        """
        data = asdict(self)
        # Align field name with manifest expectation
        data["url_used"] = data.pop("url")
        return data
def download_file(url: str, out_path: str, timeout: int = 30) -> Dict[str, object]:
    """Download a file over HTTP to ``out_path`` with SHA-256 integrity.

    The body is streamed in 32 KiB chunks and hashed while it is written, so
    the whole payload is never held in memory.

    Args:
        url: Address to fetch with ``requests.get``.
        out_path: Destination file path. Its parent directory is created when
            the path has one; a bare filename is written to the current
            working directory.
        timeout: Passed through to ``requests.get`` as its ``timeout``.

    Returns:
        A manifest-friendly dictionary with the keys ``url``, ``local_path``,
        ``bytes`` (int), ``sha256`` (hex digest), ``status`` (always
        ``"success"``) and ``notes`` (empty string).

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            ``raise_for_status`` rejects the response.
        OSError: If the directory or file cannot be created or written.

    Note:
        If streaming fails part-way, the partially written file is left at
        ``out_path``.
    """
    resp = requests.get(url, stream=True, timeout=timeout)
    try:
        resp.raise_for_status()

        # os.makedirs("") raises FileNotFoundError, so only create a parent
        # directory when out_path actually has one.
        parent_dir = os.path.dirname(out_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        h = hashlib.sha256()
        bytes_written = 0
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(32768):
                if chunk:
                    f.write(chunk)
                    h.update(chunk)
                    bytes_written += len(chunk)
    finally:
        # A streamed response holds its connection until closed; release it
        # on both the success and the failure path.
        resp.close()

    return {
        "url": url,
        "local_path": out_path,
        "bytes": bytes_written,
        "sha256": h.hexdigest(),
        "status": "success",
        "notes": "",
    }
def safe_http_download(
    url: str, out_path: str, inventory_path: str, document_name: str
) -> DownloadResult:
    """Download ``url`` to ``out_path`` and report the outcome without raising
    for ``requests`` errors.

    Wraps ``download_file`` and converts any
    ``requests.exceptions.RequestException`` into a ``DownloadResult``.

    Args:
        url: Address to download.
        out_path: Destination file path, recorded as ``local_path``.
        inventory_path: Stored unchanged on the result.
        document_name: Stored unchanged on the result.

    Returns:
        A ``DownloadResult`` whose ``status`` is:
        * ``"success"`` with ``bytes``/``sha256`` filled in and
          ``http_status`` 200;
        * ``"requires_login"`` for an HTTP 401 or 403 response;
        * ``"error"`` for any other HTTP, SSL or request failure, with the
          message in ``notes``.

    Exceptions that are not ``RequestException`` subclasses (for example an
    ``OSError`` while writing the file) are not caught here.
    """

    def _failure(
        status: str, notes: str, http_status: Optional[int] = None
    ) -> DownloadResult:
        # Build the result shared by every failure path.
        return DownloadResult(
            inventory_path=inventory_path,
            document_name=document_name,
            url=url,
            local_path=out_path,
            status=status,
            notes=notes,
            http_status=http_status,
        )

    try:
        info = download_file(url, out_path)
    except requests.exceptions.HTTPError as exc:
        # Compare with None explicitly: a requests.Response is falsy for any
        # 4xx/5xx status, so a truthiness test would discard the status code
        # of exactly the responses handled here.
        response = exc.response
        status_code = response.status_code if response is not None else None
        status = "requires_login" if status_code in {401, 403} else "error"
        return _failure(status, f"HTTP error: {exc}", status_code)
    except requests.exceptions.SSLError as exc:
        # Must precede RequestException, which is its base class.
        return _failure("error", f"SSL error: {exc}")
    except requests.exceptions.RequestException as exc:
        return _failure("error", f"Request error: {exc}")

    return DownloadResult(
        inventory_path=inventory_path,
        document_name=document_name,
        url=url,
        local_path=out_path,
        status="success",
        bytes=info.get("bytes", 0),
        sha256=info.get("sha256", ""),
        notes=info.get("notes", ""),
        http_status=200,
    )
# Names exported by ``from <module> import *``.
__all__ = [
    "download_file",
    "safe_http_download",
    "DownloadResult",
]