# Source code for pyfetcher.downloaders.direct

"""Direct HTTP download with MinIO upload for :mod:`pyfetcher.downloaders`.

Purpose:
    Provide direct HTTP file downloads using pyfetcher's existing fetch
    infrastructure, with optional streaming to MinIO.
"""

from __future__ import annotations

import hashlib
import tempfile
from pathlib import Path
from urllib.parse import urlsplit

from pyfetcher.contracts.request import FetchRequest
from pyfetcher.downloaders.base import DownloadResult, MediaInfo
from pyfetcher.fetch.service import FetchService


class DirectDownloader:
    """Direct HTTP downloader using pyfetcher's FetchService.

    Streams a remote file to local disk through the service's streaming
    API while computing its size and SHA-256 digest. The methods in this
    class only write to the local filesystem; no upload step is performed
    here.

    Args:
        fetch_service: Optional FetchService instance. A default
            ``FetchService()`` is created when omitted.
    """

    # Name used on disk when the URL path has no usable final segment.
    _DEFAULT_FILENAME = "download"

    def __init__(self, *, fetch_service: FetchService | None = None) -> None:
        # Compare against None explicitly so that an injected service which
        # happens to be falsy is not silently replaced by a default one.
        self._service = (
            fetch_service if fetch_service is not None else FetchService()
        )

    @staticmethod
    def _filename_from_url(url: str) -> str:
        """Return the final path segment of *url*, or ``""`` if unusable.

        Only the path component is considered, so query strings and
        fragments never end up in the file name. Empty segments (bare host,
        trailing slash) and the special segments ``.``/``..`` are rejected
        because they would resolve to a directory rather than a file.

        Args:
            url: File URL.

        Returns:
            The last path segment, or an empty string when none is usable.
        """
        try:
            path = urlsplit(url).path
        except ValueError:
            # urlsplit rejects some malformed URLs (e.g. bad IPv6 literals).
            return ""
        name = path.rsplit("/", 1)[-1]
        return "" if name in {"", ".", ".."} else name

    @staticmethod
    def _parse_content_length(value: object) -> int | None:
        """Convert a Content-Length header value to a positive int.

        Args:
            value: Raw header value, or ``None`` when the header is absent.

        Returns:
            The size in bytes, or ``None`` when the header is missing,
            malformed, zero, or negative.
        """
        try:
            size = int(value)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            return None
        return size if size > 0 else None

    async def extract_info(self, url: str) -> list[MediaInfo]:
        """Extract info via HEAD request.

        Args:
            url: File URL.

        Returns:
            A list with one :class:`MediaInfo`. ``file_size_bytes`` is
            ``None`` when no valid positive Content-Length is reported, and
            ``ext`` is ``None`` when the URL path has no file name.
        """
        request = FetchRequest(url=url, method="HEAD")
        response = await self._service.afetch(request)
        filename = self._filename_from_url(url)
        return [
            MediaInfo(
                url=url,
                mime_type=response.content_type,
                file_size_bytes=self._parse_content_length(
                    response.headers.get("content-length")
                ),
                ext=Path(filename).suffix if filename else None,
            )
        ]

    async def download(
        self,
        url: str,
        *,
        output_dir: str | None = None,
        progress_callback: object | None = None,
    ) -> list[DownloadResult]:
        """Download a file via HTTP streaming.

        Args:
            url: File URL.
            output_dir: Output directory. Uses temp dir if not provided.
            progress_callback: Not used for direct downloads.

        Returns:
            A list with one :class:`DownloadResult` carrying the local path,
            the number of bytes written, and the SHA-256 hex digest.

        Raises:
            Whatever the streaming call or the file system raises; a
            partially written file is removed before the error propagates.
        """
        if output_dir is None:
            output_dir = tempfile.mkdtemp(prefix="pyfetcher-direct-")
        # Fall back to a fixed name so the destination is never a directory.
        filename = self._filename_from_url(url) or self._DEFAULT_FILENAME
        dest = Path(output_dir) / filename
        dest.parent.mkdir(parents=True, exist_ok=True)

        request = FetchRequest(url=url)
        hasher = hashlib.sha256()
        total_bytes = 0
        try:
            with dest.open("wb") as f:
                async for chunk in self._service.astream(request):
                    data = chunk.data
                    f.write(data)
                    hasher.update(data)
                    total_bytes += len(data)
        except BaseException:
            # Includes task cancellation: do not leave a truncated file at
            # the final path where it could be mistaken for a full download.
            try:
                dest.unlink(missing_ok=True)
            except OSError:
                pass  # Cleanup is best-effort; surface the original error.
            raise

        return [
            DownloadResult(
                source_url=url,
                local_path=str(dest),
                filename=filename,
                file_size_bytes=total_bytes,
                checksum_sha256=hasher.hexdigest(),
            )
        ]