# Source code for pyfetcher.config

"""Application configuration for :mod:`pyfetcher`.

Purpose:
    Provide a centralized, environment-aware configuration object using
    ``pydantic-settings``. Reads from environment variables (prefixed with
    ``PYFETCHER_``) and ``.env`` files.

Design:
    - All infrastructure connection details (Postgres, MinIO) are configurable.
    - Pipeline concurrency and behavior are tunable per deployment.
    - Defaults are suitable for local development with Docker Compose.

Examples:
    ::

        >>> config = PyfetcherConfig()
        >>> config.database_url
        'postgresql+asyncpg://pyfetcher:pyfetcher@localhost:5432/pyfetcher'
"""

from __future__ import annotations

from pydantic_settings import BaseSettings, SettingsConfigDict



# [docs]
class PyfetcherConfig(BaseSettings):
    """Settings object holding pyfetcher's infrastructure and pipeline knobs.

    Values are loaded from environment variables carrying the
    ``PYFETCHER_`` prefix (for example ``PYFETCHER_MAX_RETRIES`` for
    ``max_retries``) and from a UTF-8 ``.env`` file. Inputs that do not
    correspond to a declared field are ignored rather than rejected.
    Every field has a default, chosen to suit local development with the
    provided Docker Compose setup.

    Attributes:
        database_url: SQLAlchemy async connection string for PostgreSQL.
        db_pool_size: Base connection pool size for asyncpg.
        db_max_overflow: Maximum overflow connections beyond
            ``db_pool_size``.
        minio_endpoint: MinIO server endpoint, written as ``host:port``.
        minio_access_key: MinIO access key.
        minio_secret_key: MinIO secret key.
        minio_secure: Whether MinIO connections use HTTPS.
        minio_bucket: Default bucket name for storing assets.
        crawl_concurrency: Maximum concurrent crawl workers.
        scrape_concurrency: Maximum concurrent scrape workers.
        download_concurrency: Maximum concurrent download workers.
        default_crawl_delay_seconds: Default politeness delay between
            requests to the same host.
        max_retries: Default maximum retry attempts for failed jobs.
    """

    # Where settings come from and how unknown inputs are treated.
    model_config = SettingsConfigDict(
        env_prefix="PYFETCHER_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- PostgreSQL -------------------------------------------------------
    # The literal is split across two lines for readability only; the
    # adjacent string pieces concatenate into a single URL.
    database_url: str = (
        "postgresql+asyncpg://"
        "pyfetcher:pyfetcher@localhost:5432/pyfetcher"
    )
    db_pool_size: int = 10
    db_max_overflow: int = 20

    # --- MinIO / S3 object storage ----------------------------------------
    minio_endpoint: str = "localhost:9000"
    minio_access_key: str = "minioadmin"
    minio_secret_key: str = "minioadmin"
    minio_secure: bool = False
    minio_bucket: str = "pyfetcher"

    # --- Pipeline concurrency and retry behaviour ---------------------------
    crawl_concurrency: int = 10
    scrape_concurrency: int = 20
    download_concurrency: int = 5
    default_crawl_delay_seconds: float = 1.0
    max_retries: int = 3