Source code for pyfetcher.crawler.politeness

"""Politeness enforcement for :mod:`pyfetcher.crawler`.

Purpose:
    Enforce per-host crawl delays using robots.txt directives and
    configurable minimum request intervals.
"""

from __future__ import annotations

import asyncio
from datetime import UTC, datetime
from urllib.parse import urlparse

from pyfetcher.scrape.robots import is_allowed, parse_robots_txt



[docs]
class PolitenessEnforcer:
    """Enforces crawl politeness per-host.

    Checks robots.txt rules and enforces minimum delays between
    requests to the same host.

    Args:
        default_delay_seconds: Default delay when no crawl-delay directive exists.
    """

    def __init__(self, default_delay_seconds: float = 1.0) -> None:
        self._default_delay = default_delay_seconds
        self._last_fetch: dict[str, float] = {}


[docs]
    def extract_hostname(self, url: str) -> str:
        """Extract hostname from a URL.

        Args:
            url: The URL.

        Returns:
            The hostname string.
        """
        return urlparse(url).netloc



[docs]
    def check_robots(self, robots_txt: str | None, path: str, *, user_agent: str = "*") -> bool:
        """Check if a path is allowed by robots.txt.

        Args:
            robots_txt: Raw robots.txt content (None means allowed).
            path: The URL path to check.
            user_agent: User-agent string.

        Returns:
            ``True`` if allowed.
        """
        if robots_txt is None:
            return True
        rules = parse_robots_txt(robots_txt)
        return is_allowed(rules, path, user_agent=user_agent)



[docs]
    def get_crawl_delay(self, robots_txt: str | None) -> float:
        """Get the crawl delay from robots.txt or use default.

        Args:
            robots_txt: Raw robots.txt content.

        Returns:
            Delay in seconds.
        """
        if robots_txt is None:
            return self._default_delay
        rules = parse_robots_txt(robots_txt)
        return rules.crawl_delays.get("*", self._default_delay)



[docs]
    async def wait_for_host(self, hostname: str, delay_seconds: float) -> None:
        """Wait until it's safe to fetch from a host.

        Args:
            hostname: The target hostname.
            delay_seconds: Minimum delay between requests.
        """
        now = datetime.now(UTC).timestamp()
        last = self._last_fetch.get(hostname, 0.0)
        wait = delay_seconds - (now - last)
        if wait > 0:
            await asyncio.sleep(wait)
        self._last_fetch[hostname] = datetime.now(UTC).timestamp()