Source code for pyfetcher.scrape.robots

r"""Robots.txt parser for :mod:`pyfetcher`.

Purpose:
    Parse ``robots.txt`` files and check URL access permissions for a given
    user-agent. Supports ``Allow``, ``Disallow``, ``Crawl-delay``, and
    ``Sitemap`` directives.

Examples:
    ::

        >>> txt = "User-agent: *\\nDisallow: /admin"
        >>> rules = parse_robots_txt(txt)
        >>> is_allowed(rules, "/admin", user_agent="*")
        False
"""

from __future__ import annotations

from dataclasses import dataclass, field



[docs]
@dataclass
class RobotsRules:
    """Parsed robots.txt rules.

    Args:
        rules: Mapping of user-agent patterns to lists of (allow, path) tuples.
        sitemaps: List of sitemap URLs found in the robots.txt.
        crawl_delays: Mapping of user-agent patterns to crawl delay seconds.

    Examples:
        ::

            >>> rules = RobotsRules()
            >>> rules.sitemaps
            []
    """

    rules: dict[str, list[tuple[bool, str]]] = field(default_factory=dict)
    sitemaps: list[str] = field(default_factory=list)
    crawl_delays: dict[str, float] = field(default_factory=dict)




[docs]
def parse_robots_txt(content: str) -> RobotsRules:
    r"""Parse a robots.txt file content.

    Extracts ``User-agent``, ``Allow``, ``Disallow``, ``Crawl-delay``,
    and ``Sitemap`` directives into a structured :class:`RobotsRules`
    object.

    Args:
        content: The raw text content of a robots.txt file.

    Returns:
        A :class:`RobotsRules` object containing parsed directives.

    Examples:
        ::

            >>> txt = "User-agent: *\\nDisallow: /secret\\nAllow: /public"
            >>> rules = parse_robots_txt(txt)
            >>> len(rules.rules.get("*", []))
            2
    """
    result = RobotsRules()
    current_agents: list[str] = []

    for raw_line in content.splitlines():
        line = raw_line.split("#", 1)[0].strip()
        if not line:
            continue

        if ":" not in line:
            continue

        directive, _, value = line.partition(":")
        directive = directive.strip().lower()
        value = value.strip()

        if directive == "user-agent":
            current_agents = [value.lower()]
            for agent in current_agents:
                if agent not in result.rules:
                    result.rules[agent] = []
        elif directive == "disallow" and current_agents:
            for agent in current_agents:
                if agent not in result.rules:
                    result.rules[agent] = []
                if value:
                    result.rules[agent].append((False, value))
        elif directive == "allow" and current_agents:
            for agent in current_agents:
                if agent not in result.rules:
                    result.rules[agent] = []
                result.rules[agent].append((True, value))
        elif directive == "crawl-delay" and current_agents:
            try:
                delay = float(value)
                for agent in current_agents:
                    result.crawl_delays[agent] = delay
            except ValueError:
                pass
        elif directive == "sitemap":
            result.sitemaps.append(value)

    return result




[docs]
def is_allowed(
    rules: RobotsRules,
    path: str,
    *,
    user_agent: str = "*",
) -> bool:
    r"""Check if a path is allowed for the given user-agent.

    Evaluates the parsed robots.txt rules for the most specific matching
    user-agent. ``Allow`` directives take precedence over ``Disallow``
    when paths have equal specificity (longer path prefix wins).

    Args:
        rules: Parsed robots.txt rules from :func:`parse_robots_txt`.
        path: The URL path to check (e.g. ``'/admin/settings'``).
        user_agent: The user-agent string to check against. Defaults to
            ``'*'`` (wildcard).

    Returns:
        ``True`` if the path is allowed, ``False`` if disallowed.

    Examples:
        ::

            >>> txt = "User-agent: *\\nDisallow: /admin\\nAllow: /admin/public"
            >>> rules = parse_robots_txt(txt)
            >>> is_allowed(rules, "/admin/settings")
            False
            >>> is_allowed(rules, "/admin/public")
            True
    """
    agent_key = user_agent.lower()

    agent_rules = rules.rules.get(agent_key) or rules.rules.get("*", [])

    if not agent_rules:
        return True

    best_match: tuple[bool, int] | None = None

    for allowed, rule_path in agent_rules:
        if path.startswith(rule_path) or rule_path == "":
            specificity = len(rule_path)
            is_better = (
                best_match is None
                or specificity > best_match[1]
                or (specificity == best_match[1] and allowed)
            )
            if is_better:
                best_match = (allowed, specificity)

    if best_match is None:
        return True

    return best_match[0]