Source code for pyfetcher.crawler.spider

"""Spider and router for :mod:`pyfetcher.crawler`.

Purpose:
    Provide a base spider class with URL pattern routing for handling
    different page types during crawling.
"""

from __future__ import annotations

import re
from collections.abc import Callable, Coroutine
from dataclasses import dataclass, field
from typing import Any

from pyfetcher.contracts.response import FetchResponse



[docs]
@dataclass
class SpiderResult:
    """Result of processing a crawled page.

    Args:
        discovered_urls: New URLs found on the page.
        items: Extracted structured data items.
        media_urls: Media URLs found for downloading.
    """

    discovered_urls: list[str] = field(default_factory=list)
    items: list[dict[str, Any]] = field(default_factory=list)
    media_urls: list[str] = field(default_factory=list)



HandlerFunc = Callable[[str, FetchResponse], Coroutine[Any, Any, SpiderResult]]



[docs]
class Router:
    """URL pattern router for spider handlers.

    Maps URL regex patterns to async handler functions. The first
    matching pattern wins.
    """

    def __init__(self) -> None:
        self._routes: list[tuple[re.Pattern[str], HandlerFunc]] = []
        self._default: HandlerFunc | None = None


[docs]
    def add(self, pattern: str, handler: HandlerFunc) -> None:
        """Register a handler for a URL pattern.

        Args:
            pattern: Regex pattern to match URLs against.
            handler: Async function handling matching URLs.
        """
        self._routes.append((re.compile(pattern), handler))



[docs]
    def default(self, handler: HandlerFunc) -> None:
        """Set the default handler for unmatched URLs.

        Args:
            handler: Async function for URLs matching no pattern.
        """
        self._default = handler



[docs]
    def resolve(self, url: str) -> HandlerFunc | None:
        """Find the handler for a URL.

        Args:
            url: The URL to route.

        Returns:
            The matching handler, or the default handler, or ``None``.
        """
        for pattern, handler in self._routes:
            if pattern.search(url):
                return handler
        return self._default





[docs]
class Spider:
    """Base spider with URL routing.

    Provides a router for dispatching URLs to handler functions
    that extract data and discover new URLs.

    Args:
        name: Spider name for logging/identification.
    """

    def __init__(self, name: str = "default") -> None:
        self.name = name
        self.router = Router()


[docs]
    async def handle(self, url: str, response: FetchResponse) -> SpiderResult:
        """Route a URL to its handler and return the result.

        Args:
            url: The crawled URL.
            response: The fetch response.

        Returns:
            A :class:`SpiderResult` with discovered URLs and items.
        """
        handler = self.router.resolve(url)
        if handler is None:
            return SpiderResult()
        return await handler(url, response)