Source code for pyfetcher.crawler.spider
"""Spider and router for :mod:`pyfetcher.crawler`.
Purpose:
Provide a base spider class with URL pattern routing for handling
different page types during crawling.
"""
from __future__ import annotations
import re
from collections.abc import Callable, Coroutine
from dataclasses import dataclass, field
from typing import Any
from pyfetcher.contracts.response import FetchResponse
[docs]
@dataclass
class SpiderResult:
    """Container for everything a handler extracted from one page.

    Every field defaults to its own fresh empty list, so a bare
    ``SpiderResult()`` represents "nothing found" and instances never
    share list objects through their defaults.

    Args:
        discovered_urls: New URLs found on the page.
        items: Structured data items extracted from the page.
        media_urls: Media URLs found for downloading.
    """

    discovered_urls: list[str] = field(default_factory=list)
    items: list[dict[str, Any]] = field(default_factory=list)
    media_urls: list[str] = field(default_factory=list)
# Signature shared by every route handler: a callable taking the crawled URL
# and its fetch response, returning a coroutine that yields a SpiderResult.
HandlerFunc = Callable[
    [str, FetchResponse],
    Coroutine[Any, Any, SpiderResult],
]
[docs]
class Router:
    """Dispatch table mapping URL regexes to async handlers.

    Routes are tried in the order they were registered and the first
    pattern found anywhere in the URL wins. An optional default handler
    is returned for URLs that no route matches.
    """

    def __init__(self) -> None:
        # Ordered (compiled pattern, handler) pairs; list order is priority.
        self._routes: list[tuple[re.Pattern[str], HandlerFunc]] = []
        # Fallback handler; stays None until default() is called.
        self._default: HandlerFunc | None = None

    def add(self, pattern: str, handler: HandlerFunc) -> None:
        """Append a route to the end of the routing table.

        Args:
            pattern: Regex to search for within candidate URLs.
            handler: Async function to use for URLs the pattern matches.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        compiled = re.compile(pattern)
        self._routes.append((compiled, handler))

    def default(self, handler: HandlerFunc) -> None:
        """Install the fallback handler, replacing any previous one.

        Args:
            handler: Async function for URLs that match no route.
        """
        self._default = handler

    def resolve(self, url: str) -> HandlerFunc | None:
        """Look up the handler responsible for a URL.

        Args:
            url: The URL to route.

        Returns:
            The handler of the first registered pattern that matches
            ``url``; otherwise the default handler, which is ``None``
            when no default has been set.
        """
        # Lazy generator: scanning stops at the first matching route.
        candidates = (
            handler for pattern, handler in self._routes if pattern.search(url)
        )
        return next(candidates, self._default)
[docs]
class Spider:
    """Base spider that dispatches crawled pages through a router.

    Each instance owns a :class:`Router`, exposed as ``router``, on which
    handlers are registered; :meth:`handle` uses it to pick the handler
    for a crawled URL.

    Args:
        name: Identifier for this spider, stored as ``name``.
    """

    def __init__(self, name: str = "default") -> None:
        self.router = Router()
        self.name = name

    async def handle(self, url: str, response: FetchResponse) -> SpiderResult:
        """Run the handler registered for a URL and return its result.

        Args:
            url: The crawled URL.
            response: The fetch response for that URL.

        Returns:
            Whatever the resolved handler returns, or an empty
            :class:`SpiderResult` when the router resolves no handler.
        """
        route = self.router.resolve(url)
        if route is not None:
            return await route(url, response)
        # No route and no default handler: report that nothing was found.
        return SpiderResult()