# Source code for pyfetcher.crawler.discovery

"""URL discovery (sitemaps + seeds) for :mod:`pyfetcher.crawler`.

Purpose:
    Discover URLs from sitemaps, robots.txt sitemap directives,
    and seed URL lists for populating the crawl frontier.
"""

from __future__ import annotations

from pyfetcher.scrape.robots import parse_robots_txt
from pyfetcher.scrape.sitemap import parse_sitemap


def discover_sitemaps_from_robots(robots_txt: str) -> list[str]:
    """Collect the sitemap URLs declared in a robots.txt document.

    Args:
        robots_txt: Raw robots.txt content.

    Returns:
        The sitemap URLs exposed by the parsed rules, as a new list.
    """
    # Parsing is delegated to the scrape layer; only the ``sitemaps``
    # attribute of the parsed rules is used here.
    return [*parse_robots_txt(robots_txt).sitemaps]
def discover_urls_from_sitemap(sitemap_xml: str) -> list[str]:
    """Collect the ``loc`` value of every entry in a sitemap XML document.

    Works for both URL sitemaps and sitemap index files. For an index
    file the returned values are the child sitemap URLs, not final page
    URLs.

    Args:
        sitemap_xml: Raw sitemap XML content.

    Returns:
        A list of discovered URLs, one per parsed entry.
    """
    return [entry.loc for entry in parse_sitemap(sitemap_xml)]
def build_seed_urls(
    *,
    urls: list[str] | None = None,
    robots_txt: str | None = None,
    sitemap_xml: str | None = None,
) -> list[str]:
    """Build a combined, deduplicated list of seed URLs from multiple sources.

    Sources are merged in a fixed order: explicit ``urls`` first, then
    sitemap URLs found in ``robots_txt``, then URLs found in
    ``sitemap_xml``. When a URL appears more than once, its first
    occurrence determines its position in the result.

    Args:
        urls: Explicit seed URLs.
        robots_txt: robots.txt content (contributes its sitemap URLs).
            Skipped when ``None`` or empty.
        sitemap_xml: Sitemap XML content (contributes the URLs it lists;
            for a sitemap index these are child sitemap URLs).
            Skipped when ``None`` or empty.

    Returns:
        A deduplicated list of seed URLs in first-seen order.
    """
    # A dict preserves insertion order and ``update`` keeps the position
    # of keys that already exist, so this gives ordered deduplication
    # without maintaining a separate "seen" set next to the result list.
    ordered: dict[str, None] = {}
    ordered.update(dict.fromkeys(urls or ()))
    if robots_txt:
        ordered.update(dict.fromkeys(discover_sitemaps_from_robots(robots_txt)))
    if sitemap_xml:
        ordered.update(dict.fromkeys(discover_urls_from_sitemap(sitemap_xml)))
    return list(ordered)