# Source code for pyfetcher.crawler.frontier

"""URL frontier (priority queue) for :mod:`pyfetcher.crawler`.

Purpose:
    Manage the URL crawl queue backed by Postgres. Implements the
    dual-queue pattern: priority-based selection with per-host
    politeness enforcement.
"""

from __future__ import annotations

import uuid

from pyfetcher.crawler.dedup import URLDeduplicator


class Frontier:
    """Postgres-backed URL frontier with dedup and priority.

    Combines job creation, dedup checking, and priority management into
    a single interface for the crawl stage.

    Args:
        deduplicator: URL dedup checker. When ``None`` (the default), a
            new :class:`URLDeduplicator` is constructed.
    """

    def __init__(self, deduplicator: URLDeduplicator | None = None) -> None:
        # Test against None explicitly instead of using ``or``: a supplied
        # deduplicator that happens to evaluate falsy (for example one that
        # defines ``__len__`` and is currently empty) must not be silently
        # swapped for a fresh default instance.
        if deduplicator is None:
            deduplicator = URLDeduplicator()
        self._dedup = deduplicator

    async def add_url(
        self,
        session: object,
        url: str,
        *,
        priority: int = 0,
        parent_job_id: uuid.UUID | None = None,
    ) -> uuid.UUID | None:
        """Add a URL to the frontier if not already seen.

        The URL is checked against the deduplicator first. If it is new,
        it is marked as seen and a job with ``job_type="crawl"`` is
        created for it.

        Args:
            session: Async database session.
            url: The URL to add.
            priority: Crawl priority (higher = more urgent).
            parent_job_id: Optional parent job for traceability.

        Returns:
            The new job UUID, or ``None`` if the URL was already seen.
        """
        if await self._dedup.is_seen(session, url):
            return None

        # NOTE(review): the URL is marked seen before the job is created.
        # If ``create_job`` raises, whether the mark survives depends on how
        # the caller manages the session/transaction -- confirm.
        await self._dedup.mark_seen(session, url)

        # Function-scope import kept from the original; presumably it avoids
        # an import cycle or defers loading the DB layer -- confirm before
        # hoisting it to module level.
        from pyfetcher.db.repo import create_job

        job = await create_job(
            session,  # type: ignore[arg-type]
            job_type="crawl",
            url=url,
            priority=priority,
            parent_job_id=parent_job_id,
        )
        return job.id

    async def add_urls(
        self,
        session: object,
        urls: list[str],
        *,
        priority: int = 0,
        parent_job_id: uuid.UUID | None = None,
    ) -> list[uuid.UUID]:
        """Add multiple URLs, skipping duplicates.

        URLs are submitted one at a time, in the order given, through
        :meth:`add_url`; the same ``priority`` and ``parent_job_id`` are
        applied to every URL.

        Args:
            session: Async database session.
            urls: URLs to add.
            priority: Crawl priority.
            parent_job_id: Optional parent job.

        Returns:
            List of created job UUIDs (excludes dupes), in input order.
            Empty when ``urls`` is empty or every URL was already seen.
        """
        created: list[uuid.UUID] = []
        for url in urls:
            job_id = await self.add_url(
                session, url, priority=priority, parent_job_id=parent_job_id
            )
            # ``None`` means the URL was a duplicate and no job was created.
            if job_id is not None:
                created.append(job_id)
        return created