# Source code for pyfetcher.crawler.frontier
"""URL frontier (priority queue) for :mod:`pyfetcher.crawler`.
Purpose:
Manage the URL crawl queue backed by Postgres. Implements the
dual-queue pattern: priority-based selection with per-host
politeness enforcement.
"""
from __future__ import annotations
import uuid
from pyfetcher.crawler.dedup import URLDeduplicator
# [docs]  (documentation-page link marker left over from HTML extraction; not code)
class Frontier:
    """Postgres-backed URL frontier with dedup and priority.

    Combines job creation, dedup checking, and priority management
    into a single interface for the crawl stage.

    Args:
        deduplicator: URL dedup checker. When ``None`` (the default), a
            fresh :class:`URLDeduplicator` is constructed.
    """

    def __init__(self, deduplicator: URLDeduplicator | None = None) -> None:
        # Compare against None explicitly instead of relying on truthiness:
        # a caller-supplied deduplicator that happens to be falsy (e.g. it
        # defines ``__len__`` and is still empty) must not be swapped out
        # for the default one.
        self._dedup = URLDeduplicator() if deduplicator is None else deduplicator

    async def add_url(
        self,
        session: object,
        url: str,
        *,
        priority: int = 0,
        parent_job_id: uuid.UUID | None = None,
    ) -> uuid.UUID | None:
        """Add a URL to the frontier if not already seen.

        The deduplicator is consulted first. An unseen URL is marked as
        seen and then a ``"crawl"`` job is created for it.

        Args:
            session: Async database session, passed through unchanged to
                the deduplicator and to ``create_job``.
            url: The URL to add.
            priority: Crawl priority (higher = more urgent).
            parent_job_id: Optional parent job for traceability.

        Returns:
            The new job UUID, or ``None`` if the URL was already seen.
        """
        if await self._dedup.is_seen(session, url):
            return None

        # NOTE(review): the seen-check above and the mark below are two
        # separate awaits; confirm whether URLDeduplicator (or the database)
        # prevents two concurrent callers from both enqueueing the same URL.
        await self._dedup.mark_seen(session, url)

        # Imported at call time rather than at module load -- presumably to
        # avoid an import cycle with the db package; confirm before hoisting.
        from pyfetcher.db.repo import create_job

        job = await create_job(
            session,  # type: ignore[arg-type]
            job_type="crawl",
            url=url,
            priority=priority,
            parent_job_id=parent_job_id,
        )
        return job.id

    async def add_urls(
        self,
        session: object,
        urls: list[str],
        *,
        priority: int = 0,
        parent_job_id: uuid.UUID | None = None,
    ) -> list[uuid.UUID]:
        """Add multiple URLs, skipping duplicates.

        URLs are handed to :meth:`add_url` one at a time, in the order
        given, all with the same ``priority`` and ``parent_job_id``.

        Args:
            session: Async database session.
            urls: URLs to add.
            priority: Crawl priority.
            parent_job_id: Optional parent job.

        Returns:
            List of created job UUIDs (excludes dupes), in input order.
            Empty when ``urls`` is empty or every URL was already seen.
        """
        created: list[uuid.UUID] = []
        for url in urls:
            job_id = await self.add_url(
                session, url, priority=priority, parent_job_id=parent_job_id
            )
            if job_id is not None:
                created.append(job_id)
        return created