Source code for pyfetcher.crawler.dedup
"""URL deduplication for :mod:`pyfetcher.crawler`.
Purpose:
Normalize URLs and check/record seen status using xxhash64 for
fast Postgres-backed deduplication.
"""
from __future__ import annotations
import hashlib
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
[docs]
def normalize_url(url: str) -> str:
"""Normalize a URL for deduplication.
Strips fragments, sorts query params, lowercases scheme/host,
removes trailing slashes on paths, and removes default ports.
Args:
url: The URL to normalize.
Returns:
The normalized URL string.
"""
parsed = urlparse(url)
scheme = parsed.scheme.lower()
host = (parsed.hostname or "").lower()
port = parsed.port
if (scheme == "http" and port == 80) or (scheme == "https" and port == 443):
port = None
netloc = f"{host}:{port}" if port else host
path = parsed.path.rstrip("/") or "/"
query_params = parse_qs(parsed.query, keep_blank_values=True)
sorted_query = urlencode(sorted(query_params.items()), doseq=True)
return urlunparse((scheme, netloc, path, "", sorted_query, ""))
[docs]
def url_hash(url: str) -> int:
"""Compute a hash for a normalized URL.
Uses SHA-256 truncated to 8 bytes (64 bits) for a BigInteger-compatible
hash suitable for Postgres primary keys.
Args:
url: The URL to hash (should be pre-normalized).
Returns:
A 64-bit integer hash.
"""
digest = hashlib.sha256(url.encode()).digest()
return int.from_bytes(digest[:8], "big", signed=True)
[docs]
class URLDeduplicator:
"""URL deduplication checker backed by Postgres.
Normalizes URLs, hashes them, and checks/records them in the
``seen_urls`` table via the repository layer.
"""
[docs]
async def is_seen(self, session: object, url: str) -> bool:
"""Check if a URL has been seen before.
Args:
session: Async database session.
url: The URL to check.
Returns:
``True`` if the URL has been seen.
"""
from pyfetcher.db.repo import check_url_seen
normalized = normalize_url(url)
return await check_url_seen(session, url_hash(normalized)) # type: ignore[arg-type]
[docs]
async def mark_seen(self, session: object, url: str) -> None:
"""Mark a URL as seen.
Args:
session: Async database session.
url: The URL to mark.
"""
from pyfetcher.db.repo import mark_url_seen
normalized = normalize_url(url)
await mark_url_seen(session, url_hash=url_hash(normalized), url=url) # type: ignore[arg-type]