Source code for pyfetcher.crawler.feeds

"""RSS/Atom feed monitor for :mod:`pyfetcher.crawler`.

Purpose:
    Monitor RSS/Atom feeds for new entries with adaptive polling
    intervals based on publication frequency.
"""

from __future__ import annotations

import hashlib
from dataclasses import dataclass, field


[docs] @dataclass(frozen=True, slots=True) class FeedEntry: """A single feed entry.""" url: str title: str | None = None published: str | None = None summary: str | None = None
@dataclass
class FeedPollResult:
    """Outcome of a single feed poll.

    All fields have defaults, so an empty result can be built with no
    arguments. Each instance gets its own ``new_entries`` list.
    """

    # Entries reported as new by this poll; a fresh empty list by default.
    new_entries: list[FeedEntry] = field(default_factory=list)
    # Hash of the latest entry, or None when not set.
    latest_entry_hash: str | None = None
    # Suggested polling interval in minutes; defaults to 60.
    suggested_interval_minutes: int = 60
def parse_feed(content: str) -> list[FeedEntry]:
    """Parse RSS/Atom feed content into entries.

    Entries that have no (or an empty) ``link`` are skipped, because
    :class:`FeedEntry` requires a URL.

    Args:
        content: Raw feed XML/content. It is always treated as the feed
            document itself, never as a URL or file path to load.

    Returns:
        A list of :class:`FeedEntry` objects, in the order the parser
        yields them.
    """
    import io

    import feedparser  # type: ignore[import-untyped]

    # ``feedparser.parse`` accepts a URL, a filename, a stream or a string and
    # guesses which one it was given. A fetched body that merely *looks* like
    # ``http://...``, ``file://...`` or an existing local path would therefore
    # trigger a network request or a local file read. Handing over a byte
    # stream removes the guess: the data is parsed as-is. Encoding to UTF-8
    # here is believed to match what feedparser does for plain string input.
    raw = content.encode("utf-8") if isinstance(content, str) else content
    feed = feedparser.parse(io.BytesIO(raw))

    return [
        FeedEntry(
            url=link,
            title=entry.get("title"),
            published=entry.get("published"),
            summary=entry.get("summary"),
        )
        for entry in feed.entries
        # Skip entries with a missing or empty link.
        if (link := entry.get("link", ""))
    ]
def compute_entry_hash(entry: FeedEntry) -> str:
    """Return a short fingerprint of a feed entry for change detection.

    The fingerprint covers the entry URL and title only; a missing title is
    treated as an empty string.

    Args:
        entry: The feed entry.

    Returns:
        The first 16 hexadecimal characters of the SHA-256 digest of
        ``"<url>|<title>"``.
    """
    title_part = entry.title or ""
    payload = "{}|{}".format(entry.url, title_part).encode("utf-8")
    full_digest = hashlib.sha256(payload).hexdigest()
    # Keep only a 16-character prefix of the hex digest.
    return full_digest[:16]
def calculate_poll_interval(
    entry_count: int,
    *,
    current_interval: int = 60,
    min_interval: int = 10,
    max_interval: int = 1440,
) -> int:
    """Calculate an adaptive polling interval based on new entry count.

    More new entries = shorter interval. No new entries = longer interval.

    * 5 or more new entries: halve the current interval.
    * 1 to 4 new entries: keep the current interval.
    * No new entries: double the current interval.

    The result is always clamped to ``[min_interval, max_interval]``, even
    when ``current_interval`` itself lies outside that range. If the bounds
    are inverted (``min_interval > max_interval``), ``min_interval`` wins.

    Args:
        entry_count: Number of new entries found.
        current_interval: Current polling interval in minutes.
        min_interval: Minimum interval in minutes.
        max_interval: Maximum interval in minutes.

    Returns:
        Suggested interval in minutes.
    """
    if entry_count >= 5:
        candidate = current_interval // 2
    elif entry_count >= 1:
        candidate = current_interval
    else:
        candidate = current_interval * 2

    # Clamp in one place so every branch honours both bounds. Without the
    # lower bound here, a current interval of 0 would double to 0 forever.
    return max(min_interval, min(candidate, max_interval))