Source code for pyfetcher.extractors.article

"""Article metadata extraction for :mod:`pyfetcher.extractors`.

Purpose:
    Extract article-specific metadata (author, publish date, summary)
    using newspaper3k for news articles.
"""

from __future__ import annotations

from dataclasses import dataclass, field


[docs] @dataclass(frozen=True, slots=True) class ArticleMeta: """Extracted article metadata.""" title: str | None = None authors: list[str] = field(default_factory=list) publish_date: str | None = None summary: str | None = None top_image: str | None = None keywords: list[str] = field(default_factory=list)
def extract_article_metadata(html: str, *, url: str) -> ArticleMeta:
    """Extract article metadata using newspaper3k.

    Extraction is best-effort and never raises. If newspaper3k is not
    importable or the HTML cannot be parsed, an empty :class:`ArticleMeta`
    is returned. If only the NLP step (summary and keywords) fails, the
    fields obtained from parsing are still returned.

    Args:
        html: Raw HTML string.
        url: The article URL (required by newspaper3k).

    Returns:
        An :class:`ArticleMeta` with extracted fields.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        # Imported lazily so the module can be imported without newspaper3k.
        from newspaper import Article  # type: ignore[import-untyped]

        article = Article(url)
        article.set_html(html)
        article.parse()

        # NLP has its own failure modes (presumably e.g. missing tokenizer
        # data -- confirm against the installed newspaper version); a
        # failure here must not discard what parse() already produced.
        try:
            article.nlp()
        except Exception:
            logger.debug(
                "newspaper NLP step failed for %s; returning parsed fields only",
                url,
                exc_info=True,
            )

        return ArticleMeta(
            title=article.title or None,
            authors=list(article.authors),
            publish_date=str(article.publish_date) if article.publish_date else None,
            summary=article.summary or None,
            top_image=article.top_image or None,
            keywords=list(article.keywords),
        )
    except Exception:
        # Deliberate best-effort boundary: callers get an empty result
        # instead of an exception, but the cause is kept for debugging.
        logger.debug("Article metadata extraction failed for %s", url, exc_info=True)
        return ArticleMeta()