# Source code for pyfetcher.extractors.content

"""Article content extraction with fallback chain for :mod:`pyfetcher.extractors`.

Purpose:
    Extract readable article text from HTML using trafilatura as the
    primary extractor with readability-lxml as fallback.

Design:
    trafilatura achieves the highest F1 score (0.945) in benchmarks.
    readability-lxml has the highest median reliability (0.970).
    We try trafilatura first, fall back to readability on failure.
"""

from __future__ import annotations



# [docs]
def extract_article_text(html: str, *, url: str | None = None) -> str | None:
    """Return the readable article body found in an HTML document.

    The extractors are tried in order: trafilatura first, then
    readability-lxml whenever the first attempt yields nothing
    (``None`` or an empty string).

    Args:
        html: Raw HTML string.
        url: Optional page URL, forwarded to both extractors.

    Returns:
        The extracted text, or ``None`` when neither extractor
        produced any.
    """
    # ``or`` short-circuits, so readability only runs when trafilatura
    # came back empty-handed.
    return _try_trafilatura(html, url=url) or _try_readability(html, url=url)



def _try_trafilatura(html: str, *, url: str | None = None) -> str | None:
    """Run trafilatura on ``html`` and hand back whatever it extracts.

    Best-effort: any exception, including a failed import of
    trafilatura itself, results in ``None``.
    """
    extract_options = {
        "url": url,
        "include_comments": False,
        "include_tables": True,
        "no_fallback": False,
    }
    try:
        # Imported lazily so a missing package is treated like any other
        # extraction failure.
        import trafilatura  # type: ignore[import-untyped]

        result = trafilatura.extract(html, **extract_options)
    except Exception:
        return None
    return result


def _try_readability(html: str, *, url: str | None = None) -> str | None:
    """Run readability-lxml on ``html`` and flatten the result to plain text.

    The summary HTML produced by readability is parsed with
    BeautifulSoup and reduced to newline-separated, stripped text.
    Best-effort: any exception, including a failed import of either
    package, results in ``None``. An empty text result is also
    reported as ``None``.
    """
    try:
        # Both imports are lazy so a missing package is treated like any
        # other extraction failure.
        from bs4 import BeautifulSoup
        from readability import Document  # type: ignore[import-untyped]

        summary_html = Document(html, url=url).summary()
        plain_text = BeautifulSoup(summary_html, "html.parser").get_text(
            separator="\n",
            strip=True,
        )
    except Exception:
        return None
    return plain_text if plain_text else None