# Source code for pyfetcher.extractors.content
"""Article content extraction with fallback chain for :mod:`pyfetcher.extractors`.
Purpose:
Extract readable article text from HTML using trafilatura as the
primary extractor with readability-lxml as fallback.
Design:
trafilatura achieves the highest F1 score (0.945) in benchmarks.
readability-lxml has the highest median reliability (0.970).
We try trafilatura first, fall back to readability on failure.
"""
from __future__ import annotations

import logging
# [docs]
def extract_article_text(html: str, *, url: str | None = None) -> str | None:
    """Return the readable article text found in an HTML document.

    The extractors are chained: trafilatura runs first, and
    readability-lxml is consulted only when trafilatura yields no text
    (``None`` or an empty string).

    Args:
        html: Raw HTML string.
        url: Optional page URL, forwarded unchanged to both extractors.

    Returns:
        The trafilatura result when it is non-empty; otherwise whatever
        the readability fallback returns (``None`` if it fails too).
    """
    # ``or`` short-circuits, so the fallback only runs when it is needed.
    return _try_trafilatura(html, url=url) or _try_readability(html, url=url)
def _try_trafilatura(html: str, *, url: str | None = None) -> str | None:
"""Attempt extraction with trafilatura."""
try:
import trafilatura # type: ignore[import-untyped]
return trafilatura.extract(
html,
url=url,
include_comments=False,
include_tables=True,
no_fallback=False,
)
except Exception:
return None
def _try_readability(html: str, *, url: str | None = None) -> str | None:
"""Attempt extraction with readability-lxml."""
try:
from readability import Document # type: ignore[import-untyped]
doc = Document(html, url=url)
from bs4 import BeautifulSoup
soup = BeautifulSoup(doc.summary(), "html.parser")
return soup.get_text(separator="\n", strip=True) or None
except Exception:
return None