# Source code for pyfetcher.scrape.content

"""Content extraction for :mod:`pyfetcher`.

Purpose:
    Extract readable text content from HTML by stripping scripts, styles,
    and navigation elements to isolate the main body text.

Examples:
    ::

        >>> html = "<html><body><p>Hello World</p><script>var x=1;</script></body></html>"
        >>> extract_readable_text(html)
        'Hello World'
"""

from __future__ import annotations

import re

from bs4 import BeautifulSoup

# Tag names removed (together with everything nested inside them) before any
# text is extracted.  The first group is code / embedded content, the second
# is page chrome surrounding the main body.
_STRIP_TAGS = frozenset(
    ("script", "style", "noscript", "iframe", "svg")
    + ("nav", "footer", "header")
)

# Matches a run of three or more consecutive newlines.
_WHITESPACE_RE = re.compile(r"\n{3,}")



# [docs]
def extract_readable_text(
    html: str,
    *,
    strip_tags: frozenset[str] | None = None,
    selector: str | None = None,
) -> str:
    r"""Extract readable text content from HTML.

    Removes scripts, styles, navigation, and other non-content elements
    from the HTML, then extracts and normalizes the text content.
    Optionally targets a specific element via CSS selector.

    Args:
        html: Raw HTML string to process.
        strip_tags: Set of tag names to remove before text extraction.
            ``None`` (the default) selects scripts, styles, noscript,
            iframe, svg, nav, footer, and header. An explicitly passed
            empty set disables stripping altogether.
        selector: Optional CSS selector to narrow extraction to a specific
            element (e.g. ``'article'``, ``'main'``, ``'.content'``).
            Selection runs after stripping; when nothing matches, the
            whole (stripped) document is used instead.

    Returns:
        The text of the target element: each text fragment is stripped,
        fragments are joined with single newlines, and any run of three or
        more newlines is collapsed to two.

    Examples:
        ::

            >>> html = "<div><p>First.</p><p>Second.</p><script>x=1</script></div>"
            >>> extract_readable_text(html)
            'First.\nSecond.'
    """
    soup = BeautifulSoup(html, "html.parser")

    # Only ``None`` means "use the defaults"; a plain truthiness test would
    # silently replace a caller's empty set with the default tag list.
    tags_to_strip = _STRIP_TAGS if strip_tags is None else strip_tags

    # Skip the search for an empty set: an empty name filter may be treated
    # by bs4 as "match every tag", which would wipe the whole document.
    if tags_to_strip:
        for tag in soup.find_all(list(tags_to_strip)):
            tag.decompose()

    target = soup
    if selector:
        selected = soup.select_one(selector)
        # No match: deliberately fall back to the full document.
        if selected is not None:
            target = selected

    text = target.get_text(separator="\n", strip=True)
    # Newlines embedded inside a single text node survive get_text(); cap
    # such runs at one blank line.
    text = _WHITESPACE_RE.sub("\n\n", text)
    return text.strip()