Source code for pyfetcher.scrape.content

"""Content extraction for :mod:`pyfetcher`.

Purpose:
    Extract readable text content from HTML by stripping scripts, styles,
    and navigation elements to isolate the main body text.

Examples:
    ::

        >>> html = "<html><body><p>Hello World</p><script>var x=1;</script></body></html>"
        >>> extract_readable_text(html)
        'Hello World'
"""

from __future__ import annotations

import re

from bs4 import BeautifulSoup

# Tag names removed by default before text extraction: executable or
# presentational content (script/style/noscript/iframe/svg) and page chrome
# (nav/footer/header).
_STRIP_TAGS = frozenset(
    (
        "script",
        "style",
        "noscript",
        "iframe",
        "svg",
        "nav",
        "footer",
        "header",
    )
)

# Matches a run of three or more consecutive newlines, so such runs can be
# collapsed to a single blank line. Despite the name, it does not match
# spaces, tabs, or carriage returns.
_WHITESPACE_RE = re.compile(r"\n{3,}")


def extract_readable_text(
    html: str,
    *,
    strip_tags: frozenset[str] | None = None,
    selector: str | None = None,
) -> str:
    r"""Extract readable text content from HTML.

    Removes scripts, styles, navigation, and other non-content elements
    from the HTML, then extracts and normalizes the text content.
    Optionally targets a specific element via CSS selector.

    Args:
        html: Raw HTML string to process.
        strip_tags: Set of tag names to remove before text extraction.
            ``None`` (the default) selects scripts, styles, noscript,
            iframe, svg, nav, footer, and header. An explicitly empty set
            means "strip nothing".
        selector: Optional CSS selector to narrow extraction to a specific
            element (e.g. ``'article'``, ``'main'``, ``'.content'``). Only
            the first match is used. If nothing matches, the whole
            document is used instead. Selection happens after stripping,
            so a selector that targets a stripped element will not match.

    Returns:
        Cleaned, readable text: text fragments are stripped and joined
        with newlines, runs of three or more newlines are collapsed to a
        blank line, and leading/trailing whitespace is removed.

    Examples:
        ::

            >>> html = "<div><p>First.</p><p>Second.</p><script>x=1</script></div>"
            >>> extract_readable_text(html)
            'First.\nSecond.'
    """
    soup = BeautifulSoup(html, "html.parser")

    # Compare against None rather than relying on truthiness, so that an
    # explicitly passed empty set is honored instead of silently being
    # replaced by the defaults.
    tags_to_strip = _STRIP_TAGS if strip_tags is None else strip_tags

    # An empty tag filter is not a reliable "match nothing" in BeautifulSoup
    # (some versions treat it as "match every tag"), so skip the search
    # entirely when there is nothing to strip.
    if tags_to_strip:
        for tag in soup.find_all(list(tags_to_strip)):
            tag.decompose()

    # Default to the whole document; narrow only when the selector hits.
    target = soup
    if selector:
        selected = soup.select_one(selector)
        if selected is not None:
            target = selected

    text = target.get_text(separator="\n", strip=True)
    # Individual text nodes may still contain long newline runs internally
    # (e.g. preformatted text); collapse those to a single blank line.
    text = _WHITESPACE_RE.sub("\n\n", text)
    return text.strip()