Source code for pyfetcher.scrape.selectors

"""CSS selector-based extraction for :mod:`pyfetcher`.

Purpose:
    Provide ergonomic functions for extracting data from HTML using CSS
    selectors via BeautifulSoup. Covers common patterns: selecting elements,
    extracting text, extracting attributes, and parsing HTML tables.

Examples:
    ::

        >>> html = "<div class='item'>Hello</div><div class='item'>World</div>"
        >>> extract_text(html, ".item")
        ['Hello', 'World']
"""

from __future__ import annotations

from typing import Any

from bs4 import BeautifulSoup, Tag



[docs]
def select(html: str, selector: str) -> list[Tag]:
    """Return every element of ``html`` that matches ``selector``.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and the
    selector is evaluated against the resulting document.

    Args:
        html: Raw HTML string to parse.
        selector: CSS selector string.

    Returns:
        The matching :class:`bs4.Tag` objects, exactly as produced by
        ``BeautifulSoup.select``.

    Examples:
        ::

            >>> html = "<ul><li>A</li><li>B</li></ul>"
            >>> [tag.get_text() for tag in select(html, "li")]
            ['A', 'B']
    """
    return BeautifulSoup(html, "html.parser").select(selector)




[docs]
def select_one(html: str, selector: str) -> Tag | None:
    """Return the first element of ``html`` that matches ``selector``.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and the
    lookup is delegated to ``BeautifulSoup.select_one``.

    Args:
        html: Raw HTML string to parse.
        selector: CSS selector string.

    Returns:
        The first matching :class:`bs4.Tag`, or ``None`` when no element
        matches.

    Examples:
        ::

            >>> html = "<h1>Title</h1><h1>Subtitle</h1>"
            >>> first = select_one(html, "h1")
            >>> first.get_text()
            'Title'
    """
    return BeautifulSoup(html, "html.parser").select_one(selector)




[docs]
def extract_text(html: str, selector: str, *, strip: bool = True) -> list[str]:
    """Extract text content from all elements matching a CSS selector.

    Each matching element contributes one string made of all of its
    descendant text. When ``strip`` is true, whitespace is trimmed from the
    two ends of that string only; whitespace between nested elements is kept
    so that adjacent words are not fused together.

    Args:
        html: Raw HTML string to parse.
        selector: CSS selector string.
        strip: Whether to strip leading and trailing whitespace from each
            text result.

    Returns:
        A list of text strings, one per matching element.

    Examples:
        ::

            >>> html = "<p>Hello</p><p>World</p>"
            >>> extract_text(html, "p")
            ['Hello', 'World']
            >>> extract_text("<p> Hello <b>World</b> </p>", "p")
            ['Hello World']
    """
    # ``Tag.get_text(strip=True)`` strips every text fragment *before* joining
    # the fragments with an empty separator, which turns
    # "Hello <b>World</b>" into "HelloWorld". Join first, then trim the ends
    # of the combined string instead.
    texts = [tag.get_text() for tag in select(html, selector)]
    if not strip:
        return texts
    return [text.strip() for text in texts]




[docs]
def extract_attrs(
    html: str,
    selector: str,
    *,
    attrs: list[str] | None = None,
) -> list[dict[str, Any]]:
    """Collect attribute dictionaries for every element matching a selector.

    With ``attrs`` left as ``None`` each element yields a copy of its full
    attribute mapping. With a list of names, each element yields a dictionary
    holding exactly those names, using ``None`` for any attribute the element
    does not carry.

    Args:
        html: Raw HTML string to parse.
        selector: CSS selector string.
        attrs: Optional list of attribute names to extract.

    Returns:
        A list of dictionaries mapping attribute names to values, one per
        matching element.

    Examples:
        ::

            >>> html = '<a href="/about">About</a><a href="/home">Home</a>'
            >>> extract_attrs(html, "a", attrs=["href"])
            [{'href': '/about'}, {'href': '/home'}]
    """
    matched = select(html, selector)

    # No filter requested: hand back a copy of each element's attributes.
    if attrs is None:
        return [dict(element.attrs) for element in matched]

    # Filter requested: ``Tag.get`` supplies ``None`` for absent attributes.
    return [{name: element.get(name) for name in attrs} for element in matched]




[docs]
def extract_table(
    html: str,
    selector: str = "table",
    *,
    include_headers: bool = True,
) -> list[list[str]]:
    """Extract data from an HTML table as a list of rows.

    Parses the first element matching ``selector``. If ``include_headers``
    is ``True``, the first ``<tr>`` of the table is emitted as the header row
    using the text of all of its ``<th>`` and ``<td>`` cells, and that row is
    not emitted a second time as data. Every other ``<tr>`` that contains at
    least one ``<td>`` contributes a data row holding its ``<td>`` text.

    If ``include_headers`` is ``False``, no header row is emitted; rows made
    only of ``<th>`` cells are skipped because they have no ``<td>`` cells.

    Args:
        html: Raw HTML string to parse.
        selector: CSS selector targeting the table element.
        include_headers: Whether to emit the table's first row as a header
            row.

    Returns:
        A list of rows, where each row is a list of cell text strings.
        Returns an empty list when nothing matches ``selector``.

    Examples:
        ::

            >>> html = "<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>"
            >>> extract_table(html)
            [['Name'], ['Alice']]
            >>> html = "<table><tr><td>A</td></tr><tr><td>B</td></tr></table>"
            >>> extract_table(html)
            [['A'], ['B']]
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.select_one(selector)
    if table is None:
        return []

    rows: list[list[str]] = []
    # The <tr> already emitted as the header, if any; the data loop below must
    # skip it so a header row written with <td> cells is not duplicated.
    header_row: Tag | None = None

    if include_headers:
        first_row = table.find("tr")
        if isinstance(first_row, Tag):
            headers = [cell.get_text(strip=True) for cell in first_row.find_all(["th", "td"])]
            if headers:
                rows.append(headers)
                header_row = first_row

    for tr in table.find_all("tr"):
        if not isinstance(tr, Tag):
            continue
        # Compare by identity: ``Tag.__eq__`` compares content, so ``==``
        # would also drop later rows that merely look like the header row.
        if tr is header_row:
            continue
        cells = tr.find_all("td")
        if cells:
            rows.append([td.get_text(strip=True) for td in cells])

    return rows