Source code for pyfetcher.scrape.selectors

"""CSS selector-based extraction for :mod:`pyfetcher`.

Purpose:
    Provide ergonomic functions for extracting data from HTML using CSS
    selectors via BeautifulSoup. Covers common patterns: selecting elements,
    extracting text, extracting attributes, and parsing HTML tables.

Examples:
    ::

        >>> html = "<div class='item'>Hello</div><div class='item'>World</div>"
        >>> extract_text(html, ".item")
        ['Hello', 'World']
"""

from __future__ import annotations

from typing import Any

from bs4 import BeautifulSoup, Tag


[docs] def select(html: str, selector: str) -> list[Tag]: """Select all elements matching a CSS selector. Args: html: Raw HTML string to parse. selector: CSS selector string. Returns: A list of matching :class:`bs4.Tag` objects. Examples: :: >>> html = "<ul><li>A</li><li>B</li></ul>" >>> tags = select(html, "li") >>> len(tags) 2 """ soup = BeautifulSoup(html, "html.parser") return soup.select(selector)
[docs] def select_one(html: str, selector: str) -> Tag | None: """Select the first element matching a CSS selector. Args: html: Raw HTML string to parse. selector: CSS selector string. Returns: The first matching :class:`bs4.Tag`, or ``None`` if not found. Examples: :: >>> html = "<h1>Title</h1><h1>Subtitle</h1>" >>> tag = select_one(html, "h1") >>> tag.get_text() 'Title' """ soup = BeautifulSoup(html, "html.parser") return soup.select_one(selector)
[docs] def extract_text(html: str, selector: str, *, strip: bool = True) -> list[str]: """Extract text content from all elements matching a CSS selector. Args: html: Raw HTML string to parse. selector: CSS selector string. strip: Whether to strip whitespace from each text result. Returns: A list of text strings from matching elements. Examples: :: >>> html = "<p>Hello</p><p>World</p>" >>> extract_text(html, "p") ['Hello', 'World'] """ tags = select(html, selector) return [tag.get_text(strip=strip) for tag in tags]
[docs] def extract_attrs( html: str, selector: str, *, attrs: list[str] | None = None, ) -> list[dict[str, Any]]: """Extract attributes from all elements matching a CSS selector. If ``attrs`` is not specified, all attributes of each element are returned. If ``attrs`` is a list of attribute names, only those attributes are included (with ``None`` for missing attributes). Args: html: Raw HTML string to parse. selector: CSS selector string. attrs: Optional list of attribute names to extract. Returns: A list of dictionaries mapping attribute names to values. Examples: :: >>> html = '<a href="/about">About</a><a href="/home">Home</a>' >>> extract_attrs(html, "a", attrs=["href"]) [{'href': '/about'}, {'href': '/home'}] """ tags = select(html, selector) results: list[dict[str, Any]] = [] for tag in tags: if attrs is None: results.append(dict(tag.attrs)) else: results.append({attr: tag.get(attr) for attr in attrs}) return results
[docs] def extract_table( html: str, selector: str = "table", *, include_headers: bool = True, ) -> list[list[str]]: """Extract data from an HTML table as a list of rows. Parses the first ``<table>`` element matching the selector. If ``include_headers`` is ``True``, the first row will contain header cell (``<th>``) text. Subsequent rows contain data cell (``<td>``) text. Args: html: Raw HTML string to parse. selector: CSS selector targeting the table element. include_headers: Whether to include ``<th>`` cells as the first row. Returns: A list of rows, where each row is a list of cell text strings. Examples: :: >>> html = "<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>" >>> extract_table(html) [['Name'], ['Alice']] """ soup = BeautifulSoup(html, "html.parser") table = soup.select_one(selector) if table is None: return [] rows: list[list[str]] = [] if include_headers: header_row = table.find("tr") if header_row and isinstance(header_row, Tag): headers = [th.get_text(strip=True) for th in header_row.find_all(["th", "td"])] if headers: rows.append(headers) for tr in table.find_all("tr"): if not isinstance(tr, Tag): continue cells = tr.find_all("td") if cells: rows.append([td.get_text(strip=True) for td in cells]) return rows