Source code for pyfetcher.scrape.links

"""Link extraction for :mod:`pyfetcher`.

Purpose:
    Harvest and normalize links from HTML documents, supporting filtering
    by domain, scheme, and link attributes.

Examples:
    ::

        >>> html = '<a href="https://example.com">Example</a>'
        >>> links = extract_links(html, base_url="https://example.com")
        >>> links[0].url
        'https://example.com'
"""

from __future__ import annotations

from dataclasses import dataclass
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup



[docs]
@dataclass(frozen=True, slots=True)
class LinkInfo:
    """Extracted link information.

    Args:
        url: The resolved absolute URL.
        text: The link's visible text content.
        rel: The ``rel`` attribute value, if present.
        is_external: Whether the link points to a different domain.

    Examples:
        ::

            >>> link = LinkInfo(
            ...     url="https://example.com", text="Example",
            ...     rel=None, is_external=False,
            ... )
            >>> link.url
            'https://example.com'
    """

    url: str
    text: str
    rel: str | None
    is_external: bool




[docs]
def extract_links(
    html: str,
    *,
    base_url: str | None = None,
    same_domain_only: bool = False,
    include_fragments: bool = False,
) -> list[LinkInfo]:
    """Extract and normalize links from HTML.

    Parses all ``<a>`` tags with ``href`` attributes and resolves relative
    URLs against ``base_url``. Optionally filters to same-domain links
    only and controls whether fragment-only links are included.

    Args:
        html: Raw HTML string to parse.
        base_url: Base URL for resolving relative hrefs. Required for
            accurate ``is_external`` detection and relative URL resolution.
        same_domain_only: If ``True``, only return links pointing to the
            same domain as ``base_url``.
        include_fragments: If ``True``, include fragment-only links
            (e.g. ``#section``). Defaults to ``False``.

    Returns:
        A list of :class:`LinkInfo` objects for each extracted link.

    Examples:
        ::

            >>> html = '<a href="/about">About</a><a href="https://other.com">Other</a>'
            >>> links = extract_links(html, base_url="https://example.com")
            >>> len(links)
            2
            >>> links = extract_links(html, base_url="https://example.com", same_domain_only=True)
            >>> len(links)
            1
    """
    soup = BeautifulSoup(html, "html.parser")
    base_domain = urlparse(base_url).netloc if base_url else ""
    results: list[LinkInfo] = []

    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()

        if not href or href.startswith(("javascript:", "mailto:", "tel:")):
            continue

        if href.startswith("#") and not include_fragments:
            continue

        resolved = urljoin(base_url or "", href)
        parsed = urlparse(resolved)

        if parsed.scheme not in ("http", "https"):
            continue

        link_domain = parsed.netloc
        is_external = link_domain != base_domain if base_domain else True

        if same_domain_only and is_external:
            continue

        rel_values = anchor.get("rel", [])
        rel_text = " ".join(rel_values) if isinstance(rel_values, list) else str(rel_values)

        results.append(
            LinkInfo(
                url=resolved,
                text=anchor.get_text(strip=True),
                rel=rel_text or None,
                is_external=is_external,
            )
        )

    return results