# Source code for pyfetcher.scrape.links

"""Link extraction for :mod:`pyfetcher`.

Purpose:
    Harvest and normalize links from HTML documents, supporting filtering
    by domain, scheme, and link attributes.

Examples:
    ::

        >>> html = '<a href="https://example.com">Example</a>'
        >>> links = extract_links(html, base_url="https://example.com")
        >>> links[0].url
        'https://example.com'
"""

from __future__ import annotations

from dataclasses import dataclass
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup


[docs] @dataclass(frozen=True, slots=True) class LinkInfo: """Extracted link information. Args: url: The resolved absolute URL. text: The link's visible text content. rel: The ``rel`` attribute value, if present. is_external: Whether the link points to a different domain. Examples: :: >>> link = LinkInfo( ... url="https://example.com", text="Example", ... rel=None, is_external=False, ... ) >>> link.url 'https://example.com' """ url: str text: str rel: str | None is_external: bool