# Source code for pyfetcher.metadata.html

"""Basic HTML metadata extraction for :mod:`pyfetcher`.

Purpose:
    Provide lightweight HTML parsing for titles, descriptions, canonical links,
    and icon links using BeautifulSoup.

Design:
    - Parsing uses :mod:`bs4` for readability and robustness.
    - This module intentionally handles only the most common HTML-level fields.
    - Open Graph extraction is delegated to :mod:`pyfetcher.metadata.opengraph`.

Examples:
    ::

        >>> html = "<html><head><title>Example</title></head></html>"
        >>> extract_basic_html_metadata(html).title
        'Example'
"""

from __future__ import annotations

from urllib.parse import urljoin

from bs4 import BeautifulSoup

from pyfetcher.metadata.models import FaviconLink, PageMetadata



# [docs]
def extract_basic_html_metadata(html: str, *, base_url: str | None = None) -> PageMetadata:
    """Extract basic HTML page metadata.

    Parses the ``<title>``, ``<meta name="description">``,
    ``<link rel="canonical">``, and favicon ``<link>`` elements from the
    given HTML string. Relative URLs are resolved against ``base_url``
    when provided.

    Matching rules:
        - The ``<meta>`` ``name`` attribute is compared case-insensitively,
          so ``name="Description"`` is recognised. The first matching tag
          wins.
        - ``rel`` is treated as a set of whitespace-separated,
          case-insensitive tokens. A link is canonical when it carries the
          ``canonical`` token and is an icon when it carries any token in
          ``_ICON_REL_TOKENS`` (so ``rel="shortcut icon"`` matches but an
          unrelated value that merely contains the letters ``icon`` does
          not).
        - Only the first canonical link with a non-empty ``href`` is used.
        - Links without a usable ``href`` are ignored.
        - Title and description are stripped; values that are empty after
          stripping are reported as ``None``.

    Args:
        html: Raw HTML string to parse.
        base_url: Optional base URL for resolving relative link hrefs.
            When omitted, hrefs are returned as written in the document.

    Returns:
        A :class:`~pyfetcher.metadata.models.PageMetadata` populated with
        the extracted fields. ``favicons`` preserves document order.

    Examples:
        ::

            >>> html = (
            ...     "<html><head><title>Example</title>"
            ...     "<meta name='description' content='Desc' />"
            ...     "<link rel='icon' href='/favicon.ico' />"
            ...     "</head></html>"
            ... )
            >>> meta = extract_basic_html_metadata(html, base_url="https://example.com")
            >>> meta.title
            'Example'
    """
    soup = BeautifulSoup(html, "html.parser")

    # ``.string`` is None when <title> has no single text child; that case
    # is reported as "no title".
    title = _clean_text(soup.title.string) if soup.title else None

    description: str | None = None
    for meta in soup.find_all("meta"):
        meta_name = str(meta.get("name") or "").strip().lower()
        if meta_name == "description":
            description = _clean_text(meta.get("content"))
            break

    resolve_against = base_url or ""
    canonical_url: str | None = None
    favicons: list[FaviconLink] = []

    # Single pass over <link> elements: the same tag is checked for both
    # roles, so a link is never normalised twice.
    for link in soup.find_all("link"):
        # Surrounding whitespace is not part of the URL.
        href = str(link.get("href") or "").strip()
        if not href:
            continue

        tokens = _rel_tokens(link)

        if canonical_url is None and "canonical" in tokens:
            canonical_url = urljoin(resolve_against, href)

        if not _ICON_REL_TOKENS.isdisjoint(tokens):
            favicons.append(
                FaviconLink(
                    href=urljoin(resolve_against, href),
                    rel=" ".join(tokens),
                    sizes=link.get("sizes"),
                    mime_type=link.get("type"),
                )
            )

    return PageMetadata(
        title=title,
        description=description,
        canonical_url=canonical_url,
        favicons=favicons,
    )


# Link-relation tokens that mark a ``<link>`` element as a site icon.
_ICON_REL_TOKENS = frozenset(
    {
        "icon",
        "apple-touch-icon",
        "apple-touch-icon-precomposed",
        "mask-icon",
        "fluid-icon",
    }
)


def _rel_tokens(tag) -> list[str]:
    """Return the lower-cased ``rel`` tokens of a parsed tag, in order."""
    raw = tag.get("rel")
    if raw is None:
        return []
    # bs4 normally yields a list for ``rel``; handle a plain string too in
    # case multi-valued attribute splitting is disabled.
    pieces = raw if isinstance(raw, list) else str(raw).split()
    return [str(piece).lower() for piece in pieces if piece]


def _clean_text(value) -> str | None:
    """Strip ``value`` and return it, or ``None`` when nothing is left."""
    if not value:
        return None
    return str(value).strip() or None