Source code for pyfetcher.metadata.html
"""Basic HTML metadata extraction for :mod:`pyfetcher`.
Purpose:
Provide lightweight HTML parsing for titles, descriptions, canonical links,
and icon links using BeautifulSoup.
Design:
- Parsing uses :mod:`bs4` for readability and robustness.
- This module intentionally handles only the most common HTML-level fields.
- Open Graph extraction is delegated to :mod:`pyfetcher.metadata.opengraph`.
Examples:
::
>>> html = "<html><head><title>Example</title></head></html>"
>>> extract_basic_html_metadata(html).title
'Example'
"""
from __future__ import annotations
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from pyfetcher.metadata.models import FaviconLink, PageMetadata
[docs]
def extract_basic_html_metadata(html: str, *, base_url: str | None = None) -> PageMetadata:
    """Extract basic HTML page metadata.

    Parses the ``<title>``, ``<meta name="description">``,
    ``<link rel="canonical">``, and favicon ``<link>`` elements from the
    given HTML string. Relative URLs are resolved against ``base_url``
    when provided.

    The ``name`` attribute of ``<meta>`` elements is compared
    case-insensitively, so ``<meta name="Description">`` is recognised.
    A title, description, or ``href`` that is empty after stripping
    surrounding whitespace is treated as absent.

    Args:
        html: Raw HTML string to parse.
        base_url: Optional base URL for resolving relative link hrefs.

    Returns:
        A :class:`~pyfetcher.metadata.models.PageMetadata` populated with
        the extracted fields. ``title``, ``description`` and
        ``canonical_url`` are ``None`` when no usable value is found;
        ``favicons`` keeps the order in which the matching ``<link>``
        elements were returned by the parser.

    Examples:
        ::

            >>> html = (
            ...     "<html><head><title>Example</title>"
            ...     "<meta name='description' content='Desc' />"
            ...     "<link rel='icon' href='/favicon.ico' />"
            ...     "</head></html>"
            ... )
            >>> meta = extract_basic_html_metadata(html, base_url="https://example.com")
            >>> meta.title
            'Example'
    """
    soup = BeautifulSoup(html, "html.parser")

    # ``.string`` is None when <title> has no single text child, so that
    # case still yields ``None`` exactly as before.
    title_tag = soup.title
    title = _clean_text(title_tag.string) if title_tag is not None else None

    description = _find_description(soup)

    join_base = base_url or ""
    canonical_url: str | None = None
    favicons: list[FaviconLink] = []

    # Single pass over <link> elements: each one may contribute the
    # canonical URL (first match wins) and/or a favicon entry.
    for tag in soup.find_all("link"):
        href = _clean_text(tag.get("href"))
        if href is None:
            # No usable target; nothing to resolve for this element.
            continue

        rel_text = _link_rel_text(tag)

        if canonical_url is None and "canonical" in rel_text:
            canonical_url = urljoin(join_base, href)

        # Substring match keeps every rel value that matched previously
        # ("icon", "shortcut icon", "apple-touch-icon", "mask-icon",
        # "apple-touch-icon-precomposed", ...), since each contains "icon".
        if "icon" in rel_text:
            favicons.append(
                FaviconLink(
                    href=urljoin(join_base, href),
                    rel=rel_text,
                    sizes=tag.get("sizes"),
                    mime_type=tag.get("type"),
                )
            )

    return PageMetadata(
        title=title,
        description=description,
        canonical_url=canonical_url,
        favicons=favicons,
    )


def _clean_text(value) -> str | None:
    """Return ``value`` stripped of surrounding whitespace, or ``None``.

    ``None`` is returned when ``value`` is not a string or is empty after
    stripping, so callers can treat "missing" and "blank" uniformly.
    """
    if not isinstance(value, str):
        return None
    return value.strip() or None


def _link_rel_text(tag) -> str:
    """Return the ``rel`` attribute of ``tag`` as one lowercase string.

    The parser may hand back ``rel`` as a list of tokens or as a plain
    value; both forms are collapsed to a single lowercase string.
    """
    rel_values = tag.get("rel", [])
    if isinstance(rel_values, list):
        return " ".join(rel_values).lower()
    return str(rel_values).lower()


def _find_description(soup) -> str | None:
    """Return the first non-blank ``<meta name="description">`` content.

    The ``name`` attribute is matched case-insensitively. ``None`` is
    returned when no such element carries non-blank ``content``.
    """
    for meta in soup.find_all("meta"):
        name = meta.get("name")
        if not isinstance(name, str) or name.strip().lower() != "description":
            continue
        content = _clean_text(meta.get("content"))
        if content is not None:
            return content
    return None