# Source code for pyfetcher.metadata.extruct

"""Structured metadata extraction for :mod:`pyfetcher`.

Purpose:
    Run :mod:`extruct` against HTML and combine it with lighter HTML/Open Graph
    parsing helpers for comprehensive metadata extraction.

Design:
    - ``extruct`` is imported lazily so users can keep it optional.
    - Relative URLs are resolved with ``w3lib.html.get_base_url``.
    - The output is normalized into :class:`~pyfetcher.metadata.models.PageMetadata`.

Examples:
    ::

        >>> html = "<html><head><title>Example</title></head></html>"
        >>> meta = extract_extruct_metadata(html, page_url="https://example.com")
        >>> meta.title
        'Example'
"""

from __future__ import annotations

from pyfetcher.metadata.html import extract_basic_html_metadata
from pyfetcher.metadata.models import PageMetadata
from pyfetcher.metadata.opengraph import extract_open_graph_metadata


def extract_extruct_metadata(html: str, *, page_url: str) -> PageMetadata:
    """Build a :class:`PageMetadata` from HTML helpers plus ``extruct`` output.

    The basic HTML extractor supplies the starting model. A copy of that
    model is returned with two fields replaced: ``open_graph`` (from the
    Open Graph helper) and ``structured`` (the raw result of
    ``extruct.extract``).

    Args:
        html: Raw HTML string to parse.
        page_url: URL of the page. It is handed to
            ``w3lib.html.get_base_url`` together with ``html`` to obtain
            the base URL passed on to the basic HTML extractor and to
            ``extruct``.

    Returns:
        A copy of the basic
        :class:`~pyfetcher.metadata.models.PageMetadata` with its
        ``open_graph`` and ``structured`` fields set.

    Raises:
        ImportError: If ``extruct`` or ``w3lib`` is not installed. Both are
            imported inside the function so they remain optional
            dependencies of the package.

    Examples:
        ::

            >>> meta = extract_extruct_metadata(
            ...     "<html><head><title>Example</title></head></html>",
            ...     page_url="https://example.com",
            ... )
            >>> meta.title
            'Example'
    """
    # Deferred imports: importing this module must not require the optional
    # structured-data dependencies.
    import extruct
    from w3lib.html import get_base_url as resolve_base_url

    # Syntaxes requested from extruct, in the order they are passed through.
    requested_syntaxes = [
        "json-ld",
        "opengraph",
        "microdata",
        "microformat",
        "rdfa",
        "dublincore",
    ]

    document_base = resolve_base_url(html, page_url)

    # The lightweight extractors run first; their result is the model that
    # the remaining data is layered onto.
    page_metadata = extract_basic_html_metadata(html, base_url=document_base)

    overrides = {"open_graph": extract_open_graph_metadata(html)}
    overrides["structured"] = extruct.extract(
        html,
        base_url=document_base,
        syntaxes=requested_syntaxes,
    )

    return page_metadata.model_copy(update=overrides)