Source code for pyfetcher.scrape.sitemap

"""Sitemap parser for :mod:`pyfetcher`.

Purpose:
    Parse XML sitemaps (both sitemap index files and URL set files) and
    extract URL entries with their metadata.

Examples:
    ::

        >>> xml = '<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>https://example.com/</loc></url></urlset>'
        >>> entries = parse_sitemap(xml)
        >>> entries[0].loc
        'https://example.com/'
"""

from __future__ import annotations

from dataclasses import dataclass
from xml.etree import ElementTree  # nosec B405

from defusedxml.ElementTree import fromstring as _safe_fromstring

SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"



[docs]
@dataclass(frozen=True, slots=True)
class SitemapEntry:
    """A single URL entry from a sitemap.

    Args:
        loc: The URL location.
        lastmod: The last modification date string, if present.
        changefreq: The change frequency hint, if present.
        priority: The priority value as a string, if present.
        is_sitemap: Whether this entry is a sitemap index reference.

    Examples:
        ::

            >>> entry = SitemapEntry(loc="https://example.com/")
            >>> entry.loc
            'https://example.com/'
    """

    loc: str
    lastmod: str | None = None
    changefreq: str | None = None
    priority: str | None = None
    is_sitemap: bool = False




[docs]
def parse_sitemap(xml_content: str) -> list[SitemapEntry]:
    """Parse an XML sitemap or sitemap index.

    Handles both ``<urlset>`` (URL sitemaps) and ``<sitemapindex>``
    (sitemap index files). Returns a flat list of entries with the
    ``is_sitemap`` flag set for index entries.

    Only direct children of the document root that live in the standard
    sitemap namespace (:data:`SITEMAP_NS`) are considered. Entries whose
    ``<loc>`` is missing, empty, or contains only whitespace are skipped,
    so every returned entry has a non-empty ``loc``.

    Args:
        xml_content: Raw XML string content of the sitemap.

    Returns:
        A list of :class:`SitemapEntry` objects: all ``<url>`` entries in
        document order, followed by all ``<sitemap>`` entries in document
        order.

    Raises:
        Any exception raised by the underlying safe XML parser for
        malformed or rejected input propagates unchanged (nothing is
        caught here).

    Examples:
        ::

            >>> xml = (
            ...     '<?xml version="1.0"?>'
            ...     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
            ...     '<url><loc>https://example.com/</loc><priority>1.0</priority></url>'
            ...     '</urlset>'
            ... )
            >>> entries = parse_sitemap(xml)
            >>> entries[0].priority
            '1.0'
    """
    root = _safe_fromstring(xml_content)
    entries: list[SitemapEntry] = []

    # Handle <urlset> (standard sitemap)
    for url_elem in root.findall(f"{{{SITEMAP_NS}}}url"):
        loc = _get_text(url_elem, "loc")
        # ``not loc`` covers a missing <loc>, an empty one, and one holding
        # only whitespace (pretty-printed XML); none of them is a usable URL.
        if not loc:
            continue
        entries.append(
            SitemapEntry(
                loc=loc,
                lastmod=_get_text(url_elem, "lastmod"),
                changefreq=_get_text(url_elem, "changefreq"),
                priority=_get_text(url_elem, "priority"),
                is_sitemap=False,
            )
        )

    # Handle <sitemapindex> (sitemap index)
    for sitemap_elem in root.findall(f"{{{SITEMAP_NS}}}sitemap"):
        loc = _get_text(sitemap_elem, "loc")
        if not loc:
            continue
        entries.append(
            SitemapEntry(
                loc=loc,
                lastmod=_get_text(sitemap_elem, "lastmod"),
                is_sitemap=True,
            )
        )

    return entries



def _get_text(parent: ElementTree.Element, tag: str) -> str | None:
    """Get the stripped text content of a namespaced child element.

    The child is looked up as a direct child of ``parent`` in the sitemap
    namespace (:data:`SITEMAP_NS`).

    Args:
        parent: The parent XML element.
        tag: The child tag name (without namespace prefix).

    Returns:
        The text with surrounding whitespace removed, or ``None`` if the
        element doesn't exist, has no text, or contains only whitespace.
    """
    elem = parent.find(f"{{{SITEMAP_NS}}}{tag}")
    if elem is None or elem.text is None:
        return None
    # ``or None`` makes a whitespace-only element behave like an empty one,
    # so callers never receive "" as if it were a real value.
    return elem.text.strip() or None