Source code for pyfetcher.scrape.sitemap

"""Sitemap parser for :mod:`pyfetcher`.

Purpose:
    Parse XML sitemaps (both sitemap index files and URL set files) and
    extract URL entries with their metadata.

Examples:
    ::

        >>> xml = '<?xml version="1.0"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>https://example.com/</loc></url></urlset>'
        >>> entries = parse_sitemap(xml)
        >>> entries[0].loc
        'https://example.com/'
"""

from __future__ import annotations

from dataclasses import dataclass
from xml.etree import ElementTree  # nosec B405

from defusedxml.ElementTree import fromstring as _safe_fromstring

SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"


[docs] @dataclass(frozen=True, slots=True) class SitemapEntry: """A single URL entry from a sitemap. Args: loc: The URL location. lastmod: The last modification date string, if present. changefreq: The change frequency hint, if present. priority: The priority value as a string, if present. is_sitemap: Whether this entry is a sitemap index reference. Examples: :: >>> entry = SitemapEntry(loc="https://example.com/") >>> entry.loc 'https://example.com/' """ loc: str lastmod: str | None = None changefreq: str | None = None priority: str | None = None is_sitemap: bool = False
def parse_sitemap(xml_content: str) -> list[SitemapEntry]:
    """Parse an XML sitemap or sitemap index.

    Handles both ``<urlset>`` (URL sitemaps) and ``<sitemapindex>``
    (sitemap index files). Returns a flat list of entries with the
    ``is_sitemap`` flag set for index entries. ``<url>`` entries are
    listed first, followed by ``<sitemap>`` entries.

    Only direct children of the root element that live in the
    ``SITEMAP_NS`` namespace are considered. Entries whose ``<loc>`` is
    missing, empty, or contains only whitespace are skipped.

    Args:
        xml_content: Raw XML string content of the sitemap.

    Returns:
        A list of :class:`SitemapEntry` objects.

    Raises:
        Exception: Parsing is delegated to ``defusedxml``'s ``fromstring``;
            whatever it raises for malformed or forbidden XML propagates
            unchanged to the caller.

    Examples:
        ::

            >>> xml = (
            ...     '<?xml version="1.0"?>'
            ...     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
            ...     '<url><loc>https://example.com/</loc><priority>1.0</priority></url>'
            ...     '</urlset>'
            ... )
            >>> entries = parse_sitemap(xml)
            >>> entries[0].priority
            '1.0'
    """
    root = _safe_fromstring(xml_content)
    entries: list[SitemapEntry] = []

    # Handle <urlset> (standard sitemap)
    for url_elem in root.findall(f"{{{SITEMAP_NS}}}url"):
        # Strip *before* testing for emptiness so a whitespace-only <loc>
        # is skipped instead of producing an entry with loc == "".
        loc = _get_text(url_elem, "loc")
        if not loc:
            continue
        entries.append(
            SitemapEntry(
                loc=loc,
                lastmod=_get_text(url_elem, "lastmod"),
                changefreq=_get_text(url_elem, "changefreq"),
                priority=_get_text(url_elem, "priority"),
                is_sitemap=False,
            )
        )

    # Handle <sitemapindex> (sitemap index)
    for sitemap_elem in root.findall(f"{{{SITEMAP_NS}}}sitemap"):
        loc = _get_text(sitemap_elem, "loc")
        if not loc:
            continue
        entries.append(
            SitemapEntry(
                loc=loc,
                lastmod=_get_text(sitemap_elem, "lastmod"),
                is_sitemap=True,
            )
        )

    return entries
def _get_text(parent: ElementTree.Element, tag: str) -> str | None:
    """Get text content of a child element.

    Looks up the first direct child of ``parent`` named ``tag`` in the
    ``SITEMAP_NS`` namespace.

    Args:
        parent: The parent XML element.
        tag: The child tag name (without namespace prefix).

    Returns:
        The stripped text content, or ``None`` if the element doesn't
        exist, has no text, or contains only whitespace.
    """
    elem = parent.find(f"{{{SITEMAP_NS}}}{tag}")
    if elem is None or elem.text is None:
        return None
    # Collapse whitespace-only content to None so that "<tag> </tag>" is
    # treated the same as "<tag></tag>" rather than yielding "".
    return elem.text.strip() or None