# Source code for pyfetcher.scrape.robots
r"""Robots.txt parser for :mod:`pyfetcher`.
Purpose:
Parse ``robots.txt`` files and check URL access permissions for a given
user-agent. Supports ``Allow``, ``Disallow``, ``Crawl-delay``, and
``Sitemap`` directives.
Examples:
::
>>> txt = "User-agent: *\nDisallow: /admin"
>>> rules = parse_robots_txt(txt)
>>> is_allowed(rules, "/admin", user_agent="*")
False
"""
from __future__ import annotations
from dataclasses import dataclass, field
[docs]
@dataclass
class RobotsRules:
    """Container for the directives extracted from one robots.txt file.

    Every field defaults to a fresh empty container, so instances never
    share state with one another.

    Args:
        rules: Per user-agent list of ``(allow, path)`` tuples, where
            ``allow`` is ``True`` for an ``Allow`` directive and ``False``
            for a ``Disallow`` directive.
        sitemaps: Sitemap URLs listed in the file.
        crawl_delays: Per user-agent crawl delay, in seconds.

    Examples:
        ::

            >>> empty = RobotsRules()
            >>> empty.rules
            {}
            >>> empty.sitemaps
            []
    """

    # user-agent pattern -> ordered (allow, path) pairs
    rules: dict[str, list[tuple[bool, str]]] = field(default_factory=dict)
    # sitemap URLs
    sitemaps: list[str] = field(default_factory=list)
    # user-agent pattern -> delay in seconds
    crawl_delays: dict[str, float] = field(default_factory=dict)
[docs]
def parse_robots_txt(content: str) -> RobotsRules:
    r"""Parse the text of a robots.txt file.

    Extracts ``User-agent``, ``Allow``, ``Disallow``, ``Crawl-delay``,
    and ``Sitemap`` directives into a structured :class:`RobotsRules`
    object. Directive names and user-agent values are matched
    case-insensitively; ``#`` starts a comment; lines without a ``:`` and
    unknown directives are ignored.

    Consecutive ``User-agent`` lines form a single group, so the rules
    that follow apply to every agent named in that run. A later
    ``User-agent`` line that comes after a rule starts a new group.
    Rules for an agent that appears in several groups are accumulated.

    Args:
        content: The raw text content of a robots.txt file.

    Returns:
        A :class:`RobotsRules` object containing parsed directives.

    Examples:
        ::

            >>> txt = "User-agent: *\nDisallow: /secret\nAllow: /public"
            >>> rules = parse_robots_txt(txt)
            >>> len(rules.rules.get("*", []))
            2
            >>> grouped = parse_robots_txt("User-agent: a\nUser-agent: b\nDisallow: /x")
            >>> sorted(grouped.rules)
            ['a', 'b']
            >>> grouped.rules["a"]
            [(False, '/x')]
    """
    result = RobotsRules()
    current_agents: list[str] = []
    # True while we are still reading the run of User-agent lines that opens
    # a group; the first rule line closes the run.
    collecting_agents = False

    # A leading byte-order mark is not whitespace for str.strip() and would
    # otherwise glue itself onto the first directive name.
    for raw_line in content.lstrip("\ufeff").splitlines():
        line = raw_line.split("#", 1)[0].strip()
        if ":" not in line:  # also covers blank and comment-only lines
            continue
        directive, _, value = line.partition(":")
        directive = directive.strip().lower()
        value = value.strip()

        if directive == "user-agent":
            agent = value.lower()
            if not collecting_agents:
                # First User-agent after a rule (or at file start): new group.
                current_agents = []
                collecting_agents = True
            if agent not in current_agents:
                current_agents.append(agent)
            # Register the agent even if its group turns out to have no rules.
            result.rules.setdefault(agent, [])
        elif directive == "sitemap":
            # Sitemap lines are independent of groups and do not close the
            # current run of User-agent lines.
            if value:
                result.sitemaps.append(value)
        elif directive == "disallow":
            collecting_agents = False
            # An empty Disallow restricts nothing, so no rule is recorded.
            if value:
                for agent in current_agents:
                    result.rules[agent].append((False, value))
        elif directive == "allow":
            collecting_agents = False
            for agent in current_agents:
                result.rules[agent].append((True, value))
        elif directive == "crawl-delay":
            collecting_agents = False
            try:
                delay = float(value)
            except ValueError:
                continue  # malformed delay: ignore the line
            # float() also accepts "nan", "inf" and negatives, none of which
            # is a usable delay; the chained comparison rejects all three.
            if 0 <= delay < float("inf"):
                for agent in current_agents:
                    result.crawl_delays[agent] = delay
    return result
[docs]
def is_allowed(
    rules: RobotsRules,
    path: str,
    *,
    user_agent: str = "*",
) -> bool:
    r"""Check if a path is allowed for the given user-agent.

    Looks up the rule list stored under the lower-cased ``user_agent``;
    when no group exists for that agent, the ``'*'`` group is used. A
    group that exists but has no rules (for example one containing only
    an empty ``Disallow:``) allows everything and does **not** fall back
    to ``'*'``.

    Among the rules whose path is a prefix of ``path``, the longest one
    wins; on a tie in length, ``Allow`` beats ``Disallow``. If no rule
    matches, the path is allowed.

    Args:
        rules: Parsed robots.txt rules from :func:`parse_robots_txt`.
        path: The URL path to check (e.g. ``'/admin/settings'``). An
            empty path is treated as ``'/'``.
        user_agent: The user-agent string to check against. Defaults to
            ``'*'`` (wildcard).

    Returns:
        ``True`` if the path is allowed, ``False`` if disallowed.

    Examples:
        ::

            >>> txt = "User-agent: *\nDisallow: /admin\nAllow: /admin/public"
            >>> rules = parse_robots_txt(txt)
            >>> is_allowed(rules, "/admin/settings")
            False
            >>> is_allowed(rules, "/admin/public")
            True
    """
    agent_key = user_agent.lower()
    # Membership test rather than truthiness: an agent with an explicit but
    # empty rule list must not inherit the wildcard group's restrictions.
    if agent_key in rules.rules:
        agent_rules = rules.rules[agent_key]
    else:
        agent_rules = rules.rules.get("*", [])

    # A URL with an empty path addresses the site root.
    target = path or "/"

    verdict = True  # default when nothing matches
    best_length = -1  # shorter than any real rule, so the first match wins
    for allowed, rule_path in agent_rules:
        # startswith("") is always true, so empty rule paths match everything
        # at the lowest possible specificity.
        if not target.startswith(rule_path):
            continue
        length = len(rule_path)
        if length > best_length or (length == best_length and allowed):
            verdict, best_length = allowed, length
    return verdict