# Source code for pyfetcher.extractors.convert
"""HTML conversion utilities for :mod:`pyfetcher.extractors`.
Purpose:
Convert HTML to markdown or plaintext using html2text and markdownify.
"""
from __future__ import annotations
[docs]
def html_to_markdown(html: str) -> str:
    """Convert HTML to Markdown using markdownify.

    ``<script>`` and ``<style>`` elements are removed together with their
    contents before conversion. markdownify's ``strip`` option only drops the
    listed tags and keeps the text they enclose, so relying on it alone lets
    JavaScript and CSS source leak into the Markdown output.

    Args:
        html: Raw HTML string.

    Returns:
        Markdown-formatted text with ATX-style (``#``) headings.
    """
    import re

    from markdownify import markdownify  # type: ignore[import-untyped]

    # The lookahead keeps the match to real <script>/<style> tags (not e.g.
    # <script-loader>); DOTALL lets the lazy body span multiple lines, and the
    # back-reference pairs each opening tag with its own closing tag.
    without_embedded_code = re.sub(
        r"<(script|style)(?=[\s/>])[^>]*>.*?</\1\s*>",
        "",
        html,
        flags=re.IGNORECASE | re.DOTALL,
    )
    # ``strip`` is kept so that stray or unclosed script/style tags, which the
    # regex above does not match, are still handled as they were before.
    return markdownify(
        without_embedded_code,
        heading_style="ATX",
        strip=["script", "style"],
    )
[docs]
def html_to_plaintext(html: str) -> str:
    """Render HTML as plaintext via html2text.

    The converter is configured to keep links, ignore images, and use a body
    width of 0.

    Args:
        html: Raw HTML string.

    Returns:
        Plaintext with basic formatting preserved.
    """
    import html2text  # type: ignore[import-untyped]

    renderer = html2text.HTML2Text()
    # The three options are independent of each other; all are set before
    # the single handle() call.
    renderer.body_width = 0
    renderer.ignore_images = True
    renderer.ignore_links = False

    plaintext = renderer.handle(html)
    return plaintext