Source code for pyfetcher.extractors.media_meta

"""Media file metadata extraction for :mod:`pyfetcher.extractors`.

Purpose:
    Extract metadata from media files: audio (mutagen), video (pymediainfo),
    images (exifread), and PDFs (pypdf). Returns a unified dict.
"""

from __future__ import annotations

from pathlib import Path
from typing import Any



[docs]
def extract_media_metadata(file_path: str | Path) -> dict[str, Any]:
    """Extract metadata from a media file based on its type.

    Dispatches to the appropriate library based on file extension.

    Args:
        file_path: Path to the media file.

    Returns:
        A dictionary of extracted metadata.
    """
    path = Path(file_path)
    ext = path.suffix.lower()

    if ext in {".mp3", ".flac", ".ogg", ".m4a", ".wma", ".wav", ".aac"}:
        return _extract_audio(path)
    elif ext in {".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv"}:
        return _extract_video(path)
    elif ext in {".jpg", ".jpeg", ".png", ".gif", ".tiff", ".bmp", ".webp"}:
        return _extract_image(path)
    elif ext == ".pdf":
        return _extract_pdf(path)
    return {"type": "unknown", "extension": ext}



def _extract_audio(path: Path) -> dict[str, Any]:
    """Extract audio metadata with mutagen."""
    try:
        import mutagen  # type: ignore[import-untyped]

        audio = mutagen.File(str(path))
        if audio is None:
            return {"type": "audio", "error": "unrecognized format"}
        info: dict[str, Any] = {"type": "audio"}
        if hasattr(audio, "info"):
            info["length_seconds"] = getattr(audio.info, "length", None)
            info["bitrate"] = getattr(audio.info, "bitrate", None)
            info["sample_rate"] = getattr(audio.info, "sample_rate", None)
            info["channels"] = getattr(audio.info, "channels", None)
        if hasattr(audio, "tags") and audio.tags:
            info["tags"] = {str(k): str(v) for k, v in audio.tags.items()}
        return info
    except Exception as e:
        return {"type": "audio", "error": str(e)}


def _extract_video(path: Path) -> dict[str, Any]:
    """Extract video metadata with pymediainfo."""
    try:
        from pymediainfo import MediaInfo  # type: ignore[import-untyped]

        mi = MediaInfo.parse(str(path))
        info: dict[str, Any] = {"type": "video", "tracks": []}
        for track in mi.tracks:
            info["tracks"].append({k: v for k, v in track.to_data().items() if v is not None})
        return info
    except Exception as e:
        return {"type": "video", "error": str(e)}


def _extract_image(path: Path) -> dict[str, Any]:
    """Extract image EXIF metadata with exifread."""
    try:
        import exifread  # type: ignore[import-untyped]

        with path.open("rb") as f:
            tags = exifread.process_file(f, details=False)
        return {"type": "image", "exif": {str(k): str(v) for k, v in tags.items()}}
    except Exception as e:
        return {"type": "image", "error": str(e)}


def _extract_pdf(path: Path) -> dict[str, Any]:
    """Extract PDF metadata with pypdf."""
    try:
        from pypdf import PdfReader  # type: ignore[import-untyped]

        reader = PdfReader(str(path))
        meta = reader.metadata
        info: dict[str, Any] = {
            "type": "pdf",
            "pages": len(reader.pages),
        }
        if meta:
            info["title"] = meta.get("/Title")
            info["author"] = meta.get("/Author")
            info["subject"] = meta.get("/Subject")
            info["creator"] = meta.get("/Creator")
        return info
    except Exception as e:
        return {"type": "pdf", "error": str(e)}