346 lines
11 KiB
Python
346 lines
11 KiB
Python
"""Document extraction service for text indexing, previews, and archive fan-out."""
|
|
|
|
import io
|
|
import re
|
|
import zipfile
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
import magic
|
|
from docx import Document as DocxDocument
|
|
from openpyxl import load_workbook
|
|
from PIL import Image, ImageOps
|
|
from pypdf import PdfReader
|
|
import pymupdf
|
|
|
|
from app.core.config import get_settings
|
|
from app.services.handwriting import (
|
|
IMAGE_TEXT_TYPE_NO_TEXT,
|
|
IMAGE_TEXT_TYPE_UNKNOWN,
|
|
HandwritingTranscriptionError,
|
|
HandwritingTranscriptionNotConfiguredError,
|
|
HandwritingTranscriptionTimeoutError,
|
|
classify_image_text_bytes,
|
|
transcribe_handwriting_bytes,
|
|
)
|
|
|
|
|
|
# Cached application settings (text-length cap, ZIP guardrail limits).
settings = get_settings()


# Raster image formats routed through the handwriting classification/transcription pipeline.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".bmp",
    ".gif",
    ".webp",
    ".heic",
}

# All extensions eligible for text extraction, including every image format above.
SUPPORTED_TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".csv",
    ".json",
    ".xml",
    ".svg",
    ".pdf",
    ".docx",
    ".xlsx",
    *IMAGE_EXTENSIONS,
}
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Represents output generated during extraction for a single file."""

    # Normalized extracted text ("" when nothing could be extracted).
    text: str
    # JPEG thumbnail bytes for previews, or None when no preview was built.
    preview_bytes: bytes | None
    # File suffix matching preview_bytes (e.g. ".jpg"), or None.
    preview_suffix: str | None
    # Outcome marker: "processed", "unsupported", or "error".
    status: str
    # Classification/transcription details and error breadcrumbs for diagnostics.
    metadata_json: dict[str, object] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ArchiveMember:
    """Represents an extracted file entry from an archive."""

    # Member path as stored inside the archive.
    name: str
    # Fully decompressed member bytes.
    data: bytes
|
|
|
|
|
|
def sniff_mime(data: bytes) -> str:
    """Detects MIME type using libmagic for robust format handling."""

    detected = magic.from_buffer(data, mime=True)
    if detected:
        return detected
    # libmagic returned an empty/falsy result; fall back to the generic type.
    return "application/octet-stream"
|
|
|
|
|
|
def is_supported_for_extraction(extension: str, mime_type: str) -> bool:
    """Determines if a file should be text-processed for indexing and classification."""

    # Any text/* MIME type is processable regardless of extension.
    if mime_type.startswith("text/"):
        return True
    return extension in SUPPORTED_TEXT_EXTENSIONS
|
|
|
|
|
|
def _normalize_text(text: str) -> str:
|
|
"""Normalizes extracted text by removing repeated form separators and controls."""
|
|
|
|
cleaned = text.replace("\r", "\n").replace("\x00", "")
|
|
lines: list[str] = []
|
|
for line in cleaned.split("\n"):
|
|
stripped = line.strip()
|
|
if stripped and re.fullmatch(r"[.\-_*=~\s]{4,}", stripped):
|
|
continue
|
|
lines.append(line)
|
|
|
|
normalized = "\n".join(lines)
|
|
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
|
|
return normalized.strip()
|
|
|
|
|
|
def _extract_pdf_text(data: bytes) -> str:
    """Extracts text from PDF bytes using pypdf page parsing."""

    reader = PdfReader(io.BytesIO(data))
    # Pages with no extractable text contribute an empty string.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return _normalize_text("\n".join(page_texts))
|
|
|
|
|
|
def _extract_pdf_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Creates a JPEG thumbnail preview from the first PDF page.

    Returns (jpeg_bytes, ".jpg"), or (None, None) when the document cannot
    be opened, is empty, or rendering fails.
    """

    try:
        document = pymupdf.open(stream=data, filetype="pdf")
    except Exception:
        # Not a parseable PDF; previews are best-effort.
        return None, None

    try:
        if document.page_count < 1:
            return None, None
        first_page = document.load_page(0)
        # 1.5x zoom gives a reasonably crisp thumbnail without huge output.
        zoom = pymupdf.Matrix(1.5, 1.5)
        rendered = first_page.get_pixmap(matrix=zoom, alpha=False)
        return rendered.tobytes("jpeg"), ".jpg"
    except Exception:
        return None, None
    finally:
        document.close()
|
|
|
|
|
|
def _extract_docx_text(data: bytes) -> str:
    """Extracts paragraph text from DOCX content."""

    document = DocxDocument(io.BytesIO(data))
    # Keep only non-empty paragraphs, one per line.
    paragraphs = [paragraph.text for paragraph in document.paragraphs if paragraph.text]
    return _normalize_text("\n".join(paragraphs))
|
|
|
|
|
|
def _extract_xlsx_text(data: bytes) -> str:
    """Extracts cell text from XLSX workbook sheets for indexing.

    Sheet titles are included so they are searchable; at most the first 200
    rows (and at most 200 populated rows) are read per sheet to bound work
    on huge workbooks.
    """

    workbook = load_workbook(io.BytesIO(data), data_only=True, read_only=True)
    try:
        chunks: list[str] = []
        for sheet in workbook.worksheets:
            chunks.append(sheet.title)
            populated_rows = 0
            for row in sheet.iter_rows(min_row=1, max_row=200):
                row_values = [str(cell.value) for cell in row if cell.value is not None]
                if row_values:
                    chunks.append(" ".join(row_values))
                    populated_rows += 1
                if populated_rows >= 200:
                    break
        return _normalize_text("\n".join(chunks))
    finally:
        # Read-only workbooks keep their source stream open until explicitly
        # closed (openpyxl read-only mode); the original leaked this handle.
        workbook.close()
|
|
|
|
|
|
def _build_image_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Builds a JPEG preview thumbnail for image files.

    Returns (jpeg_bytes, ".jpg"), or (None, None) when the bytes cannot be
    decoded or re-encoded.
    """

    try:
        with Image.open(io.BytesIO(data)) as source:
            # Honor EXIF orientation, then force RGB so JPEG encoding works.
            oriented = ImageOps.exif_transpose(source).convert("RGB")
            oriented.thumbnail((600, 600))
            buffer = io.BytesIO()
            oriented.save(buffer, format="JPEG", optimize=True, quality=82)
    except Exception:
        return None, None
    return buffer.getvalue(), ".jpg"
|
|
|
|
|
|
def _extract_handwriting_text(data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text from image bytes and records handwriting-vs-printed classification metadata.

    Pipeline: build a preview thumbnail, classify the image text type, then
    transcribe unless classification reported no text. Classification
    failures are soft (recorded in metadata, transcription still attempted),
    except a missing provider configuration, which short-circuits with
    status "unsupported". Transcription failures produce status "error".
    """

    preview_bytes, preview_suffix = _build_image_preview(data)
    metadata_json: dict[str, object] = {}

    try:
        text_type = classify_image_text_bytes(data, mime_type=mime_type)
        metadata_json = {
            "image_text_type": text_type.label,
            "image_text_type_confidence": text_type.confidence,
            "image_text_type_provider": text_type.provider,
            "image_text_type_model": text_type.model,
        }
    except HandwritingTranscriptionNotConfiguredError as error:
        # No provider configured: transcription below could not run either.
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={"transcription_error": str(error), "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Classification is best-effort: record the failure and still try
        # transcription. (The original had two byte-identical except blocks
        # here; they are merged into one tuple catch.)
        metadata_json = {
            "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN,
            "image_text_type_error": str(error),
        }

    if metadata_json.get("image_text_type") == IMAGE_TEXT_TYPE_NO_TEXT:
        # Classifier reported no text: skip the expensive transcription call.
        metadata_json["transcription_skipped"] = "no_text_detected"
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="processed",
            metadata_json=metadata_json,
        )

    try:
        transcription = transcribe_handwriting_bytes(data, mime_type=mime_type)
    except HandwritingTranscriptionNotConfiguredError as error:
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Timeouts and provider errors are both hard failures for transcription
        # (previously two identical except blocks, merged into one).
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="error",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )

    transcription_metadata: dict[str, object] = {
        "transcription_provider": transcription.provider,
        "transcription_model": transcription.model,
        "transcription_uncertainties": transcription.uncertainties,
    }
    return ExtractionResult(
        text=_normalize_text(transcription.text),
        preview_bytes=preview_bytes,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={**metadata_json, **transcription_metadata},
    )
|
|
|
|
|
|
def extract_text_content(filename: str, data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text and optional preview bytes for supported file types.

    Dispatches on the filename extension (falling back to the sniffed MIME
    type for plain text). Returns status "unsupported" for unknown formats
    and "error" when a parser raises. Extracted text is capped at
    ``settings.max_text_length`` characters on every path, including the
    image/handwriting path (which previously returned untruncated text).
    """

    extension = Path(filename).suffix.lower()
    text = ""
    preview_bytes: bytes | None = None
    preview_suffix: str | None = None

    try:
        if extension == ".pdf":
            text = _extract_pdf_text(data)
            preview_bytes, preview_suffix = _extract_pdf_preview(data)
        elif extension in {".txt", ".md", ".csv", ".json", ".xml", ".svg"} or mime_type.startswith("text/"):
            text = _normalize_text(data.decode("utf-8", errors="ignore"))
        elif extension == ".docx":
            text = _extract_docx_text(data)
        elif extension == ".xlsx":
            text = _extract_xlsx_text(data)
        elif extension in IMAGE_EXTENSIONS:
            result = _extract_handwriting_text(data=data, mime_type=mime_type)
            # Apply the same length cap as every other extraction path.
            result.text = result.text[: settings.max_text_length]
            return result
        else:
            return ExtractionResult(
                text="",
                preview_bytes=None,
                preview_suffix=None,
                status="unsupported",
                metadata_json={"reason": "unsupported_format"},
            )
    except Exception as error:
        # Boundary catch: a corrupt file must not crash the indexing pipeline.
        return ExtractionResult(
            text="",
            preview_bytes=None,
            preview_suffix=None,
            status="error",
            metadata_json={"reason": "extraction_exception", "error": str(error)},
        )

    return ExtractionResult(
        text=text[: settings.max_text_length],
        preview_bytes=preview_bytes,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={},
    )
|
|
|
|
|
|
def extract_archive_members(data: bytes, depth: int = 0, max_members: int | None = None) -> list[ArchiveMember]:
    """Extracts processable ZIP members with depth-aware and decompression safety guardrails.

    Guardrails: a nesting-depth ceiling, a member-count cap, per-member and
    total uncompressed-size budgets, and a compression-ratio ceiling to
    defuse zip bombs. Members failing any guard are skipped, not fatal.

    Args:
        data: Raw ZIP archive bytes.
        depth: Current nesting depth (0 = top-level archive).
        max_members: Optional caller cap, clamped to the configured limit.

    Returns:
        Extracted members; empty for invalid archives or exceeded depth.
    """

    members: list[ArchiveMember] = []
    normalized_depth = max(0, depth)
    if normalized_depth >= settings.max_zip_depth:
        return members

    member_limit = settings.max_zip_members
    if max_members is not None:
        member_limit = max(0, min(settings.max_zip_members, int(max_members)))
    if member_limit <= 0:
        return members

    total_uncompressed_bytes = 0
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as archive:
            infos = [info for info in archive.infolist() if not info.is_dir()][:member_limit]
            for info in infos:
                if info.file_size <= 0:
                    continue
                if info.file_size > settings.max_zip_member_uncompressed_bytes:
                    continue
                if total_uncompressed_bytes + info.file_size > settings.max_zip_total_uncompressed_bytes:
                    continue

                # Reject implausibly high compression ratios (zip-bomb signal).
                compressed_size = max(1, int(info.compress_size))
                compression_ratio = float(info.file_size) / float(compressed_size)
                if compression_ratio > settings.max_zip_compression_ratio:
                    continue

                try:
                    with archive.open(info, mode="r") as archive_member:
                        # Read one byte past the cap so a lying size header
                        # is detectable below.
                        member_data = archive_member.read(settings.max_zip_member_uncompressed_bytes + 1)
                except (zipfile.BadZipFile, RuntimeError):
                    # Corrupt (BadZipFile) or encrypted (RuntimeError) members
                    # are skipped instead of aborting the whole archive, which
                    # previously discarded every member extracted so far.
                    continue
                if len(member_data) > settings.max_zip_member_uncompressed_bytes:
                    continue
                if total_uncompressed_bytes + len(member_data) > settings.max_zip_total_uncompressed_bytes:
                    continue

                total_uncompressed_bytes += len(member_data)
                members.append(ArchiveMember(name=info.filename, data=member_data))
    except zipfile.BadZipFile:
        # Not a valid ZIP container at all.
        return []

    return members
|