Initial commit
This commit is contained in:
315
backend/app/services/extractor.py
Normal file
315
backend/app/services/extractor.py
Normal file
@@ -0,0 +1,315 @@
|
||||
"""Document extraction service for text indexing, previews, and archive fan-out."""
|
||||
|
||||
import io
|
||||
import re
|
||||
import zipfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import magic
|
||||
from docx import Document as DocxDocument
|
||||
from openpyxl import load_workbook
|
||||
from PIL import Image, ImageOps
|
||||
from pypdf import PdfReader
|
||||
import pymupdf
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.services.handwriting import (
|
||||
IMAGE_TEXT_TYPE_NO_TEXT,
|
||||
IMAGE_TEXT_TYPE_UNKNOWN,
|
||||
HandwritingTranscriptionError,
|
||||
HandwritingTranscriptionNotConfiguredError,
|
||||
HandwritingTranscriptionTimeoutError,
|
||||
classify_image_text_bytes,
|
||||
transcribe_handwriting_bytes,
|
||||
)
|
||||
|
||||
|
||||
# Cached application settings (extraction size limits, zip fan-out limits, ...).
settings = get_settings()


# Image formats routed through preview generation and printed-vs-handwriting
# classification instead of plain text decoding.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".bmp",
    ".gif",
    ".webp",
    ".heic",
}

# Every extension eligible for text extraction; additional plain-text files are
# also accepted by MIME sniff in is_supported_for_extraction.
SUPPORTED_TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".csv",
    ".json",
    ".xml",
    ".svg",
    ".pdf",
    ".docx",
    ".xlsx",
    *IMAGE_EXTENSIONS,
}
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Represents output generated during extraction for a single file."""

    # Normalized extracted text; empty when nothing was extracted.
    text: str
    # Optional preview image bytes and the matching file suffix (e.g. ".jpg").
    preview_bytes: bytes | None
    preview_suffix: str | None
    # Outcome label; this module produces "processed", "unsupported", or "error".
    status: str
    # Extra extraction details (classification labels, provider/model names,
    # error reasons). Defaults to an empty dict per instance.
    metadata_json: dict[str, object] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ArchiveMember:
    """Represents an extracted file entry from an archive."""

    # Member path as stored inside the archive.
    name: str
    # Raw, fully-read member contents.
    data: bytes
|
||||
|
||||
|
||||
def sniff_mime(data: bytes) -> str:
    """Detects the MIME type of raw bytes via libmagic.

    Falls back to the generic binary type when libmagic returns nothing.
    """

    detected = magic.from_buffer(data, mime=True)
    if detected:
        return detected
    return "application/octet-stream"
|
||||
|
||||
|
||||
def is_supported_for_extraction(extension: str, mime_type: str) -> bool:
    """Determines if a file should be text-processed for indexing and classification.

    A file qualifies either by its extension or by a sniffed ``text/*`` MIME type.
    """

    if extension in SUPPORTED_TEXT_EXTENSIONS:
        return True
    return mime_type.startswith("text/")
|
||||
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
"""Normalizes extracted text by removing repeated form separators and controls."""
|
||||
|
||||
cleaned = text.replace("\r", "\n").replace("\x00", "")
|
||||
lines: list[str] = []
|
||||
for line in cleaned.split("\n"):
|
||||
stripped = line.strip()
|
||||
if stripped and re.fullmatch(r"[.\-_*=~\s]{4,}", stripped):
|
||||
continue
|
||||
lines.append(line)
|
||||
|
||||
normalized = "\n".join(lines)
|
||||
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
|
||||
return normalized.strip()
|
||||
|
||||
|
||||
def _extract_pdf_text(data: bytes) -> str:
    """Extracts text from PDF bytes using pypdf page parsing.

    Pages with no extractable text contribute an empty string so page
    boundaries still produce newlines in the joined output.
    """

    reader = PdfReader(io.BytesIO(data))
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return _normalize_text("\n".join(page_texts))
|
||||
|
||||
|
||||
def _extract_pdf_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Creates a JPEG thumbnail preview from the first PDF page.

    Returns ``(None, None)`` for unreadable documents, empty documents, or any
    rendering failure — previews are strictly best-effort.
    """

    try:
        document = pymupdf.open(stream=data, filetype="pdf")
    except Exception:
        # Corrupt or unparseable PDFs simply yield no preview.
        return None, None

    try:
        if document.page_count >= 1:
            first_page = document.load_page(0)
            # 1.5x zoom gives a readable thumbnail without a huge render.
            pixmap = first_page.get_pixmap(matrix=pymupdf.Matrix(1.5, 1.5), alpha=False)
            return pixmap.tobytes("jpeg"), ".jpg"
        return None, None
    except Exception:
        return None, None
    finally:
        document.close()
|
||||
|
||||
|
||||
def _extract_docx_text(data: bytes) -> str:
    """Extracts paragraph text from DOCX content, skipping empty paragraphs."""

    document = DocxDocument(io.BytesIO(data))
    non_empty = (paragraph.text for paragraph in document.paragraphs if paragraph.text)
    return _normalize_text("\n".join(non_empty))
|
||||
|
||||
|
||||
def _extract_xlsx_text(data: bytes) -> str:
    """Extracts cell text from XLSX workbook sheets for indexing.

    Reads at most 200 rows per sheet to bound work on very large workbooks.
    The workbook is explicitly closed in a ``finally`` block: read-only
    workbooks keep the underlying ZIP handle open until ``close()`` is called,
    so omitting it leaks a file handle per extraction.
    """

    workbook = load_workbook(io.BytesIO(data), data_only=True, read_only=True)
    try:
        chunks: list[str] = []
        for sheet in workbook.worksheets:
            # Sheet titles are indexed too — they often carry useful context.
            chunks.append(sheet.title)
            row_count = 0
            for row in sheet.iter_rows(min_row=1, max_row=200):
                row_values = [str(cell.value) for cell in row if cell.value is not None]
                if row_values:
                    chunks.append(" ".join(row_values))
                row_count += 1
                if row_count >= 200:
                    break
        return _normalize_text("\n".join(chunks))
    finally:
        workbook.close()
|
||||
|
||||
|
||||
def _build_image_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Builds a JPEG preview thumbnail (max 600x600) for image files.

    Applies EXIF orientation before resizing; any decode/encode failure
    yields ``(None, None)`` since previews are best-effort.
    """

    output = io.BytesIO()
    try:
        with Image.open(io.BytesIO(data)) as source:
            oriented = ImageOps.exif_transpose(source).convert("RGB")
            oriented.thumbnail((600, 600))
            oriented.save(output, format="JPEG", optimize=True, quality=82)
    except Exception:
        return None, None
    return output.getvalue(), ".jpg"
|
||||
|
||||
|
||||
def _extract_handwriting_text(data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text from image bytes and records handwriting-vs-printed classification metadata.

    Flow:
      1. Build a JPEG preview (best effort).
      2. Classify whether the image contains text at all.
      3. If text may be present, transcribe it via the handwriting service.

    A missing provider configuration marks the file ``unsupported``;
    classification timeouts/errors are non-fatal (type falls back to
    "unknown" and transcription is still attempted); transcription
    timeouts/errors produce an ``error`` result.

    Fix vs. original: the timeout and generic-error except clauses were exact
    duplicates in both try blocks — each pair is merged into one tuple clause
    with identical handling.
    """

    preview_bytes, preview_suffix = _build_image_preview(data)
    metadata_json: dict[str, object] = {}

    try:
        text_type = classify_image_text_bytes(data, mime_type=mime_type)
        metadata_json = {
            "image_text_type": text_type.label,
            "image_text_type_confidence": text_type.confidence,
            "image_text_type_provider": text_type.provider,
            "image_text_type_model": text_type.model,
        }
    except HandwritingTranscriptionNotConfiguredError as error:
        # Without a configured provider we can neither classify nor transcribe.
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={"transcription_error": str(error), "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Classification is best-effort: record the failure and still try transcription.
        metadata_json = {
            "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN,
            "image_text_type_error": str(error),
        }

    if metadata_json.get("image_text_type") == IMAGE_TEXT_TYPE_NO_TEXT:
        # No text detected: skip the (expensive) transcription call entirely.
        metadata_json["transcription_skipped"] = "no_text_detected"
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="processed",
            metadata_json=metadata_json,
        )

    try:
        transcription = transcribe_handwriting_bytes(data, mime_type=mime_type)
        transcription_metadata: dict[str, object] = {
            "transcription_provider": transcription.provider,
            "transcription_model": transcription.model,
            "transcription_uncertainties": transcription.uncertainties,
        }
        return ExtractionResult(
            text=_normalize_text(transcription.text),
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="processed",
            metadata_json={**metadata_json, **transcription_metadata},
        )
    except HandwritingTranscriptionNotConfiguredError as error:
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Timeouts and provider failures both surface as extraction errors.
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="error",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
|
||||
|
||||
|
||||
def extract_text_content(filename: str, data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text and optional preview bytes for supported file types.

    Dispatches on the file extension (with a MIME fallback for plain text);
    images short-circuit into the handwriting pipeline. Unsupported formats
    and extraction exceptions are reported via the result status rather than
    raised to the caller. Extracted text is truncated to the configured limit.
    """

    suffix = Path(filename).suffix.lower()
    plain_text_suffixes = {".txt", ".md", ".csv", ".json", ".xml", ".svg"}

    extracted = ""
    preview: bytes | None = None
    preview_suffix: str | None = None

    try:
        if suffix == ".pdf":
            extracted = _extract_pdf_text(data)
            preview, preview_suffix = _extract_pdf_preview(data)
        elif suffix in plain_text_suffixes or mime_type.startswith("text/"):
            extracted = _normalize_text(data.decode("utf-8", errors="ignore"))
        elif suffix == ".docx":
            extracted = _extract_docx_text(data)
        elif suffix == ".xlsx":
            extracted = _extract_xlsx_text(data)
        elif suffix in IMAGE_EXTENSIONS:
            # Images go through classification + handwriting transcription.
            return _extract_handwriting_text(data=data, mime_type=mime_type)
        else:
            return ExtractionResult(
                text="",
                preview_bytes=None,
                preview_suffix=None,
                status="unsupported",
                metadata_json={"reason": "unsupported_format"},
            )
    except Exception as error:
        # Boundary handler: any extraction failure is reported, not raised.
        return ExtractionResult(
            text="",
            preview_bytes=None,
            preview_suffix=None,
            status="error",
            metadata_json={"reason": "extraction_exception", "error": str(error)},
        )

    return ExtractionResult(
        text=extracted[: settings.max_text_length],
        preview_bytes=preview,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={},
    )
|
||||
|
||||
|
||||
def extract_archive_members(data: bytes, depth: int = 0) -> list[ArchiveMember]:
    """Extracts processable members from zip archives with configurable depth limits.

    Returns an empty list once ``depth`` exceeds the configured maximum; the
    caller drives recursion into nested archives. Directory entries are
    skipped and at most ``settings.max_zip_members`` files are read.
    """

    if depth > settings.max_zip_depth:
        return []

    collected: list[ArchiveMember] = []
    with zipfile.ZipFile(io.BytesIO(data)) as archive:
        file_entries = [entry for entry in archive.infolist() if not entry.is_dir()]
        for entry in file_entries[: settings.max_zip_members]:
            collected.append(ArchiveMember(name=entry.filename, data=archive.read(entry.filename)))
    return collected
|
||||
Reference in New Issue
Block a user