"""Document extraction service for text indexing, previews, and archive fan-out.""" import io import re import zipfile from dataclasses import dataclass, field from pathlib import Path import magic from docx import Document as DocxDocument from openpyxl import load_workbook from PIL import Image, ImageOps from pypdf import PdfReader import pymupdf from app.core.config import get_settings from app.services.handwriting import ( IMAGE_TEXT_TYPE_NO_TEXT, IMAGE_TEXT_TYPE_UNKNOWN, HandwritingTranscriptionError, HandwritingTranscriptionNotConfiguredError, HandwritingTranscriptionTimeoutError, classify_image_text_bytes, transcribe_handwriting_bytes, ) settings = get_settings() IMAGE_EXTENSIONS = { ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp", ".heic", } SUPPORTED_TEXT_EXTENSIONS = { ".txt", ".md", ".csv", ".json", ".xml", ".svg", ".pdf", ".docx", ".xlsx", *IMAGE_EXTENSIONS, } @dataclass class ExtractionResult: """Represents output generated during extraction for a single file.""" text: str preview_bytes: bytes | None preview_suffix: str | None status: str metadata_json: dict[str, object] = field(default_factory=dict) @dataclass class ArchiveMember: """Represents an extracted file entry from an archive.""" name: str data: bytes def sniff_mime(data: bytes) -> str: """Detects MIME type using libmagic for robust format handling.""" return magic.from_buffer(data, mime=True) or "application/octet-stream" def is_supported_for_extraction(extension: str, mime_type: str) -> bool: """Determines if a file should be text-processed for indexing and classification.""" return extension in SUPPORTED_TEXT_EXTENSIONS or mime_type.startswith("text/") def _normalize_text(text: str) -> str: """Normalizes extracted text by removing repeated form separators and controls.""" cleaned = text.replace("\r", "\n").replace("\x00", "") lines: list[str] = [] for line in cleaned.split("\n"): stripped = line.strip() if stripped and re.fullmatch(r"[.\-_*=~\s]{4,}", stripped): continue lines.append(line) normalized = "\n".join(lines) normalized = re.sub(r"\n{3,}", "\n\n", normalized) return normalized.strip() def _extract_pdf_text(data: bytes) -> str: """Extracts text from PDF bytes using pypdf page parsing.""" reader = PdfReader(io.BytesIO(data)) pages: list[str] = [] for page in reader.pages: pages.append(page.extract_text() or "") return _normalize_text("\n".join(pages)) def _extract_pdf_preview(data: bytes) -> tuple[bytes | None, str | None]: """Creates a JPEG thumbnail preview from the first PDF page.""" try: document = pymupdf.open(stream=data, filetype="pdf") except Exception: return None, None try: if document.page_count < 1: return None, None page = document.load_page(0) pixmap = page.get_pixmap(matrix=pymupdf.Matrix(1.5, 1.5), alpha=False) return pixmap.tobytes("jpeg"), ".jpg" except Exception: return None, None finally: document.close() def _extract_docx_text(data: bytes) -> str: """Extracts paragraph text from DOCX content.""" document = DocxDocument(io.BytesIO(data)) return _normalize_text("\n".join(paragraph.text for paragraph in document.paragraphs if paragraph.text)) def _extract_xlsx_text(data: bytes) -> str: """Extracts cell text from XLSX workbook sheets for indexing.""" workbook = load_workbook(io.BytesIO(data), data_only=True, read_only=True) chunks: list[str] = [] for sheet in workbook.worksheets: chunks.append(sheet.title) row_count = 0 for row in sheet.iter_rows(min_row=1, max_row=200): row_values = [str(cell.value) for cell in row if cell.value is not None] if row_values: chunks.append(" ".join(row_values)) row_count += 1 if row_count >= 200: break return _normalize_text("\n".join(chunks)) def _build_image_preview(data: bytes) -> tuple[bytes | None, str | None]: """Builds a JPEG preview thumbnail for image files.""" try: with Image.open(io.BytesIO(data)) as image: preview = ImageOps.exif_transpose(image).convert("RGB") preview.thumbnail((600, 600)) output = io.BytesIO() preview.save(output, format="JPEG", optimize=True, quality=82) return output.getvalue(), ".jpg" except Exception: return None, None def _extract_handwriting_text(data: bytes, mime_type: str) -> ExtractionResult: """Extracts text from image bytes and records handwriting-vs-printed classification metadata.""" preview_bytes, preview_suffix = _build_image_preview(data) metadata_json: dict[str, object] = {} try: text_type = classify_image_text_bytes(data, mime_type=mime_type) metadata_json = { "image_text_type": text_type.label, "image_text_type_confidence": text_type.confidence, "image_text_type_provider": text_type.provider, "image_text_type_model": text_type.model, } except HandwritingTranscriptionNotConfiguredError as error: return ExtractionResult( text="", preview_bytes=preview_bytes, preview_suffix=preview_suffix, status="unsupported", metadata_json={"transcription_error": str(error), "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN}, ) except HandwritingTranscriptionTimeoutError as error: metadata_json = { "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN, "image_text_type_error": str(error), } except HandwritingTranscriptionError as error: metadata_json = { "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN, "image_text_type_error": str(error), } if metadata_json.get("image_text_type") == IMAGE_TEXT_TYPE_NO_TEXT: metadata_json["transcription_skipped"] = "no_text_detected" return ExtractionResult( text="", preview_bytes=preview_bytes, preview_suffix=preview_suffix, status="processed", metadata_json=metadata_json, ) try: transcription = transcribe_handwriting_bytes(data, mime_type=mime_type) transcription_metadata: dict[str, object] = { "transcription_provider": transcription.provider, "transcription_model": transcription.model, "transcription_uncertainties": transcription.uncertainties, } return ExtractionResult( text=_normalize_text(transcription.text), preview_bytes=preview_bytes, preview_suffix=preview_suffix, status="processed", metadata_json={**metadata_json, **transcription_metadata}, ) except HandwritingTranscriptionNotConfiguredError as error: return ExtractionResult( text="", preview_bytes=preview_bytes, preview_suffix=preview_suffix, status="unsupported", metadata_json={**metadata_json, "transcription_error": str(error)}, ) except HandwritingTranscriptionTimeoutError as error: return ExtractionResult( text="", preview_bytes=preview_bytes, preview_suffix=preview_suffix, status="error", metadata_json={**metadata_json, "transcription_error": str(error)}, ) except HandwritingTranscriptionError as error: return ExtractionResult( text="", preview_bytes=preview_bytes, preview_suffix=preview_suffix, status="error", metadata_json={**metadata_json, "transcription_error": str(error)}, ) def extract_text_content(filename: str, data: bytes, mime_type: str) -> ExtractionResult: """Extracts text and optional preview bytes for supported file types.""" extension = Path(filename).suffix.lower() text = "" preview_bytes: bytes | None = None preview_suffix: str | None = None try: if extension == ".pdf": text = _extract_pdf_text(data) preview_bytes, preview_suffix = _extract_pdf_preview(data) elif extension in {".txt", ".md", ".csv", ".json", ".xml", ".svg"} or mime_type.startswith("text/"): text = _normalize_text(data.decode("utf-8", errors="ignore")) elif extension == ".docx": text = _extract_docx_text(data) elif extension == ".xlsx": text = _extract_xlsx_text(data) elif extension in IMAGE_EXTENSIONS: return _extract_handwriting_text(data=data, mime_type=mime_type) else: return ExtractionResult( text="", preview_bytes=None, preview_suffix=None, status="unsupported", metadata_json={"reason": "unsupported_format"}, ) except Exception as error: return ExtractionResult( text="", preview_bytes=None, preview_suffix=None, status="error", metadata_json={"reason": "extraction_exception", "error": str(error)}, ) return ExtractionResult( text=text[: settings.max_text_length], preview_bytes=preview_bytes, preview_suffix=preview_suffix, status="processed", metadata_json={}, ) def extract_archive_members(data: bytes, depth: int = 0) -> list[ArchiveMember]: """Extracts processable ZIP members within configured decompression safety budgets.""" members: list[ArchiveMember] = [] if depth > settings.max_zip_depth: return members total_uncompressed_bytes = 0 try: with zipfile.ZipFile(io.BytesIO(data)) as archive: infos = [info for info in archive.infolist() if not info.is_dir()][: settings.max_zip_members] for info in infos: if info.file_size <= 0: continue if info.file_size > settings.max_zip_member_uncompressed_bytes: continue if total_uncompressed_bytes + info.file_size > settings.max_zip_total_uncompressed_bytes: continue compressed_size = max(1, int(info.compress_size)) compression_ratio = float(info.file_size) / float(compressed_size) if compression_ratio > settings.max_zip_compression_ratio: continue with archive.open(info, mode="r") as archive_member: member_data = archive_member.read(settings.max_zip_member_uncompressed_bytes + 1) if len(member_data) > settings.max_zip_member_uncompressed_bytes: continue if total_uncompressed_bytes + len(member_data) > settings.max_zip_total_uncompressed_bytes: continue total_uncompressed_bytes += len(member_data) members.append(ArchiveMember(name=info.filename, data=member_data)) except zipfile.BadZipFile: return [] return members