346 lines
11 KiB
Python
346 lines
11 KiB
Python
"""Document extraction service for text indexing, previews, and archive fan-out."""
|
|
|
|
import io
|
|
import re
|
|
import zipfile
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
import magic
|
|
from docx import Document as DocxDocument
|
|
from openpyxl import load_workbook
|
|
from PIL import Image, ImageOps
|
|
from pypdf import PdfReader
|
|
import pymupdf
|
|
|
|
from app.core.config import get_settings
|
|
from app.services.handwriting import (
|
|
IMAGE_TEXT_TYPE_NO_TEXT,
|
|
IMAGE_TEXT_TYPE_UNKNOWN,
|
|
HandwritingTranscriptionError,
|
|
HandwritingTranscriptionNotConfiguredError,
|
|
HandwritingTranscriptionTimeoutError,
|
|
classify_image_text_bytes,
|
|
transcribe_handwriting_bytes,
|
|
)
|
|
|
|
|
|
# Cached application settings (text-length cap, ZIP guardrail limits).
settings = get_settings()


# Raster image formats routed through the handwriting classification/transcription pipeline.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".bmp",
    ".gif",
    ".webp",
    ".heic",
}

# All extensions eligible for text extraction, including every image format above.
SUPPORTED_TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".csv",
    ".json",
    ".xml",
    ".svg",
    ".pdf",
    ".docx",
    ".xlsx",
    *IMAGE_EXTENSIONS,
}
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Represents output generated during extraction for a single file."""

    # Normalized extracted text ("" when nothing could be extracted).
    text: str
    # JPEG thumbnail bytes for previews, or None when no preview was built.
    preview_bytes: bytes | None
    # File suffix matching preview_bytes (e.g. ".jpg"), or None.
    preview_suffix: str | None
    # Outcome marker: "processed", "unsupported", or "error".
    status: str
    # Classification/transcription details and error breadcrumbs for diagnostics.
    metadata_json: dict[str, object] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ArchiveMember:
    """Represents an extracted file entry from an archive."""

    # Member path as stored inside the archive.
    name: str
    # Fully decompressed member bytes.
    data: bytes
|
|
|
|
|
|
def sniff_mime(data: bytes) -> str:
    """Detects MIME type using libmagic for robust format handling."""

    detected = magic.from_buffer(data, mime=True)
    if detected:
        return detected
    # libmagic returned an empty/falsy result; fall back to the generic type.
    return "application/octet-stream"
|
|
|
|
|
|
def is_supported_for_extraction(extension: str, mime_type: str) -> bool:
    """Determines if a file should be text-processed for indexing and classification."""

    # Any text/* MIME type is processable regardless of extension.
    if mime_type.startswith("text/"):
        return True
    return extension in SUPPORTED_TEXT_EXTENSIONS
|
|
|
|
|
|
def _normalize_text(text: str) -> str:
|
|
"""Normalizes extracted text by removing repeated form separators and controls."""
|
|
|
|
cleaned = text.replace("\r", "\n").replace("\x00", "")
|
|
lines: list[str] = []
|
|
for line in cleaned.split("\n"):
|
|
stripped = line.strip()
|
|
if stripped and re.fullmatch(r"[.\-_*=~\s]{4,}", stripped):
|
|
continue
|
|
lines.append(line)
|
|
|
|
normalized = "\n".join(lines)
|
|
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
|
|
return normalized.strip()
|
|
|
|
|
|
def _extract_pdf_text(data: bytes) -> str:
    """Extracts text from PDF bytes using pypdf page parsing."""

    reader = PdfReader(io.BytesIO(data))
    # Pages with no extractable text contribute an empty string.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return _normalize_text("\n".join(page_texts))
|
|
|
|
|
|
def _extract_pdf_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Creates a JPEG thumbnail preview from the first PDF page.

    Returns (jpeg_bytes, ".jpg"), or (None, None) when the document cannot
    be opened, is empty, or rendering fails.
    """

    try:
        document = pymupdf.open(stream=data, filetype="pdf")
    except Exception:
        # Not a parseable PDF; previews are best-effort.
        return None, None

    try:
        if document.page_count < 1:
            return None, None
        first_page = document.load_page(0)
        # 1.5x zoom gives a reasonably crisp thumbnail without huge output.
        zoom = pymupdf.Matrix(1.5, 1.5)
        rendered = first_page.get_pixmap(matrix=zoom, alpha=False)
        return rendered.tobytes("jpeg"), ".jpg"
    except Exception:
        return None, None
    finally:
        document.close()
|
|
|
|
|
|
def _extract_docx_text(data: bytes) -> str:
    """Extracts paragraph text from DOCX content."""

    document = DocxDocument(io.BytesIO(data))
    # Keep only non-empty paragraphs, one per line.
    paragraphs = [paragraph.text for paragraph in document.paragraphs if paragraph.text]
    return _normalize_text("\n".join(paragraphs))
|
|
|
|
|
|
def _extract_xlsx_text(data: bytes) -> str:
    """Extracts cell text from XLSX workbook sheets for indexing.

    Sheet titles are included so they are searchable; at most the first 200
    rows (and at most 200 populated rows) are read per sheet to bound work
    on huge workbooks.
    """

    workbook = load_workbook(io.BytesIO(data), data_only=True, read_only=True)
    try:
        chunks: list[str] = []
        for sheet in workbook.worksheets:
            chunks.append(sheet.title)
            populated_rows = 0
            for row in sheet.iter_rows(min_row=1, max_row=200):
                row_values = [str(cell.value) for cell in row if cell.value is not None]
                if row_values:
                    chunks.append(" ".join(row_values))
                    populated_rows += 1
                if populated_rows >= 200:
                    break
        return _normalize_text("\n".join(chunks))
    finally:
        # Read-only workbooks keep their source stream open until explicitly
        # closed (openpyxl read-only mode); the original leaked this handle.
        workbook.close()
|
|
|
|
|
|
def _build_image_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Builds a JPEG preview thumbnail for image files.

    Returns (jpeg_bytes, ".jpg"), or (None, None) when the bytes cannot be
    decoded or re-encoded.
    """

    try:
        with Image.open(io.BytesIO(data)) as source:
            # Honor EXIF orientation, then force RGB so JPEG encoding works.
            oriented = ImageOps.exif_transpose(source).convert("RGB")
            oriented.thumbnail((600, 600))
            buffer = io.BytesIO()
            oriented.save(buffer, format="JPEG", optimize=True, quality=82)
    except Exception:
        return None, None
    return buffer.getvalue(), ".jpg"
|
|
|
|
|
|
def _extract_handwriting_text(data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text from image bytes and records handwriting-vs-printed classification metadata.

    Pipeline: build a preview thumbnail, classify the image text type, then
    transcribe unless classification reported no text. Classification
    failures are soft (recorded in metadata, transcription still attempted),
    except a missing provider configuration, which short-circuits with
    status "unsupported". Transcription failures produce status "error".
    """

    preview_bytes, preview_suffix = _build_image_preview(data)
    metadata_json: dict[str, object] = {}

    try:
        text_type = classify_image_text_bytes(data, mime_type=mime_type)
        metadata_json = {
            "image_text_type": text_type.label,
            "image_text_type_confidence": text_type.confidence,
            "image_text_type_provider": text_type.provider,
            "image_text_type_model": text_type.model,
        }
    except HandwritingTranscriptionNotConfiguredError as error:
        # No provider configured: transcription below could not run either.
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={"transcription_error": str(error), "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Classification is best-effort: record the failure and still try
        # transcription. (The original had two byte-identical except blocks
        # here; they are merged into one tuple catch.)
        metadata_json = {
            "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN,
            "image_text_type_error": str(error),
        }

    if metadata_json.get("image_text_type") == IMAGE_TEXT_TYPE_NO_TEXT:
        # Classifier reported no text: skip the expensive transcription call.
        metadata_json["transcription_skipped"] = "no_text_detected"
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="processed",
            metadata_json=metadata_json,
        )

    try:
        transcription = transcribe_handwriting_bytes(data, mime_type=mime_type)
    except HandwritingTranscriptionNotConfiguredError as error:
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Timeouts and provider errors are both hard failures for transcription
        # (previously two identical except blocks, merged into one).
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="error",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )

    transcription_metadata: dict[str, object] = {
        "transcription_provider": transcription.provider,
        "transcription_model": transcription.model,
        "transcription_uncertainties": transcription.uncertainties,
    }
    return ExtractionResult(
        text=_normalize_text(transcription.text),
        preview_bytes=preview_bytes,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={**metadata_json, **transcription_metadata},
    )
|
|
|
|
|
|
def extract_text_content(filename: str, data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text and optional preview bytes for supported file types.

    Dispatches on the filename extension (falling back to the sniffed MIME
    type for plain text). Returns status "unsupported" for unknown formats
    and "error" when a parser raises. Extracted text is capped at
    ``settings.max_text_length`` characters on every path, including the
    image/handwriting path (which previously returned untruncated text).
    """

    extension = Path(filename).suffix.lower()
    text = ""
    preview_bytes: bytes | None = None
    preview_suffix: str | None = None

    try:
        if extension == ".pdf":
            text = _extract_pdf_text(data)
            preview_bytes, preview_suffix = _extract_pdf_preview(data)
        elif extension in {".txt", ".md", ".csv", ".json", ".xml", ".svg"} or mime_type.startswith("text/"):
            text = _normalize_text(data.decode("utf-8", errors="ignore"))
        elif extension == ".docx":
            text = _extract_docx_text(data)
        elif extension == ".xlsx":
            text = _extract_xlsx_text(data)
        elif extension in IMAGE_EXTENSIONS:
            result = _extract_handwriting_text(data=data, mime_type=mime_type)
            # Apply the same length cap as every other extraction path.
            result.text = result.text[: settings.max_text_length]
            return result
        else:
            return ExtractionResult(
                text="",
                preview_bytes=None,
                preview_suffix=None,
                status="unsupported",
                metadata_json={"reason": "unsupported_format"},
            )
    except Exception as error:
        # Boundary catch: a corrupt file must not crash the indexing pipeline.
        return ExtractionResult(
            text="",
            preview_bytes=None,
            preview_suffix=None,
            status="error",
            metadata_json={"reason": "extraction_exception", "error": str(error)},
        )

    return ExtractionResult(
        text=text[: settings.max_text_length],
        preview_bytes=preview_bytes,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={},
    )
|
|
|
|
|
|
def extract_archive_members(data: bytes, depth: int = 0, max_members: int | None = None) -> list[ArchiveMember]:
    """Extracts processable ZIP members with depth-aware and decompression safety guardrails.

    Guardrails: a nesting-depth ceiling, a member-count cap, per-member and
    total uncompressed-size budgets, and a compression-ratio ceiling to
    defuse zip bombs. Members failing any guard are skipped, not fatal.

    Args:
        data: Raw ZIP archive bytes.
        depth: Current nesting depth (0 = top-level archive).
        max_members: Optional caller cap, clamped to the configured limit.

    Returns:
        Extracted members; empty for invalid archives or exceeded depth.
    """

    members: list[ArchiveMember] = []
    normalized_depth = max(0, depth)
    if normalized_depth >= settings.max_zip_depth:
        return members

    member_limit = settings.max_zip_members
    if max_members is not None:
        member_limit = max(0, min(settings.max_zip_members, int(max_members)))
    if member_limit <= 0:
        return members

    total_uncompressed_bytes = 0
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as archive:
            infos = [info for info in archive.infolist() if not info.is_dir()][:member_limit]
            for info in infos:
                if info.file_size <= 0:
                    continue
                if info.file_size > settings.max_zip_member_uncompressed_bytes:
                    continue
                if total_uncompressed_bytes + info.file_size > settings.max_zip_total_uncompressed_bytes:
                    continue

                # Reject implausibly high compression ratios (zip-bomb signal).
                compressed_size = max(1, int(info.compress_size))
                compression_ratio = float(info.file_size) / float(compressed_size)
                if compression_ratio > settings.max_zip_compression_ratio:
                    continue

                try:
                    with archive.open(info, mode="r") as archive_member:
                        # Read one byte past the cap so a lying size header
                        # is detectable below.
                        member_data = archive_member.read(settings.max_zip_member_uncompressed_bytes + 1)
                except (zipfile.BadZipFile, RuntimeError):
                    # Corrupt (BadZipFile) or encrypted (RuntimeError) members
                    # are skipped instead of aborting the whole archive, which
                    # previously discarded every member extracted so far.
                    continue
                if len(member_data) > settings.max_zip_member_uncompressed_bytes:
                    continue
                if total_uncompressed_bytes + len(member_data) > settings.max_zip_total_uncompressed_bytes:
                    continue

                total_uncompressed_bytes += len(member_data)
                members.append(ArchiveMember(name=info.filename, data=member_data))
    except zipfile.BadZipFile:
        # Not a valid ZIP container at all.
        return []

    return members
|