Initial commit

This commit is contained in:
2026-02-21 09:44:18 -03:00
commit 5dfc2cbd85
65 changed files with 11989 additions and 0 deletions

View File

@@ -0,0 +1,315 @@
"""Document extraction service for text indexing, previews, and archive fan-out."""
import io
import re
import zipfile
from dataclasses import dataclass, field
from pathlib import Path
import magic
from docx import Document as DocxDocument
from openpyxl import load_workbook
from PIL import Image, ImageOps
from pypdf import PdfReader
import pymupdf
from app.core.config import get_settings
from app.services.handwriting import (
IMAGE_TEXT_TYPE_NO_TEXT,
IMAGE_TEXT_TYPE_UNKNOWN,
HandwritingTranscriptionError,
HandwritingTranscriptionNotConfiguredError,
HandwritingTranscriptionTimeoutError,
classify_image_text_bytes,
transcribe_handwriting_bytes,
)
settings = get_settings()
# Raster-image extensions that are routed through preview generation and
# handwriting/printed-text classification instead of plain text parsing.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".bmp",
    ".gif",
    ".webp",
    ".heic",
}
# Every extension eligible for text extraction. Files outside this set are
# still accepted when their MIME type is text/* (see is_supported_for_extraction);
# everything else is reported as unsupported.
SUPPORTED_TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".csv",
    ".json",
    ".xml",
    ".svg",
    ".pdf",
    ".docx",
    ".xlsx",
    *IMAGE_EXTENSIONS,
}
@dataclass
class ExtractionResult:
    """Represents output generated during extraction for a single file."""
    # Normalized extracted text; empty string when nothing was produced.
    text: str
    # JPEG preview bytes, or None when no preview could be generated.
    preview_bytes: bytes | None
    # File suffix for the preview (e.g. ".jpg"), or None when there is no preview.
    preview_suffix: str | None
    # One of "processed", "unsupported", or "error".
    status: str
    # Extraction/classification details (providers, errors, skip reasons).
    metadata_json: dict[str, object] = field(default_factory=dict)
@dataclass
class ArchiveMember:
    """Represents an extracted file entry from an archive."""
    # Member path/name as stored inside the archive.
    name: str
    # Raw decompressed bytes of the member.
    data: bytes
def sniff_mime(data: bytes) -> str:
    """Detects the MIME type of raw bytes via libmagic.

    Falls back to "application/octet-stream" when libmagic yields nothing.
    """
    detected = magic.from_buffer(data, mime=True)
    return detected if detected else "application/octet-stream"
def is_supported_for_extraction(extension: str, mime_type: str) -> bool:
    """Determines if a file should be text-processed for indexing and classification."""
    if mime_type.startswith("text/"):
        return True
    return extension in SUPPORTED_TEXT_EXTENSIONS
def _normalize_text(text: str) -> str:
"""Normalizes extracted text by removing repeated form separators and controls."""
cleaned = text.replace("\r", "\n").replace("\x00", "")
lines: list[str] = []
for line in cleaned.split("\n"):
stripped = line.strip()
if stripped and re.fullmatch(r"[.\-_*=~\s]{4,}", stripped):
continue
lines.append(line)
normalized = "\n".join(lines)
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
return normalized.strip()
def _extract_pdf_text(data: bytes) -> str:
    """Extracts concatenated per-page text from PDF bytes using pypdf."""
    reader = PdfReader(io.BytesIO(data))
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return _normalize_text("\n".join(page_texts))
def _extract_pdf_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Renders the first PDF page to a JPEG thumbnail.

    Returns (jpeg_bytes, ".jpg") on success, or (None, None) when the PDF
    cannot be opened, is empty, or rendering fails.
    """
    try:
        document = pymupdf.open(stream=data, filetype="pdf")
    except Exception:
        return None, None
    try:
        if document.page_count >= 1:
            first_page = document.load_page(0)
            # 1.5x zoom gives a readable thumbnail without a full-size render.
            rendered = first_page.get_pixmap(matrix=pymupdf.Matrix(1.5, 1.5), alpha=False)
            return rendered.tobytes("jpeg"), ".jpg"
        return None, None
    except Exception:
        return None, None
    finally:
        document.close()
def _extract_docx_text(data: bytes) -> str:
    """Extracts non-empty paragraph text from DOCX content."""
    paragraphs = DocxDocument(io.BytesIO(data)).paragraphs
    non_empty = (paragraph.text for paragraph in paragraphs if paragraph.text)
    return _normalize_text("\n".join(non_empty))
def _extract_xlsx_text(data: bytes) -> str:
    """Extracts sheet titles and cell text from XLSX workbook sheets for indexing.

    Scans at most the first 200 rows per sheet to bound work on large
    workbooks; rows with no values are skipped.
    """
    workbook = load_workbook(io.BytesIO(data), data_only=True, read_only=True)
    try:
        chunks: list[str] = []
        for sheet in workbook.worksheets:
            chunks.append(sheet.title)
            # iter_rows(max_row=200) already caps the scan, so the previous
            # manual row counter was redundant and has been removed.
            for row in sheet.iter_rows(min_row=1, max_row=200):
                row_values = [str(cell.value) for cell in row if cell.value is not None]
                if row_values:
                    chunks.append(" ".join(row_values))
        return _normalize_text("\n".join(chunks))
    finally:
        # Read-only workbooks keep their source stream open until closed.
        workbook.close()
def _build_image_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Builds a <=600px JPEG thumbnail from image bytes.

    Returns (jpeg_bytes, ".jpg"), or (None, None) when the bytes cannot be
    decoded or converted.
    """
    try:
        with Image.open(io.BytesIO(data)) as source:
            # Apply EXIF orientation, then force RGB so JPEG encoding works
            # for palette/alpha source modes too.
            thumbnail = ImageOps.exif_transpose(source).convert("RGB")
            thumbnail.thumbnail((600, 600))
            buffer = io.BytesIO()
            thumbnail.save(buffer, format="JPEG", optimize=True, quality=82)
            return buffer.getvalue(), ".jpg"
    except Exception:
        return None, None
def _extract_handwriting_text(data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text from image bytes and records handwriting-vs-printed
    classification metadata.

    Pipeline:
      1. Build a JPEG preview thumbnail (best effort).
      2. Classify whether the image contains handwritten/printed/no text.
      3. Unless classification says there is no text, transcribe it.

    Returns an ExtractionResult with status "processed" on success (or when
    no text was detected), "unsupported" when the transcription service is
    not configured, and "error" on transcription failure or timeout.
    """
    preview_bytes, preview_suffix = _build_image_preview(data)
    metadata_json: dict[str, object] = {}
    try:
        text_type = classify_image_text_bytes(data, mime_type=mime_type)
        metadata_json = {
            "image_text_type": text_type.label,
            "image_text_type_confidence": text_type.confidence,
            "image_text_type_provider": text_type.provider,
            "image_text_type_model": text_type.model,
        }
    except HandwritingTranscriptionNotConfiguredError as error:
        # No classifier configured: report unsupported rather than error.
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={"transcription_error": str(error), "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Timeout and generic failure were handled identically in two separate
        # clauses; merged. Classification failure is non-fatal — we still
        # attempt transcription below.
        metadata_json = {
            "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN,
            "image_text_type_error": str(error),
        }
    if metadata_json.get("image_text_type") == IMAGE_TEXT_TYPE_NO_TEXT:
        # Classifier found no text: skip the expensive transcription call.
        metadata_json["transcription_skipped"] = "no_text_detected"
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="processed",
            metadata_json=metadata_json,
        )
    try:
        # Keep the try body to just the call that can raise.
        transcription = transcribe_handwriting_bytes(data, mime_type=mime_type)
    except HandwritingTranscriptionNotConfiguredError as error:
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Two previously-duplicated handlers merged; both produced "error".
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="error",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
    return ExtractionResult(
        text=_normalize_text(transcription.text),
        preview_bytes=preview_bytes,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={
            **metadata_json,
            "transcription_provider": transcription.provider,
            "transcription_model": transcription.model,
            "transcription_uncertainties": transcription.uncertainties,
        },
    )
def extract_text_content(filename: str, data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text and optional preview bytes for supported file types.

    Dispatches on the filename extension (PDF, plain text/markup, DOCX,
    XLSX, raster images) and falls back to text decoding for text/* MIME
    types. Returns an ExtractionResult with status "processed",
    "unsupported", or "error". Extracted text is always capped at
    settings.max_text_length.
    """
    extension = Path(filename).suffix.lower()
    text = ""
    preview_bytes: bytes | None = None
    preview_suffix: str | None = None
    try:
        if extension == ".pdf":
            text = _extract_pdf_text(data)
            preview_bytes, preview_suffix = _extract_pdf_preview(data)
        elif extension in {".txt", ".md", ".csv", ".json", ".xml", ".svg"} or mime_type.startswith("text/"):
            text = _normalize_text(data.decode("utf-8", errors="ignore"))
        elif extension == ".docx":
            text = _extract_docx_text(data)
        elif extension == ".xlsx":
            text = _extract_xlsx_text(data)
        elif extension in IMAGE_EXTENSIONS:
            result = _extract_handwriting_text(data=data, mime_type=mime_type)
            # Enforce the same length cap as every other branch; the image
            # path previously returned transcriptions untruncated.
            result.text = result.text[: settings.max_text_length]
            return result
        else:
            return ExtractionResult(
                text="",
                preview_bytes=None,
                preview_suffix=None,
                status="unsupported",
                metadata_json={"reason": "unsupported_format"},
            )
    except Exception as error:
        # Extraction libraries raise many exception types on malformed
        # input; surface them uniformly as a structured error result.
        return ExtractionResult(
            text="",
            preview_bytes=None,
            preview_suffix=None,
            status="error",
            metadata_json={"reason": "extraction_exception", "error": str(error)},
        )
    return ExtractionResult(
        text=text[: settings.max_text_length],
        preview_bytes=preview_bytes,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={},
    )
def extract_archive_members(data: bytes, depth: int = 0) -> list[ArchiveMember]:
    """Extracts processable members from zip archives.

    Returns an empty list once depth exceeds settings.max_zip_depth, and
    reads at most settings.max_zip_members non-directory entries.
    """
    if depth > settings.max_zip_depth:
        return []
    extracted: list[ArchiveMember] = []
    with zipfile.ZipFile(io.BytesIO(data)) as archive:
        file_infos = [entry for entry in archive.infolist() if not entry.is_dir()]
        for entry in file_infos[: settings.max_zip_members]:
            extracted.append(ArchiveMember(name=entry.filename, data=archive.read(entry.filename)))
    return extracted