Initial commit
This commit is contained in:
315
backend/app/services/extractor.py
Normal file
315
backend/app/services/extractor.py
Normal file
@@ -0,0 +1,315 @@
|
||||
"""Document extraction service for text indexing, previews, and archive fan-out."""
|
||||
|
||||
import io
|
||||
import re
|
||||
import zipfile
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import magic
|
||||
from docx import Document as DocxDocument
|
||||
from openpyxl import load_workbook
|
||||
from PIL import Image, ImageOps
|
||||
from pypdf import PdfReader
|
||||
import pymupdf
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.services.handwriting import (
|
||||
IMAGE_TEXT_TYPE_NO_TEXT,
|
||||
IMAGE_TEXT_TYPE_UNKNOWN,
|
||||
HandwritingTranscriptionError,
|
||||
HandwritingTranscriptionNotConfiguredError,
|
||||
HandwritingTranscriptionTimeoutError,
|
||||
classify_image_text_bytes,
|
||||
transcribe_handwriting_bytes,
|
||||
)
|
||||
|
||||
|
||||
# Cached application settings (extraction size limits, zip fan-out limits, ...).
settings = get_settings()


# Image formats routed through preview generation and printed-vs-handwriting
# classification instead of plain text decoding.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".bmp",
    ".gif",
    ".webp",
    ".heic",
}

# Every extension eligible for text extraction; additional plain-text files are
# also accepted by MIME sniff in is_supported_for_extraction.
SUPPORTED_TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".csv",
    ".json",
    ".xml",
    ".svg",
    ".pdf",
    ".docx",
    ".xlsx",
    *IMAGE_EXTENSIONS,
}
|
||||
|
||||
|
||||
@dataclass
class ExtractionResult:
    """Represents output generated during extraction for a single file."""

    # Normalized extracted text; empty when nothing was extracted.
    text: str
    # Optional preview image bytes and the matching file suffix (e.g. ".jpg").
    preview_bytes: bytes | None
    preview_suffix: str | None
    # Outcome label; this module produces "processed", "unsupported", or "error".
    status: str
    # Extra extraction details (classification labels, provider/model names,
    # error reasons). Defaults to an empty dict per instance.
    metadata_json: dict[str, object] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ArchiveMember:
    """Represents an extracted file entry from an archive."""

    # Member path as stored inside the archive.
    name: str
    # Raw, fully-read member contents.
    data: bytes
|
||||
|
||||
|
||||
def sniff_mime(data: bytes) -> str:
    """Detects the MIME type of raw bytes via libmagic.

    Falls back to the generic binary type when libmagic returns nothing.
    """

    detected = magic.from_buffer(data, mime=True)
    if detected:
        return detected
    return "application/octet-stream"
|
||||
|
||||
|
||||
def is_supported_for_extraction(extension: str, mime_type: str) -> bool:
    """Determines if a file should be text-processed for indexing and classification.

    A file qualifies either by its extension or by a sniffed ``text/*`` MIME type.
    """

    if extension in SUPPORTED_TEXT_EXTENSIONS:
        return True
    return mime_type.startswith("text/")
|
||||
|
||||
|
||||
def _normalize_text(text: str) -> str:
|
||||
"""Normalizes extracted text by removing repeated form separators and controls."""
|
||||
|
||||
cleaned = text.replace("\r", "\n").replace("\x00", "")
|
||||
lines: list[str] = []
|
||||
for line in cleaned.split("\n"):
|
||||
stripped = line.strip()
|
||||
if stripped and re.fullmatch(r"[.\-_*=~\s]{4,}", stripped):
|
||||
continue
|
||||
lines.append(line)
|
||||
|
||||
normalized = "\n".join(lines)
|
||||
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
|
||||
return normalized.strip()
|
||||
|
||||
|
||||
def _extract_pdf_text(data: bytes) -> str:
    """Extracts text from PDF bytes using pypdf page parsing.

    Pages with no extractable text contribute an empty string so page
    boundaries still produce newlines in the joined output.
    """

    reader = PdfReader(io.BytesIO(data))
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return _normalize_text("\n".join(page_texts))
|
||||
|
||||
|
||||
def _extract_pdf_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Creates a JPEG thumbnail preview from the first PDF page.

    Returns ``(None, None)`` for unreadable documents, empty documents, or any
    rendering failure — previews are strictly best-effort.
    """

    try:
        document = pymupdf.open(stream=data, filetype="pdf")
    except Exception:
        # Corrupt or unparseable PDFs simply yield no preview.
        return None, None

    try:
        if document.page_count >= 1:
            first_page = document.load_page(0)
            # 1.5x zoom gives a readable thumbnail without a huge render.
            pixmap = first_page.get_pixmap(matrix=pymupdf.Matrix(1.5, 1.5), alpha=False)
            return pixmap.tobytes("jpeg"), ".jpg"
        return None, None
    except Exception:
        return None, None
    finally:
        document.close()
|
||||
|
||||
|
||||
def _extract_docx_text(data: bytes) -> str:
    """Extracts paragraph text from DOCX content, skipping empty paragraphs."""

    document = DocxDocument(io.BytesIO(data))
    non_empty = (paragraph.text for paragraph in document.paragraphs if paragraph.text)
    return _normalize_text("\n".join(non_empty))
|
||||
|
||||
|
||||
def _extract_xlsx_text(data: bytes) -> str:
    """Extracts cell text from XLSX workbook sheets for indexing.

    Reads at most 200 rows per sheet to bound work on very large workbooks.
    The workbook is explicitly closed in a ``finally`` block: read-only
    workbooks keep the underlying ZIP handle open until ``close()`` is called,
    so omitting it leaks a file handle per extraction.
    """

    workbook = load_workbook(io.BytesIO(data), data_only=True, read_only=True)
    try:
        chunks: list[str] = []
        for sheet in workbook.worksheets:
            # Sheet titles are indexed too — they often carry useful context.
            chunks.append(sheet.title)
            row_count = 0
            for row in sheet.iter_rows(min_row=1, max_row=200):
                row_values = [str(cell.value) for cell in row if cell.value is not None]
                if row_values:
                    chunks.append(" ".join(row_values))
                row_count += 1
                if row_count >= 200:
                    break
        return _normalize_text("\n".join(chunks))
    finally:
        workbook.close()
|
||||
|
||||
|
||||
def _build_image_preview(data: bytes) -> tuple[bytes | None, str | None]:
    """Builds a JPEG preview thumbnail (max 600x600) for image files.

    Applies EXIF orientation before resizing; any decode/encode failure
    yields ``(None, None)`` since previews are best-effort.
    """

    output = io.BytesIO()
    try:
        with Image.open(io.BytesIO(data)) as source:
            oriented = ImageOps.exif_transpose(source).convert("RGB")
            oriented.thumbnail((600, 600))
            oriented.save(output, format="JPEG", optimize=True, quality=82)
    except Exception:
        return None, None
    return output.getvalue(), ".jpg"
|
||||
|
||||
|
||||
def _extract_handwriting_text(data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text from image bytes and records handwriting-vs-printed classification metadata.

    Flow:
      1. Build a JPEG preview (best effort).
      2. Classify whether the image contains text at all.
      3. If text may be present, transcribe it via the handwriting service.

    A missing provider configuration marks the file ``unsupported``;
    classification timeouts/errors are non-fatal (type falls back to
    "unknown" and transcription is still attempted); transcription
    timeouts/errors produce an ``error`` result.

    Fix vs. original: the timeout and generic-error except clauses were exact
    duplicates in both try blocks — each pair is merged into one tuple clause
    with identical handling.
    """

    preview_bytes, preview_suffix = _build_image_preview(data)
    metadata_json: dict[str, object] = {}

    try:
        text_type = classify_image_text_bytes(data, mime_type=mime_type)
        metadata_json = {
            "image_text_type": text_type.label,
            "image_text_type_confidence": text_type.confidence,
            "image_text_type_provider": text_type.provider,
            "image_text_type_model": text_type.model,
        }
    except HandwritingTranscriptionNotConfiguredError as error:
        # Without a configured provider we can neither classify nor transcribe.
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={"transcription_error": str(error), "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Classification is best-effort: record the failure and still try transcription.
        metadata_json = {
            "image_text_type": IMAGE_TEXT_TYPE_UNKNOWN,
            "image_text_type_error": str(error),
        }

    if metadata_json.get("image_text_type") == IMAGE_TEXT_TYPE_NO_TEXT:
        # No text detected: skip the (expensive) transcription call entirely.
        metadata_json["transcription_skipped"] = "no_text_detected"
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="processed",
            metadata_json=metadata_json,
        )

    try:
        transcription = transcribe_handwriting_bytes(data, mime_type=mime_type)
        transcription_metadata: dict[str, object] = {
            "transcription_provider": transcription.provider,
            "transcription_model": transcription.model,
            "transcription_uncertainties": transcription.uncertainties,
        }
        return ExtractionResult(
            text=_normalize_text(transcription.text),
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="processed",
            metadata_json={**metadata_json, **transcription_metadata},
        )
    except HandwritingTranscriptionNotConfiguredError as error:
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="unsupported",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
    except (HandwritingTranscriptionTimeoutError, HandwritingTranscriptionError) as error:
        # Timeouts and provider failures both surface as extraction errors.
        return ExtractionResult(
            text="",
            preview_bytes=preview_bytes,
            preview_suffix=preview_suffix,
            status="error",
            metadata_json={**metadata_json, "transcription_error": str(error)},
        )
|
||||
|
||||
|
||||
def extract_text_content(filename: str, data: bytes, mime_type: str) -> ExtractionResult:
    """Extracts text and optional preview bytes for supported file types.

    Dispatches on the file extension (with a MIME fallback for plain text);
    images short-circuit into the handwriting pipeline. Unsupported formats
    and extraction exceptions are reported via the result status rather than
    raised to the caller. Extracted text is truncated to the configured limit.
    """

    suffix = Path(filename).suffix.lower()
    plain_text_suffixes = {".txt", ".md", ".csv", ".json", ".xml", ".svg"}

    extracted = ""
    preview: bytes | None = None
    preview_suffix: str | None = None

    try:
        if suffix == ".pdf":
            extracted = _extract_pdf_text(data)
            preview, preview_suffix = _extract_pdf_preview(data)
        elif suffix in plain_text_suffixes or mime_type.startswith("text/"):
            extracted = _normalize_text(data.decode("utf-8", errors="ignore"))
        elif suffix == ".docx":
            extracted = _extract_docx_text(data)
        elif suffix == ".xlsx":
            extracted = _extract_xlsx_text(data)
        elif suffix in IMAGE_EXTENSIONS:
            # Images go through classification + handwriting transcription.
            return _extract_handwriting_text(data=data, mime_type=mime_type)
        else:
            return ExtractionResult(
                text="",
                preview_bytes=None,
                preview_suffix=None,
                status="unsupported",
                metadata_json={"reason": "unsupported_format"},
            )
    except Exception as error:
        # Boundary handler: any extraction failure is reported, not raised.
        return ExtractionResult(
            text="",
            preview_bytes=None,
            preview_suffix=None,
            status="error",
            metadata_json={"reason": "extraction_exception", "error": str(error)},
        )

    return ExtractionResult(
        text=extracted[: settings.max_text_length],
        preview_bytes=preview,
        preview_suffix=preview_suffix,
        status="processed",
        metadata_json={},
    )
|
||||
|
||||
|
||||
def extract_archive_members(data: bytes, depth: int = 0) -> list[ArchiveMember]:
    """Extracts processable members from zip archives with configurable depth limits.

    Returns an empty list once ``depth`` exceeds the configured maximum; the
    caller drives recursion into nested archives. Directory entries are
    skipped and at most ``settings.max_zip_members`` files are read.
    """

    if depth > settings.max_zip_depth:
        return []

    collected: list[ArchiveMember] = []
    with zipfile.ZipFile(io.BytesIO(data)) as archive:
        file_entries = [entry for entry in archive.infolist() if not entry.is_dir()]
        for entry in file_entries[: settings.max_zip_members]:
            collected.append(ArchiveMember(name=entry.filename, data=archive.read(entry.filename)))
    return collected
|
||||
Reference in New Issue
Block a user