"""Background worker tasks for extraction, indexing, and archive fan-out."""
|
|
|
|
import uuid
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
|
|
from sqlalchemy import select
|
|
|
|
from app.db.base import SessionLocal
|
|
from app.models.document import Document, DocumentStatus
|
|
from app.services.app_settings import read_handwriting_provider_settings, read_handwriting_style_settings
|
|
from app.services.extractor import (
|
|
IMAGE_EXTENSIONS,
|
|
extract_archive_members,
|
|
extract_text_content,
|
|
is_supported_for_extraction,
|
|
sniff_mime,
|
|
)
|
|
from app.services.handwriting import IMAGE_TEXT_TYPE_HANDWRITING
|
|
from app.services.handwriting_style import (
|
|
assign_handwriting_style,
|
|
delete_handwriting_style_document,
|
|
)
|
|
from app.services.processing_logs import cleanup_processing_logs, log_processing_event, set_processing_log_autocommit
|
|
from app.services.routing_pipeline import (
|
|
apply_routing_decision,
|
|
classify_document_routing,
|
|
summarize_document,
|
|
upsert_semantic_index,
|
|
)
|
|
from app.services.storage import absolute_path, compute_sha256, store_bytes, write_preview
|
|
from app.worker.queue import get_processing_queue
|
|
|
|
|
|
def _create_archive_member_document(
    parent: Document,
    member_name: str,
    member_data: bytes,
    mime_type: str,
) -> Document:
    """Builds the child ``Document`` row for one file pulled out of an archive.

    The child inherits the parent's logical path and tags, stores its own
    payload bytes via ``store_bytes``, and records its archive origin (and the
    parent's id) in ``metadata_json`` so it can be traced back to the upload.
    """
    member_path = Path(member_name)
    # Combined path is stripped of any leading/trailing "/" so archive roots
    # don't produce empty segments.
    source_relative_path = f"{parent.source_relative_path}/{member_name}".strip("/")
    return Document(
        original_filename=member_path.name,
        source_relative_path=source_relative_path,
        stored_relative_path=store_bytes(member_name, member_data),
        mime_type=mime_type,
        extension=member_path.suffix.lower(),
        sha256=compute_sha256(member_data),
        size_bytes=len(member_data),
        logical_path=parent.logical_path,
        tags=list(parent.tags),
        metadata_json={"origin": "archive", "parent": str(parent.id)},
        is_archive_member=True,
        archived_member_path=member_name,
        parent_document_id=parent.id,
    )
def process_document_task(document_id: str) -> None:
    """Processes one queued document and updates extraction and suggestion fields.

    Background-job entry point. Every terminal path ends with
    ``session.commit()``. The flow is:

    * document missing      -> no-op return
    * document trashed      -> logged, skipped
    * ``.zip`` upload       -> members fanned out into child documents, the
                               container summarized/routed/indexed, children
                               re-enqueued as independent jobs
    * unsupported type      -> marked UNSUPPORTED
    * extraction error      -> marked ERROR (style artifacts cleaned up)
    * otherwise             -> extracted (OCR for images), optionally
                               handwriting-style clustered, summarized,
                               routed, indexed, marked PROCESSED

    NOTE(review): a routing/indexing failure only records ``routing_error`` in
    ``metadata_json``; the document still finishes as PROCESSED.
    """
    with SessionLocal() as session:
        # Processing-log writes commit immediately, so progress stays visible
        # even if this task dies before the final commit.
        set_processing_log_autocommit(session, True)
        queue = get_processing_queue()
        document = session.execute(
            select(Document).where(Document.id == uuid.UUID(document_id))
        ).scalar_one_or_none()
        if document is None:
            # Row vanished between enqueue and execution; nothing to do.
            return
        log_processing_event(
            session=session,
            stage="worker",
            event="Document processing started",
            level="info",
            document=document,
            payload_json={"status": document.status.value},
        )
        if document.status == DocumentStatus.TRASHED:
            log_processing_event(
                session=session,
                stage="worker",
                event="Document skipped because it is trashed",
                level="warning",
                document=document,
            )
            session.commit()
            return

        source_path = absolute_path(document.stored_relative_path)
        data = source_path.read_bytes()

        # ----- Archive fan-out: one child document per .zip member -----
        if document.extension == ".zip":
            child_ids: list[str] = []
            log_processing_event(
                session=session,
                stage="archive",
                event="Archive extraction started",
                level="info",
                document=document,
                payload_json={"size_bytes": len(data)},
            )
            try:
                members = extract_archive_members(data)
                for member in members:
                    mime_type = sniff_mime(member.data)
                    child = _create_archive_member_document(
                        parent=document,
                        member_name=member.name,
                        member_data=member.data,
                        mime_type=mime_type,
                    )
                    session.add(child)
                    # Flush so the child has a primary key before we log it
                    # and record its id for enqueueing below.
                    session.flush()
                    child_ids.append(str(child.id))
                    log_processing_event(
                        session=session,
                        stage="archive",
                        event="Archive member extracted and queued",
                        level="info",
                        document=child,
                        payload_json={
                            "parent_document_id": str(document.id),
                            "member_name": member.name,
                            "member_size_bytes": len(member.data),
                            "mime_type": mime_type,
                        },
                    )
                document.status = DocumentStatus.PROCESSED
                document.extracted_text = f"archive with {len(members)} files"
                log_processing_event(
                    session=session,
                    stage="archive",
                    event="Archive extraction completed",
                    level="info",
                    document=document,
                    payload_json={"member_count": len(members)},
                )
            except Exception as exc:
                document.status = DocumentStatus.ERROR
                # metadata_json is always reassigned as a fresh dict throughout
                # this task — presumably so the ORM detects the JSON change;
                # confirm against the Document column type.
                document.metadata_json = {**document.metadata_json, "error": str(exc)}
                log_processing_event(
                    session=session,
                    stage="archive",
                    event="Archive extraction failed",
                    level="error",
                    document=document,
                    response_text=str(exc),
                )

            # The archive container itself is also summarized/routed/indexed.
            if document.status == DocumentStatus.PROCESSED:
                try:
                    summary_text = summarize_document(session=session, document=document)
                    metadata_json = dict(document.metadata_json)
                    # Stored summary is capped at 20k characters.
                    metadata_json["summary_text"] = summary_text[:20000]
                    document.metadata_json = metadata_json
                    routing_decision = classify_document_routing(session=session, document=document, summary_text=summary_text)
                    apply_routing_decision(document=document, decision=routing_decision, session=session)
                    routing_metadata = document.metadata_json.get("routing", {})
                    log_processing_event(
                        session=session,
                        stage="routing",
                        event="Routing decision applied",
                        level="info",
                        document=document,
                        payload_json=routing_metadata if isinstance(routing_metadata, dict) else {},
                    )
                    log_processing_event(
                        session=session,
                        stage="indexing",
                        event="Typesense upsert started",
                        level="info",
                        document=document,
                    )
                    upsert_semantic_index(document=document, summary_text=summary_text)
                    log_processing_event(
                        session=session,
                        stage="indexing",
                        event="Typesense upsert completed",
                        level="info",
                        document=document,
                    )
                except Exception as exc:
                    document.metadata_json = {
                        **document.metadata_json,
                        "routing_error": str(exc),
                    }
                    log_processing_event(
                        session=session,
                        stage="routing",
                        event="Routing or indexing failed for archive document",
                        level="error",
                        document=document,
                        response_text=str(exc),
                    )
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            # Commit BEFORE enqueueing children so a worker picking up a child
            # job can already see its committed row.
            session.commit()
            for child_id in child_ids:
                queue.enqueue("app.worker.tasks.process_document_task", child_id)
            for child_id in child_ids:
                log_processing_event(
                    session=session,
                    stage="archive",
                    event="Archive child job enqueued",
                    level="info",
                    document_id=uuid.UUID(child_id),
                    payload_json={"parent_document_id": str(document.id)},
                )
            session.commit()
            return

        # ----- Non-archive: reject types extraction cannot handle -----
        if not is_supported_for_extraction(document.extension, document.mime_type):
            document.status = DocumentStatus.UNSUPPORTED
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="extraction",
                event="Document type unsupported for extraction",
                level="warning",
                document=document,
                payload_json={"extension": document.extension, "mime_type": document.mime_type},
            )
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            session.commit()
            return

        # Images go to the OCR pipeline; everything else to text extraction.
        if document.extension in IMAGE_EXTENSIONS:
            ocr_settings = read_handwriting_provider_settings()
            log_processing_event(
                session=session,
                stage="ocr",
                event="OCR request started",
                level="info",
                document=document,
                provider_id=str(ocr_settings.get("provider_id", "")),
                model_name=str(ocr_settings.get("openai_model", "")),
                prompt_text=str(ocr_settings.get("prompt", "")),
                payload_json={"mime_type": document.mime_type},
            )
        else:
            log_processing_event(
                session=session,
                stage="extraction",
                event="Text extraction started",
                level="info",
                document=document,
                payload_json={"extension": document.extension, "mime_type": document.mime_type},
            )

        extraction = extract_text_content(document.original_filename, data, document.mime_type)
        if extraction.preview_bytes and extraction.preview_suffix:
            preview_relative_path = write_preview(str(document.id), extraction.preview_bytes, extraction.preview_suffix)
            document.metadata_json = {**document.metadata_json, "preview_relative_path": preview_relative_path}
            document.preview_available = True
            log_processing_event(
                session=session,
                stage="extraction",
                event="Preview generated",
                level="info",
                document=document,
                payload_json={"preview_relative_path": preview_relative_path},
            )
        if extraction.metadata_json:
            document.metadata_json = {**document.metadata_json, **extraction.metadata_json}
        if document.extension in IMAGE_EXTENSIONS:
            # Classifier output for the image; blank/missing values normalize
            # to None.
            image_text_type = extraction.metadata_json.get("image_text_type")
            if isinstance(image_text_type, str) and image_text_type.strip():
                document.image_text_type = image_text_type.strip()
            else:
                document.image_text_type = None
        else:
            document.image_text_type = None
            document.handwriting_style_id = None

        if extraction.status == "error":
            document.status = DocumentStatus.ERROR
            document.metadata_json = {**document.metadata_json, "error": "extraction_failed"}
            if document.extension in IMAGE_EXTENSIONS:
                # Drop any stale handwriting-style assignment for this image.
                document.handwriting_style_id = None
                metadata_json = dict(document.metadata_json)
                metadata_json.pop("handwriting_style", None)
                document.metadata_json = metadata_json
                try:
                    delete_handwriting_style_document(str(document.id))
                except Exception:
                    # Best-effort cleanup of the external style index.
                    pass
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="extraction",
                event="Extraction failed",
                level="error",
                document=document,
                response_text=str(extraction.metadata_json.get("error", "extraction_failed")),
                payload_json=extraction.metadata_json,
            )
            if "transcription_error" in extraction.metadata_json:
                log_processing_event(
                    session=session,
                    stage="ocr",
                    event="OCR request failed",
                    level="error",
                    document=document,
                    response_text=str(extraction.metadata_json.get("transcription_error", "")),
                )
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            session.commit()
            return

        if extraction.status == "unsupported":
            document.status = DocumentStatus.UNSUPPORTED
            if document.extension in IMAGE_EXTENSIONS:
                # Same stale-style cleanup as the error path above.
                document.handwriting_style_id = None
                metadata_json = dict(document.metadata_json)
                metadata_json.pop("handwriting_style", None)
                document.metadata_json = metadata_json
                try:
                    delete_handwriting_style_document(str(document.id))
                except Exception:
                    pass
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="extraction",
                event="Extraction returned unsupported",
                level="warning",
                document=document,
                payload_json=extraction.metadata_json,
            )
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            session.commit()
            return

        # ----- Handwriting-style clustering (handwritten images only) -----
        if document.extension in IMAGE_EXTENSIONS:
            image_text_type = document.image_text_type or ""
            if image_text_type == IMAGE_TEXT_TYPE_HANDWRITING:
                style_settings = read_handwriting_style_settings()
                if not bool(style_settings.get("enabled", True)):
                    # Feature toggled off: clear any assignment and flag why.
                    document.handwriting_style_id = None
                    metadata_json = dict(document.metadata_json)
                    metadata_json.pop("handwriting_style", None)
                    metadata_json["handwriting_style_disabled"] = True
                    document.metadata_json = metadata_json
                    log_processing_event(
                        session=session,
                        stage="style",
                        event="Handwriting style clustering disabled",
                        level="warning",
                        document=document,
                        payload_json={
                            "enabled": False,
                            "embed_model": style_settings.get("embed_model"),
                        },
                    )
                else:
                    try:
                        assignment = assign_handwriting_style(
                            session=session,
                            document=document,
                            image_data=data,
                        )
                        document.handwriting_style_id = assignment.style_cluster_id
                        metadata_json = dict(document.metadata_json)
                        metadata_json["handwriting_style"] = {
                            "style_cluster_id": assignment.style_cluster_id,
                            "matched_existing": assignment.matched_existing,
                            "similarity": assignment.similarity,
                            "vector_distance": assignment.vector_distance,
                            "compared_neighbors": assignment.compared_neighbors,
                            "match_min_similarity": assignment.match_min_similarity,
                            "bootstrap_match_min_similarity": assignment.bootstrap_match_min_similarity,
                        }
                        metadata_json.pop("handwriting_style_disabled", None)
                        document.metadata_json = metadata_json
                        log_processing_event(
                            session=session,
                            stage="style",
                            event="Handwriting style assigned",
                            level="info",
                            document=document,
                            payload_json=metadata_json["handwriting_style"],
                        )
                    except Exception as style_error:
                        # Style assignment is non-fatal: record the error and
                        # continue with extraction/routing.
                        document.handwriting_style_id = None
                        metadata_json = dict(document.metadata_json)
                        metadata_json["handwriting_style_error"] = str(style_error)
                        metadata_json.pop("handwriting_style", None)
                        metadata_json.pop("handwriting_style_disabled", None)
                        document.metadata_json = metadata_json
                        log_processing_event(
                            session=session,
                            stage="style",
                            event="Handwriting style assignment failed",
                            level="error",
                            document=document,
                            response_text=str(style_error),
                        )
            else:
                # Image is not handwritten: remove any previous style state.
                document.handwriting_style_id = None
                metadata_json = dict(document.metadata_json)
                metadata_json.pop("handwriting_style", None)
                metadata_json.pop("handwriting_style_disabled", None)
                document.metadata_json = metadata_json
                try:
                    delete_handwriting_style_document(str(document.id))
                except Exception:
                    pass

        if document.extension in IMAGE_EXTENSIONS:
            log_processing_event(
                session=session,
                stage="ocr",
                event="OCR response received",
                level="info",
                document=document,
                # Transcription provider/model take precedence; fall back to
                # the image-text-type classifier's provider/model.
                provider_id=str(
                    extraction.metadata_json.get(
                        "transcription_provider",
                        extraction.metadata_json.get("image_text_type_provider", ""),
                    )
                ),
                model_name=str(
                    extraction.metadata_json.get(
                        "transcription_model",
                        extraction.metadata_json.get("image_text_type_model", ""),
                    )
                ),
                response_text=extraction.text,
                payload_json={
                    "image_text_type": document.image_text_type,
                    "image_text_type_confidence": extraction.metadata_json.get("image_text_type_confidence"),
                    "transcription_skipped": extraction.metadata_json.get("transcription_skipped"),
                    # Guard against a non-list value sneaking into metadata.
                    "uncertainty_count": len(
                        extraction.metadata_json.get("transcription_uncertainties", [])
                        if isinstance(extraction.metadata_json.get("transcription_uncertainties", []), list)
                        else []
                    )
                },
            )
        else:
            log_processing_event(
                session=session,
                stage="extraction",
                event="Text extraction completed",
                level="info",
                document=document,
                response_text=extraction.text,
                payload_json={"text_length": len(extraction.text)},
            )

        document.extracted_text = extraction.text

        # ----- Summarize, route, and index the extracted document -----
        try:
            summary_text = summarize_document(session=session, document=document)
            routing_decision = classify_document_routing(session=session, document=document, summary_text=summary_text)
            apply_routing_decision(document=document, decision=routing_decision, session=session)
            routing_metadata = document.metadata_json.get("routing", {})
            log_processing_event(
                session=session,
                stage="routing",
                event="Routing decision applied",
                level="info",
                document=document,
                payload_json=routing_metadata if isinstance(routing_metadata, dict) else {},
            )
            log_processing_event(
                session=session,
                stage="indexing",
                event="Typesense upsert started",
                level="info",
                document=document,
            )
            upsert_semantic_index(document=document, summary_text=summary_text)
            log_processing_event(
                session=session,
                stage="indexing",
                event="Typesense upsert completed",
                level="info",
                document=document,
            )
            # NOTE(review): here the summary is stored AFTER indexing, whereas
            # the archive branch stores it before routing — confirm whether
            # the difference is intentional.
            metadata_json = dict(document.metadata_json)
            metadata_json["summary_text"] = summary_text[:20000]
            document.metadata_json = metadata_json
        except Exception as exc:
            document.metadata_json = {
                **document.metadata_json,
                "routing_error": str(exc),
            }
            log_processing_event(
                session=session,
                stage="routing",
                event="Routing or indexing failed",
                level="error",
                document=document,
                response_text=str(exc),
            )

        # Routing failures are non-fatal: the document is still PROCESSED.
        document.status = DocumentStatus.PROCESSED
        document.processed_at = datetime.now(UTC)
        log_processing_event(
            session=session,
            stage="worker",
            event="Document processing completed",
            level="info",
            document=document,
            payload_json={"status": document.status.value},
        )
        cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
        session.commit()