"""Background worker tasks for extraction, indexing, and archive fan-out.""" import uuid from datetime import UTC, datetime from pathlib import Path from sqlalchemy import select from sqlalchemy.orm import Session from app.db.base import SessionLocal from app.models.document import Document, DocumentStatus from app.services.app_settings import ( read_handwriting_provider_settings, read_handwriting_style_settings, read_processing_log_retention_settings, ) from app.services.extractor import ( IMAGE_EXTENSIONS, extract_archive_members, extract_text_content, is_supported_for_extraction, sniff_mime, ) from app.services.handwriting import IMAGE_TEXT_TYPE_HANDWRITING from app.services.handwriting_style import ( assign_handwriting_style, delete_handwriting_style_document, ) from app.services.processing_logs import cleanup_processing_logs, log_processing_event, set_processing_log_autocommit from app.services.routing_pipeline import ( apply_routing_decision, classify_document_routing, summarize_document, upsert_semantic_index, ) from app.services.storage import absolute_path, compute_sha256, store_bytes, write_preview from app.worker.queue import get_processing_queue def _cleanup_processing_logs_with_settings(session: Session) -> None: """Applies configured processing log retention while trimming old log entries.""" retention = read_processing_log_retention_settings() cleanup_processing_logs( session=session, keep_document_sessions=int(retention.get("keep_document_sessions", 2)), keep_unbound_entries=int(retention.get("keep_unbound_entries", 80)), ) def _create_archive_member_document( parent: Document, member_name: str, member_data: bytes, mime_type: str, ) -> Document: """Creates a child document entity for a file extracted from an uploaded archive.""" extension = Path(member_name).suffix.lower() stored_relative_path = store_bytes(member_name, member_data) return Document( original_filename=Path(member_name).name, source_relative_path=f"{parent.source_relative_path}/{member_name}".strip("/"), stored_relative_path=stored_relative_path, mime_type=mime_type, extension=extension, sha256=compute_sha256(member_data), size_bytes=len(member_data), logical_path=parent.logical_path, tags=list(parent.tags), metadata_json={"origin": "archive", "parent": str(parent.id)}, is_archive_member=True, archived_member_path=member_name, parent_document_id=parent.id, ) def process_document_task(document_id: str) -> None: """Processes one queued document and updates extraction and suggestion fields.""" with SessionLocal() as session: set_processing_log_autocommit(session, True) queue = get_processing_queue() document = session.execute( select(Document).where(Document.id == uuid.UUID(document_id)) ).scalar_one_or_none() if document is None: return log_processing_event( session=session, stage="worker", event="Document processing started", level="info", document=document, payload_json={"status": document.status.value}, ) if document.status == DocumentStatus.TRASHED: log_processing_event( session=session, stage="worker", event="Document skipped because it is trashed", level="warning", document=document, ) session.commit() return source_path = absolute_path(document.stored_relative_path) data = source_path.read_bytes() if document.extension == ".zip": child_ids: list[str] = [] log_processing_event( session=session, stage="archive", event="Archive extraction started", level="info", document=document, payload_json={"size_bytes": len(data)}, ) try: members = extract_archive_members(data) for member in members: mime_type = sniff_mime(member.data) child = _create_archive_member_document( parent=document, member_name=member.name, member_data=member.data, mime_type=mime_type, ) session.add(child) session.flush() child_ids.append(str(child.id)) log_processing_event( session=session, stage="archive", event="Archive member extracted and queued", level="info", document=child, payload_json={ "parent_document_id": str(document.id), "member_name": member.name, "member_size_bytes": len(member.data), "mime_type": mime_type, }, ) document.status = DocumentStatus.PROCESSED document.extracted_text = f"archive with {len(members)} files" log_processing_event( session=session, stage="archive", event="Archive extraction completed", level="info", document=document, payload_json={"member_count": len(members)}, ) except Exception as exc: document.status = DocumentStatus.ERROR document.metadata_json = {**document.metadata_json, "error": str(exc)} log_processing_event( session=session, stage="archive", event="Archive extraction failed", level="error", document=document, response_text=str(exc), ) if document.status == DocumentStatus.PROCESSED: try: summary_text = summarize_document(session=session, document=document) metadata_json = dict(document.metadata_json) metadata_json["summary_text"] = summary_text[:20000] document.metadata_json = metadata_json routing_decision = classify_document_routing(session=session, document=document, summary_text=summary_text) apply_routing_decision(document=document, decision=routing_decision, session=session) routing_metadata = document.metadata_json.get("routing", {}) log_processing_event( session=session, stage="routing", event="Routing decision applied", level="info", document=document, payload_json=routing_metadata if isinstance(routing_metadata, dict) else {}, ) log_processing_event( session=session, stage="indexing", event="Typesense upsert started", level="info", document=document, ) upsert_semantic_index(document=document, summary_text=summary_text) log_processing_event( session=session, stage="indexing", event="Typesense upsert completed", level="info", document=document, ) except Exception as exc: document.metadata_json = { **document.metadata_json, "routing_error": str(exc), } log_processing_event( session=session, stage="routing", event="Routing or indexing failed for archive document", level="error", document=document, response_text=str(exc), ) document.processed_at = datetime.now(UTC) log_processing_event( session=session, stage="worker", event="Document processing completed", level="info", document=document, payload_json={"status": document.status.value}, ) _cleanup_processing_logs_with_settings(session=session) session.commit() for child_id in child_ids: queue.enqueue("app.worker.tasks.process_document_task", child_id) for child_id in child_ids: log_processing_event( session=session, stage="archive", event="Archive child job enqueued", level="info", document_id=uuid.UUID(child_id), payload_json={"parent_document_id": str(document.id)}, ) session.commit() return if not is_supported_for_extraction(document.extension, document.mime_type): document.status = DocumentStatus.UNSUPPORTED document.processed_at = datetime.now(UTC) log_processing_event( session=session, stage="extraction", event="Document type unsupported for extraction", level="warning", document=document, payload_json={"extension": document.extension, "mime_type": document.mime_type}, ) log_processing_event( session=session, stage="worker", event="Document processing completed", level="info", document=document, payload_json={"status": document.status.value}, ) _cleanup_processing_logs_with_settings(session=session) session.commit() return if document.extension in IMAGE_EXTENSIONS: ocr_settings = read_handwriting_provider_settings() log_processing_event( session=session, stage="ocr", event="OCR request started", level="info", document=document, provider_id=str(ocr_settings.get("provider_id", "")), model_name=str(ocr_settings.get("openai_model", "")), prompt_text=str(ocr_settings.get("prompt", "")), payload_json={"mime_type": document.mime_type}, ) else: log_processing_event( session=session, stage="extraction", event="Text extraction started", level="info", document=document, payload_json={"extension": document.extension, "mime_type": document.mime_type}, ) extraction = extract_text_content(document.original_filename, data, document.mime_type) if extraction.preview_bytes and extraction.preview_suffix: preview_relative_path = write_preview(str(document.id), extraction.preview_bytes, extraction.preview_suffix) document.metadata_json = {**document.metadata_json, "preview_relative_path": preview_relative_path} document.preview_available = True log_processing_event( session=session, stage="extraction", event="Preview generated", level="info", document=document, payload_json={"preview_relative_path": preview_relative_path}, ) if extraction.metadata_json: document.metadata_json = {**document.metadata_json, **extraction.metadata_json} if document.extension in IMAGE_EXTENSIONS: image_text_type = extraction.metadata_json.get("image_text_type") if isinstance(image_text_type, str) and image_text_type.strip(): document.image_text_type = image_text_type.strip() else: document.image_text_type = None else: document.image_text_type = None document.handwriting_style_id = None if extraction.status == "error": document.status = DocumentStatus.ERROR document.metadata_json = {**document.metadata_json, "error": "extraction_failed"} if document.extension in IMAGE_EXTENSIONS: document.handwriting_style_id = None metadata_json = dict(document.metadata_json) metadata_json.pop("handwriting_style", None) document.metadata_json = metadata_json try: delete_handwriting_style_document(str(document.id)) except Exception: pass document.processed_at = datetime.now(UTC) log_processing_event( session=session, stage="extraction", event="Extraction failed", level="error", document=document, response_text=str(extraction.metadata_json.get("error", "extraction_failed")), payload_json=extraction.metadata_json, ) if "transcription_error" in extraction.metadata_json: log_processing_event( session=session, stage="ocr", event="OCR request failed", level="error", document=document, response_text=str(extraction.metadata_json.get("transcription_error", "")), ) log_processing_event( session=session, stage="worker", event="Document processing completed", level="info", document=document, payload_json={"status": document.status.value}, ) _cleanup_processing_logs_with_settings(session=session) session.commit() return if extraction.status == "unsupported": document.status = DocumentStatus.UNSUPPORTED if document.extension in IMAGE_EXTENSIONS: document.handwriting_style_id = None metadata_json = dict(document.metadata_json) metadata_json.pop("handwriting_style", None) document.metadata_json = metadata_json try: delete_handwriting_style_document(str(document.id)) except Exception: pass document.processed_at = datetime.now(UTC) log_processing_event( session=session, stage="extraction", event="Extraction returned unsupported", level="warning", document=document, payload_json=extraction.metadata_json, ) log_processing_event( session=session, stage="worker", event="Document processing completed", level="info", document=document, payload_json={"status": document.status.value}, ) _cleanup_processing_logs_with_settings(session=session) session.commit() return if document.extension in IMAGE_EXTENSIONS: image_text_type = document.image_text_type or "" if image_text_type == IMAGE_TEXT_TYPE_HANDWRITING: style_settings = read_handwriting_style_settings() if not bool(style_settings.get("enabled", True)): document.handwriting_style_id = None metadata_json = dict(document.metadata_json) metadata_json.pop("handwriting_style", None) metadata_json["handwriting_style_disabled"] = True document.metadata_json = metadata_json log_processing_event( session=session, stage="style", event="Handwriting style clustering disabled", level="warning", document=document, payload_json={ "enabled": False, "embed_model": style_settings.get("embed_model"), }, ) else: try: assignment = assign_handwriting_style( session=session, document=document, image_data=data, ) document.handwriting_style_id = assignment.style_cluster_id metadata_json = dict(document.metadata_json) metadata_json["handwriting_style"] = { "style_cluster_id": assignment.style_cluster_id, "matched_existing": assignment.matched_existing, "similarity": assignment.similarity, "vector_distance": assignment.vector_distance, "compared_neighbors": assignment.compared_neighbors, "match_min_similarity": assignment.match_min_similarity, "bootstrap_match_min_similarity": assignment.bootstrap_match_min_similarity, } metadata_json.pop("handwriting_style_disabled", None) document.metadata_json = metadata_json log_processing_event( session=session, stage="style", event="Handwriting style assigned", level="info", document=document, payload_json=metadata_json["handwriting_style"], ) except Exception as style_error: document.handwriting_style_id = None metadata_json = dict(document.metadata_json) metadata_json["handwriting_style_error"] = str(style_error) metadata_json.pop("handwriting_style", None) metadata_json.pop("handwriting_style_disabled", None) document.metadata_json = metadata_json log_processing_event( session=session, stage="style", event="Handwriting style assignment failed", level="error", document=document, response_text=str(style_error), ) else: document.handwriting_style_id = None metadata_json = dict(document.metadata_json) metadata_json.pop("handwriting_style", None) metadata_json.pop("handwriting_style_disabled", None) document.metadata_json = metadata_json try: delete_handwriting_style_document(str(document.id)) except Exception: pass if document.extension in IMAGE_EXTENSIONS: log_processing_event( session=session, stage="ocr", event="OCR response received", level="info", document=document, provider_id=str( extraction.metadata_json.get( "transcription_provider", extraction.metadata_json.get("image_text_type_provider", ""), ) ), model_name=str( extraction.metadata_json.get( "transcription_model", extraction.metadata_json.get("image_text_type_model", ""), ) ), response_text=extraction.text, payload_json={ "image_text_type": document.image_text_type, "image_text_type_confidence": extraction.metadata_json.get("image_text_type_confidence"), "transcription_skipped": extraction.metadata_json.get("transcription_skipped"), "uncertainty_count": len( extraction.metadata_json.get("transcription_uncertainties", []) if isinstance(extraction.metadata_json.get("transcription_uncertainties", []), list) else [] ) }, ) else: log_processing_event( session=session, stage="extraction", event="Text extraction completed", level="info", document=document, response_text=extraction.text, payload_json={"text_length": len(extraction.text)}, ) document.extracted_text = extraction.text try: summary_text = summarize_document(session=session, document=document) routing_decision = classify_document_routing(session=session, document=document, summary_text=summary_text) apply_routing_decision(document=document, decision=routing_decision, session=session) routing_metadata = document.metadata_json.get("routing", {}) log_processing_event( session=session, stage="routing", event="Routing decision applied", level="info", document=document, payload_json=routing_metadata if isinstance(routing_metadata, dict) else {}, ) log_processing_event( session=session, stage="indexing", event="Typesense upsert started", level="info", document=document, ) upsert_semantic_index(document=document, summary_text=summary_text) log_processing_event( session=session, stage="indexing", event="Typesense upsert completed", level="info", document=document, ) metadata_json = dict(document.metadata_json) metadata_json["summary_text"] = summary_text[:20000] document.metadata_json = metadata_json except Exception as exc: document.metadata_json = { **document.metadata_json, "routing_error": str(exc), } log_processing_event( session=session, stage="routing", event="Routing or indexing failed", level="error", document=document, response_text=str(exc), ) document.status = DocumentStatus.PROCESSED document.processed_at = datetime.now(UTC) log_processing_event( session=session, stage="worker", event="Document processing completed", level="info", document=document, payload_json={"status": document.status.value}, ) _cleanup_processing_logs_with_settings(session=session) session.commit()