Initial commit

commit 5dfc2cbd85
2026-02-21 09:44:18 -03:00
65 changed files with 11989 additions and 0 deletions

backend/app/worker/__init__.py Normal file

@@ -0,0 +1 @@
"""Background worker package for queueing and document processing tasks."""

backend/app/worker/queue.py Normal file

@@ -0,0 +1,21 @@
"""Queue connection helpers used by API and worker processes."""
from redis import Redis
from rq import Queue
from app.core.config import get_settings
settings = get_settings()
def get_redis() -> Redis:
"""Creates a Redis connection from configured URL."""
return Redis.from_url(settings.redis_url)
def get_processing_queue() -> Queue:
"""Returns the named queue for document processing jobs."""
return Queue("dcm", connection=get_redis())
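
# Usage sketch (illustrative, not part of this module's API): callers enqueue
# work by dotted path rather than importing the task function, mirroring the
# fan-out call in app.worker.tasks:
#
#     queue = get_processing_queue()
#     queue.enqueue("app.worker.tasks.process_document_task", str(document.id))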

backend/app/worker/tasks.py Normal file

@@ -0,0 +1,544 @@
"""Background worker tasks for extraction, indexing, and archive fan-out."""
import uuid
from datetime import UTC, datetime
from pathlib import Path

from sqlalchemy import select

from app.db.base import SessionLocal
from app.models.document import Document, DocumentStatus
from app.services.app_settings import read_handwriting_provider_settings, read_handwriting_style_settings
from app.services.extractor import (
    IMAGE_EXTENSIONS,
    extract_archive_members,
    extract_text_content,
    is_supported_for_extraction,
    sniff_mime,
)
from app.services.handwriting import IMAGE_TEXT_TYPE_HANDWRITING
from app.services.handwriting_style import (
    assign_handwriting_style,
    delete_handwriting_style_document,
)
from app.services.processing_logs import cleanup_processing_logs, log_processing_event, set_processing_log_autocommit
from app.services.routing_pipeline import (
    apply_routing_decision,
    classify_document_routing,
    summarize_document,
    upsert_semantic_index,
)
from app.services.storage import absolute_path, compute_sha256, store_bytes, write_preview
from app.worker.queue import get_processing_queue
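
# Pipeline overview (process_document_task below):
#   1. Load the document; bail out if it is missing or trashed.
#   2. ".zip" uploads fan out: each member becomes a child Document with its
#      own queued job.
#   3. Everything else goes through text extraction (OCR for images), preview
#      generation, and, for handwritten images, handwriting-style clustering.
#   4. Finally: summarize, classify routing, and upsert the semantic index.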

def _create_archive_member_document(
    parent: Document,
    member_name: str,
    member_data: bytes,
    mime_type: str,
) -> Document:
    """Creates a child document entity for a file extracted from an uploaded archive."""
    extension = Path(member_name).suffix.lower()
    stored_relative_path = store_bytes(member_name, member_data)
    return Document(
        original_filename=Path(member_name).name,
        source_relative_path=f"{parent.source_relative_path}/{member_name}".strip("/"),
        stored_relative_path=stored_relative_path,
        mime_type=mime_type,
        extension=extension,
        sha256=compute_sha256(member_data),
        size_bytes=len(member_data),
        logical_path=parent.logical_path,
        tags=list(parent.tags),
        metadata_json={"origin": "archive", "parent": str(parent.id)},
        is_archive_member=True,
        archived_member_path=member_name,
        parent_document_id=parent.id,
    )

def process_document_task(document_id: str) -> None:
    """Processes one queued document and updates extraction and suggestion fields."""
    with SessionLocal() as session:
        set_processing_log_autocommit(session, True)
        queue = get_processing_queue()
        document = session.execute(
            select(Document).where(Document.id == uuid.UUID(document_id))
        ).scalar_one_or_none()
        if document is None:
            return
        log_processing_event(
            session=session,
            stage="worker",
            event="Document processing started",
            level="info",
            document=document,
            payload_json={"status": document.status.value},
        )
        if document.status == DocumentStatus.TRASHED:
            log_processing_event(
                session=session,
                stage="worker",
                event="Document skipped because it is trashed",
                level="warning",
                document=document,
            )
            session.commit()
            return
        source_path = absolute_path(document.stored_relative_path)
        data = source_path.read_bytes()

        if document.extension == ".zip":
            child_ids: list[str] = []
            log_processing_event(
                session=session,
                stage="archive",
                event="Archive extraction started",
                level="info",
                document=document,
                payload_json={"size_bytes": len(data)},
            )
            try:
                members = extract_archive_members(data)
                for member in members:
                    mime_type = sniff_mime(member.data)
                    child = _create_archive_member_document(
                        parent=document,
                        member_name=member.name,
                        member_data=member.data,
                        mime_type=mime_type,
                    )
                    session.add(child)
                    session.flush()
                    child_ids.append(str(child.id))
                    log_processing_event(
                        session=session,
                        stage="archive",
                        event="Archive member extracted and queued",
                        level="info",
                        document=child,
                        payload_json={
                            "parent_document_id": str(document.id),
                            "member_name": member.name,
                            "member_size_bytes": len(member.data),
                            "mime_type": mime_type,
                        },
                    )
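                # Archive parents keep only a short synthetic body; the
                # summarization and routing step below works from this placeholder.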
                document.status = DocumentStatus.PROCESSED
                document.extracted_text = f"archive with {len(members)} files"
                log_processing_event(
                    session=session,
                    stage="archive",
                    event="Archive extraction completed",
                    level="info",
                    document=document,
                    payload_json={"member_count": len(members)},
                )
            except Exception as exc:
                document.status = DocumentStatus.ERROR
                document.metadata_json = {**document.metadata_json, "error": str(exc)}
                log_processing_event(
                    session=session,
                    stage="archive",
                    event="Archive extraction failed",
                    level="error",
                    document=document,
                    response_text=str(exc),
                )
            if document.status == DocumentStatus.PROCESSED:
                try:
                    summary_text = summarize_document(session=session, document=document)
                    metadata_json = dict(document.metadata_json)
                    metadata_json["summary_text"] = summary_text[:20000]
                    document.metadata_json = metadata_json
                    routing_decision = classify_document_routing(session=session, document=document, summary_text=summary_text)
                    apply_routing_decision(document=document, decision=routing_decision, session=session)
                    routing_metadata = document.metadata_json.get("routing", {})
                    log_processing_event(
                        session=session,
                        stage="routing",
                        event="Routing decision applied",
                        level="info",
                        document=document,
                        payload_json=routing_metadata if isinstance(routing_metadata, dict) else {},
                    )
                    log_processing_event(
                        session=session,
                        stage="indexing",
                        event="Typesense upsert started",
                        level="info",
                        document=document,
                    )
                    upsert_semantic_index(document=document, summary_text=summary_text)
                    log_processing_event(
                        session=session,
                        stage="indexing",
                        event="Typesense upsert completed",
                        level="info",
                        document=document,
                    )
                except Exception as exc:
                    document.metadata_json = {
                        **document.metadata_json,
                        "routing_error": str(exc),
                    }
                    log_processing_event(
                        session=session,
                        stage="routing",
                        event="Routing or indexing failed for archive document",
                        level="error",
                        document=document,
                        response_text=str(exc),
                    )
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            session.commit()
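            # Enqueue children only after the commit above so a worker cannot
            # pick up a child job before its Document row is visible.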
            for child_id in child_ids:
                queue.enqueue("app.worker.tasks.process_document_task", child_id)
            for child_id in child_ids:
                log_processing_event(
                    session=session,
                    stage="archive",
                    event="Archive child job enqueued",
                    level="info",
                    document_id=uuid.UUID(child_id),
                    payload_json={"parent_document_id": str(document.id)},
                )
            session.commit()
            return

        if not is_supported_for_extraction(document.extension, document.mime_type):
            document.status = DocumentStatus.UNSUPPORTED
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="extraction",
                event="Document type unsupported for extraction",
                level="warning",
                document=document,
                payload_json={"extension": document.extension, "mime_type": document.mime_type},
            )
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            session.commit()
            return

        if document.extension in IMAGE_EXTENSIONS:
            ocr_settings = read_handwriting_provider_settings()
            log_processing_event(
                session=session,
                stage="ocr",
                event="OCR request started",
                level="info",
                document=document,
                provider_id=str(ocr_settings.get("provider_id", "")),
                model_name=str(ocr_settings.get("openai_model", "")),
                prompt_text=str(ocr_settings.get("prompt", "")),
                payload_json={"mime_type": document.mime_type},
            )
        else:
            log_processing_event(
                session=session,
                stage="extraction",
                event="Text extraction started",
                level="info",
                document=document,
                payload_json={"extension": document.extension, "mime_type": document.mime_type},
            )

        extraction = extract_text_content(document.original_filename, data, document.mime_type)
        if extraction.preview_bytes and extraction.preview_suffix:
            preview_relative_path = write_preview(str(document.id), extraction.preview_bytes, extraction.preview_suffix)
            document.metadata_json = {**document.metadata_json, "preview_relative_path": preview_relative_path}
            document.preview_available = True
            log_processing_event(
                session=session,
                stage="extraction",
                event="Preview generated",
                level="info",
                document=document,
                payload_json={"preview_relative_path": preview_relative_path},
            )
        if extraction.metadata_json:
            document.metadata_json = {**document.metadata_json, **extraction.metadata_json}
        if document.extension in IMAGE_EXTENSIONS:
            image_text_type = extraction.metadata_json.get("image_text_type")
            if isinstance(image_text_type, str) and image_text_type.strip():
                document.image_text_type = image_text_type.strip()
            else:
                document.image_text_type = None
        else:
            document.image_text_type = None
            document.handwriting_style_id = None
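        # The normalized image_text_type is consulted again below when deciding
        # whether to run handwriting-style clustering.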

        if extraction.status == "error":
            document.status = DocumentStatus.ERROR
            document.metadata_json = {**document.metadata_json, "error": "extraction_failed"}
            if document.extension in IMAGE_EXTENSIONS:
                document.handwriting_style_id = None
                metadata_json = dict(document.metadata_json)
                metadata_json.pop("handwriting_style", None)
                document.metadata_json = metadata_json
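                # Best-effort cleanup of the style store; a failure here must not
                # mask the extraction error being recorded.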
                try:
                    delete_handwriting_style_document(str(document.id))
                except Exception:
                    pass
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="extraction",
                event="Extraction failed",
                level="error",
                document=document,
                response_text=str(extraction.metadata_json.get("error", "extraction_failed")),
                payload_json=extraction.metadata_json,
            )
            if "transcription_error" in extraction.metadata_json:
                log_processing_event(
                    session=session,
                    stage="ocr",
                    event="OCR request failed",
                    level="error",
                    document=document,
                    response_text=str(extraction.metadata_json.get("transcription_error", "")),
                )
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            session.commit()
            return

        if extraction.status == "unsupported":
            document.status = DocumentStatus.UNSUPPORTED
            if document.extension in IMAGE_EXTENSIONS:
                document.handwriting_style_id = None
                metadata_json = dict(document.metadata_json)
                metadata_json.pop("handwriting_style", None)
                document.metadata_json = metadata_json
                try:
                    delete_handwriting_style_document(str(document.id))
                except Exception:
                    pass
            document.processed_at = datetime.now(UTC)
            log_processing_event(
                session=session,
                stage="extraction",
                event="Extraction returned unsupported",
                level="warning",
                document=document,
                payload_json=extraction.metadata_json,
            )
            log_processing_event(
                session=session,
                stage="worker",
                event="Document processing completed",
                level="info",
                document=document,
                payload_json={"status": document.status.value},
            )
            cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
            session.commit()
            return

        if document.extension in IMAGE_EXTENSIONS:
            image_text_type = document.image_text_type or ""
            if image_text_type == IMAGE_TEXT_TYPE_HANDWRITING:
                style_settings = read_handwriting_style_settings()
                if not bool(style_settings.get("enabled", True)):
                    document.handwriting_style_id = None
                    metadata_json = dict(document.metadata_json)
                    metadata_json.pop("handwriting_style", None)
                    metadata_json["handwriting_style_disabled"] = True
                    document.metadata_json = metadata_json
                    log_processing_event(
                        session=session,
                        stage="style",
                        event="Handwriting style clustering disabled",
                        level="warning",
                        document=document,
                        payload_json={
                            "enabled": False,
                            "embed_model": style_settings.get("embed_model"),
                        },
                    )
                else:
                    try:
                        assignment = assign_handwriting_style(
                            session=session,
                            document=document,
                            image_data=data,
                        )
                        document.handwriting_style_id = assignment.style_cluster_id
                        metadata_json = dict(document.metadata_json)
                        metadata_json["handwriting_style"] = {
                            "style_cluster_id": assignment.style_cluster_id,
                            "matched_existing": assignment.matched_existing,
                            "similarity": assignment.similarity,
                            "vector_distance": assignment.vector_distance,
                            "compared_neighbors": assignment.compared_neighbors,
                            "match_min_similarity": assignment.match_min_similarity,
                            "bootstrap_match_min_similarity": assignment.bootstrap_match_min_similarity,
                        }
                        metadata_json.pop("handwriting_style_disabled", None)
                        document.metadata_json = metadata_json
                        log_processing_event(
                            session=session,
                            stage="style",
                            event="Handwriting style assigned",
                            level="info",
                            document=document,
                            payload_json=metadata_json["handwriting_style"],
                        )
                    except Exception as style_error:
                        document.handwriting_style_id = None
                        metadata_json = dict(document.metadata_json)
                        metadata_json["handwriting_style_error"] = str(style_error)
                        metadata_json.pop("handwriting_style", None)
                        metadata_json.pop("handwriting_style_disabled", None)
                        document.metadata_json = metadata_json
                        log_processing_event(
                            session=session,
                            stage="style",
                            event="Handwriting style assignment failed",
                            level="error",
                            document=document,
                            response_text=str(style_error),
                        )
            else:
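                # Not handwriting: drop any stale style assignment and its
                # vector-store entry left over from a previous run.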
                document.handwriting_style_id = None
                metadata_json = dict(document.metadata_json)
                metadata_json.pop("handwriting_style", None)
                metadata_json.pop("handwriting_style_disabled", None)
                document.metadata_json = metadata_json
                try:
                    delete_handwriting_style_document(str(document.id))
                except Exception:
                    pass

        if document.extension in IMAGE_EXTENSIONS:
            log_processing_event(
                session=session,
                stage="ocr",
                event="OCR response received",
                level="info",
                document=document,
                provider_id=str(
                    extraction.metadata_json.get(
                        "transcription_provider",
                        extraction.metadata_json.get("image_text_type_provider", ""),
                    )
                ),
                model_name=str(
                    extraction.metadata_json.get(
                        "transcription_model",
                        extraction.metadata_json.get("image_text_type_model", ""),
                    )
                ),
                response_text=extraction.text,
                payload_json={
                    "image_text_type": document.image_text_type,
                    "image_text_type_confidence": extraction.metadata_json.get("image_text_type_confidence"),
                    "transcription_skipped": extraction.metadata_json.get("transcription_skipped"),
                    "uncertainty_count": len(
                        extraction.metadata_json.get("transcription_uncertainties", [])
                        if isinstance(extraction.metadata_json.get("transcription_uncertainties", []), list)
                        else []
                    )
                },
            )
        else:
            log_processing_event(
                session=session,
                stage="extraction",
                event="Text extraction completed",
                level="info",
                document=document,
                response_text=extraction.text,
                payload_json={"text_length": len(extraction.text)},
            )
        document.extracted_text = extraction.text
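        # Summarize, classify routing, apply the decision, and upsert the semantic
        # index. A failure here is recorded as routing_error, but the document
        # still finishes as PROCESSED.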
        try:
            summary_text = summarize_document(session=session, document=document)
            routing_decision = classify_document_routing(session=session, document=document, summary_text=summary_text)
            apply_routing_decision(document=document, decision=routing_decision, session=session)
            routing_metadata = document.metadata_json.get("routing", {})
            log_processing_event(
                session=session,
                stage="routing",
                event="Routing decision applied",
                level="info",
                document=document,
                payload_json=routing_metadata if isinstance(routing_metadata, dict) else {},
            )
            log_processing_event(
                session=session,
                stage="indexing",
                event="Typesense upsert started",
                level="info",
                document=document,
            )
            upsert_semantic_index(document=document, summary_text=summary_text)
            log_processing_event(
                session=session,
                stage="indexing",
                event="Typesense upsert completed",
                level="info",
                document=document,
            )
            metadata_json = dict(document.metadata_json)
            metadata_json["summary_text"] = summary_text[:20000]
            document.metadata_json = metadata_json
        except Exception as exc:
            document.metadata_json = {
                **document.metadata_json,
                "routing_error": str(exc),
            }
            log_processing_event(
                session=session,
                stage="routing",
                event="Routing or indexing failed",
                level="error",
                document=document,
                response_text=str(exc),
            )
        document.status = DocumentStatus.PROCESSED
        document.processed_at = datetime.now(UTC)
        log_processing_event(
            session=session,
            stage="worker",
            event="Document processing completed",
            level="info",
            document=document,
            payload_json={"status": document.status.value},
        )
        cleanup_processing_logs(session=session, keep_document_sessions=2, keep_unbound_entries=80)
        session.commit()