Initial commit

2026-02-21 09:44:18 -03:00
commit 5dfc2cbd85
65 changed files with 11989 additions and 0 deletions
@@ -0,0 +1 @@
+"""API package containing route modules and router registration."""
@@ -0,0 +1,17 @@
+"""API router registration for all HTTP route modules."""
+
+from fastapi import APIRouter
+
+from app.api.routes_documents import router as documents_router
+from app.api.routes_health import router as health_router
+from app.api.routes_processing_logs import router as processing_logs_router
+from app.api.routes_search import router as search_router
+from app.api.routes_settings import router as settings_router
+
+
+# Aggregate router that combines every route module into a single object.
+api_router = APIRouter()
+# The health router is included without a prefix or tags at this level.
+api_router.include_router(health_router)
+# The remaining routers are each mounted under a URL prefix and tagged by feature area.
+api_router.include_router(documents_router, prefix="/documents", tags=["documents"])
+api_router.include_router(processing_logs_router, prefix="/processing/logs", tags=["processing-logs"])
+api_router.include_router(search_router, prefix="/search", tags=["search"])
+api_router.include_router(settings_router, prefix="/settings", tags=["settings"])
@@ -0,0 +1,725 @@
+"""Document CRUD, lifecycle, metadata, file access, and content export endpoints."""
+
+import io
+import logging
+import re
+import unicodedata
+import zipfile
+from collections import deque
+from datetime import datetime, time
+from pathlib import Path
+from typing import Annotated, Literal
+from uuid import UUID
+
+from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
+from fastapi.responses import FileResponse, Response, StreamingResponse
+from sqlalchemy import or_, func, select
+from sqlalchemy.orm import Session
+
+from app.services.app_settings import read_predefined_paths_settings, read_predefined_tags_settings
+from app.db.base import get_session
+from app.models.document import Document, DocumentStatus
+from app.schemas.documents import (
+    ContentExportRequest,
+    DocumentDetailResponse,
+    DocumentResponse,
+    DocumentsListResponse,
+    DocumentUpdateRequest,
+    UploadConflict,
+    UploadResponse,
+)
+from app.services.extractor import sniff_mime
+from app.services.handwriting_style import delete_many_handwriting_style_documents
+from app.services.processing_logs import log_processing_event, set_processing_log_autocommit
+from app.services.storage import absolute_path, compute_sha256, store_bytes
+from app.services.typesense_index import delete_many_documents_index, upsert_document_index
+from app.worker.queue import get_processing_queue
+
+
+# Router that collects the document endpoints declared in this module.
+router = APIRouter()
+
+
+def _parse_csv(value: str | None) -> list[str]:
+    """Splits a comma-separated query value into trimmed, non-empty items.
+
+    Returns an empty list when the value is ``None`` or empty. Order and
+    duplicates are preserved.
+    """
+
+    trimmed_pieces = (piece.strip() for piece in (value or "").split(","))
+    return [piece for piece in trimmed_pieces if piece]
+
+
+def _parse_date(value: str | None) -> datetime | None:
+    """Parses a date or datetime string, returning ``None`` when it cannot be read.
+
+    The value is first handed to ``datetime.fromisoformat`` and returned exactly
+    as parsed, including any time of day or UTC offset it carries. If that
+    fails, the value is parsed as ``%Y-%m-%d`` and returned as midnight of that
+    day. Empty or ``None`` input yields ``None``.
+    """
+
+    if not value:
+        return None
+
+    try:
+        return datetime.fromisoformat(value)
+    except ValueError:
+        # Not ISO formatted; fall through to the plain calendar-date format.
+        pass
+
+    try:
+        day = datetime.strptime(value, "%Y-%m-%d").date()
+    except ValueError:
+        return None
+    return datetime.combine(day, time.min)
+
+
+def _apply_discovery_filters(
+    statement,
+    *,
+    path_filter: str | None,
+    tag_filter: str | None,
+    type_filter: str | None,
+    processed_from: str | None,
+    processed_to: str | None,
+):
+    """Narrows a document statement by the optional discovery filters.
+
+    Each filter is skipped when its value is empty or cannot be parsed:
+
+    - ``path_filter``: case-insensitive prefix match on ``logical_path``.
+    - ``tag_filter``: comma-separated tags; matches documents whose tags overlap.
+    - ``type_filter``: comma-separated values matched case-insensitively against
+      extension, MIME type, or image text type.
+    - ``processed_from`` / ``processed_to``: inclusive bounds on ``processed_at``;
+      documents without a ``processed_at`` value never match a bound.
+
+    Returns the statement with the applicable ``WHERE`` clauses added.
+    """
+
+    prefix = path_filter.strip() if path_filter else ""
+    if prefix:
+        statement = statement.where(Document.logical_path.ilike(f"{prefix}%"))
+
+    wanted_tags = _parse_csv(tag_filter)
+    if wanted_tags:
+        statement = statement.where(Document.tags.overlap(wanted_tags))
+
+    wanted_types = [value.lower() for value in _parse_csv(type_filter)]
+    if wanted_types:
+        type_columns = (Document.extension, Document.mime_type, Document.image_text_type)
+        # A document matches when any requested value equals any of the three columns.
+        any_type_matches = [column.ilike(lowered) for lowered in wanted_types for column in type_columns]
+        statement = statement.where(or_(*any_type_matches))
+
+    lower_bound = _parse_date(processed_from)
+    if lower_bound is not None:
+        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at >= lower_bound)
+
+    upper_bound = _parse_date(processed_to)
+    if upper_bound is not None:
+        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at <= upper_bound)
+
+    return statement
+
+
+def _summary_for_index(document: Document) -> str:
+    """Resolves best-available summary text for semantic index updates outside worker pipeline.
+
+    Preference order:
+
+    1. ``metadata_json["summary_text"]`` when it is a non-blank string (trimmed).
+    2. The first 12000 characters of the trimmed extracted text.
+    3. Filename, MIME type, and logical path joined by newlines.
+
+    A missing (``None``) ``metadata_json`` or ``extracted_text`` is treated as
+    empty instead of raising, so the fallback in step 3 is still reached.
+    """
+
+    metadata = document.metadata_json or {}
+    candidate = metadata.get("summary_text")
+    if isinstance(candidate, str) and candidate.strip():
+        return candidate.strip()
+
+    extracted = (document.extracted_text or "").strip()
+    if extracted:
+        return extracted[:12000]
+
+    return f"{document.original_filename}\n{document.mime_type}\n{document.logical_path}"
+
+
+def _normalize_tags(raw_tags: str | None) -> list[str]:
+    """Turns a comma-separated tag string into trimmed, de-duplicated tags.
+
+    Blank entries are dropped, the first occurrence of each tag keeps its
+    position, and at most 50 tags are returned.
+    """
+
+    if not raw_tags:
+        return []
+
+    unique_tags: list[str] = []
+    seen: set[str] = set()
+    for piece in raw_tags.split(","):
+        tag = piece.strip()
+        if tag and tag not in seen:
+            seen.add(tag)
+            unique_tags.append(tag)
+    return unique_tags[:50]
+
+
+def _sanitize_filename(filename: str) -> str:
+    """Normalizes user-supplied filenames while preserving readability and extensions.
+
+    Path separators (``/`` and ``\\``) become spaces, runs of whitespace collapse
+    to a single space, and the result is trimmed and limited to 512 characters.
+    Returns ``"document"`` when nothing usable remains.
+    """
+
+    without_separators = filename.replace("\\", " ").replace("/", " ")
+    # Trim only after separators became spaces; otherwise names such as "/" or
+    # "dir/" keep a stray leading/trailing space (or are whitespace only).
+    collapsed = re.sub(r"\s+", " ", without_separators).strip()
+    # Cutting at 512 characters can expose a space at the new end, so trim again.
+    return collapsed[:512].rstrip() or "document"
+
+
+def _slugify_segment(value: str) -> str:
+    """Builds an ASCII slug usable as a path segment or markdown file name.
+
+    Accents are decomposed and non-ASCII characters dropped, characters other
+    than letters, digits, ``.``, ``_``, space, and ``-`` are removed, whitespace
+    runs become ``-``, and leading/trailing ``.``, ``-``, ``_`` are stripped. The
+    slug is limited to 120 characters; ``"document"`` is returned when it is empty.
+    """
+
+    decomposed = unicodedata.normalize("NFKD", value)
+    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
+    allowed_only = re.sub(r"[^a-zA-Z0-9._ -]+", "", ascii_only).strip()
+    slug = re.sub(r"\s+", "-", allowed_only).strip(".-_")[:120]
+    return slug if slug else "document"
+
+
+def _markdown_for_document(document: Document) -> str:
+    """Renders a document's metadata header and extracted text as markdown.
+
+    The output starts with the filename as a heading, lists id, logical path,
+    source path, and tags, then appends the extracted text (or a placeholder
+    when the text is blank). The result always ends with exactly one newline.
+    """
+
+    tag_summary = ", ".join(document.tags) if document.tags else "(none)"
+
+    if document.extracted_text.strip():
+        content = document.extracted_text
+    else:
+        content = "_No extracted text available for this document._"
+
+    sections = [
+        f"# {document.original_filename}",
+        "",
+        f"- Document ID: `{document.id}`",
+        f"- Logical Path: `{document.logical_path}`",
+        f"- Source Path: `{document.source_relative_path}`",
+        f"- Tags: {tag_summary}",
+        "",
+        "## Extracted Content",
+        "",
+        content,
+    ]
+    return "\n".join(sections).strip() + "\n"
+
+
+def _markdown_filename(document: Document) -> str:
+    """Derives the markdown export filename for one document.
+
+    The name is the slugified filename stem (or the whole filename when the
+    stem is empty), a dash, the first eight characters of the document id, and
+    the ``.md`` extension.
+    """
+
+    original_name = document.original_filename
+    stem = Path(original_name).stem or original_name
+    short_id = str(document.id)[:8]
+    return f"{_slugify_segment(stem)}-{short_id}.md"
+
+
+def _zip_entry_name(document: Document, used_names: set[str]) -> str:
+    """Picks a zip entry path for a document that is not yet in ``used_names``.
+
+    The entry is made of the slugified logical path segments followed by the
+    markdown filename. When that path is already taken, ``-1``, ``-2``, ... is
+    appended to the filename stem until a free path is found. The chosen entry
+    is added to ``used_names`` before it is returned.
+    """
+
+    directories = [_slugify_segment(part) for part in document.logical_path.split("/") if part]
+    filename = _markdown_filename(document)
+    name_parts = Path(filename)
+    stem, extension = name_parts.stem, name_parts.suffix
+
+    def entry_for(name: str) -> str:
+        # With no directories this yields the bare filename.
+        return "/".join([*directories, name])
+
+    entry = entry_for(filename)
+    counter = 1
+    while entry in used_names:
+        entry = entry_for(f"{stem}-{counter}{extension}")
+        counter += 1
+
+    used_names.add(entry)
+    return entry
+
+
+def _resolve_previous_status(metadata_json: dict, fallback_status: DocumentStatus) -> DocumentStatus:
+    """Determines which status a trashed document should return to.
+
+    Uses the ``status_before_trash`` metadata entry when it is a string naming a
+    valid status other than ``TRASHED``; otherwise returns ``fallback_status``.
+    """
+
+    recorded = metadata_json.get("status_before_trash")
+    if not isinstance(recorded, str):
+        return fallback_status
+
+    try:
+        previous = DocumentStatus(recorded)
+    except ValueError:
+        # The recorded value does not name a known status.
+        return fallback_status
+
+    return fallback_status if previous == DocumentStatus.TRASHED else previous
+
+
+def _build_document_list_statement(
+    only_trashed: bool,
+    include_trashed: bool,
+    path_prefix: str | None,
+):
+    """Creates the base document select with trash and path-prefix filtering.
+
+    ``only_trashed`` takes precedence and restricts results to trashed
+    documents; otherwise trashed documents are excluded unless
+    ``include_trashed`` is set. A non-blank ``path_prefix`` adds a
+    case-insensitive prefix match on ``logical_path``.
+    """
+
+    statement = select(Document)
+
+    if only_trashed:
+        statement = statement.where(Document.status == DocumentStatus.TRASHED)
+    elif not include_trashed:
+        statement = statement.where(Document.status != DocumentStatus.TRASHED)
+
+    prefix = path_prefix.strip() if path_prefix else ""
+    if prefix:
+        statement = statement.where(Document.logical_path.ilike(f"{prefix}%"))
+
+    return statement
+
+
+def _collect_document_tree(session: Session, root_document_id: UUID) -> list[tuple[int, Document]]:
+    """Collects a document and all descendants for recursive permanent deletion.
+
+    Walks the ``parent_document_id`` links breadth-first starting at
+    ``root_document_id``. Ids that no longer resolve to a row are skipped, and
+    every id is visited at most once.
+
+    Returns ``(depth, document)`` pairs ordered deepest first, with the root at
+    depth 0 coming last.
+    """
+
+    # A deque gives O(1) pops from the front; list.pop(0) shifts every element.
+    pending: deque[tuple[UUID, int]] = deque([(root_document_id, 0)])
+    visited: set[UUID] = set()
+    collected: list[tuple[int, Document]] = []
+
+    while pending:
+        current_id, depth = pending.popleft()
+        if current_id in visited:
+            continue
+        visited.add(current_id)
+
+        document = session.execute(select(Document).where(Document.id == current_id)).scalar_one_or_none()
+        if document is None:
+            continue
+
+        collected.append((depth, document))
+        child_ids = session.execute(
+            select(Document.id).where(Document.parent_document_id == current_id)
+        ).scalars().all()
+        pending.extend((child_id, depth + 1) for child_id in child_ids)
+
+    # Deepest entries first, so callers handle children before their parents.
+    collected.sort(key=lambda item: item[0], reverse=True)
+    return collected
+
+
+@router.get("", response_model=DocumentsListResponse)
+def list_documents(
+    offset: int = Query(default=0, ge=0),
+    limit: int = Query(default=50, ge=1, le=200),
+    include_trashed: bool = Query(default=False),
+    only_trashed: bool = Query(default=False),
+    path_prefix: str | None = Query(default=None),
+    path_filter: str | None = Query(default=None),
+    tag_filter: str | None = Query(default=None),
+    type_filter: str | None = Query(default=None),
+    processed_from: str | None = Query(default=None),
+    processed_to: str | None = Query(default=None),
+    session: Session = Depends(get_session),
+) -> DocumentsListResponse:
+    """Lists one page of documents, newest ``created_at`` first.
+
+    The trash flags and ``path_prefix`` shape the base statement; the remaining
+    filters are applied on top of it. ``total`` counts every document matching
+    the filters, independent of ``offset`` and ``limit``.
+    """
+
+    filtered = _apply_discovery_filters(
+        _build_document_list_statement(
+            only_trashed=only_trashed,
+            include_trashed=include_trashed,
+            path_prefix=path_prefix,
+        ),
+        path_filter=path_filter,
+        tag_filter=tag_filter,
+        type_filter=type_filter,
+        processed_from=processed_from,
+        processed_to=processed_to,
+    )
+
+    page_statement = filtered.order_by(Document.created_at.desc()).offset(offset).limit(limit)
+    page = session.execute(page_statement).scalars().all()
+
+    # Count over the unpaginated statement so the total ignores offset/limit.
+    total = session.execute(select(func.count()).select_from(filtered.subquery())).scalar_one()
+
+    return DocumentsListResponse(
+        total=total,
+        items=[DocumentResponse.model_validate(row) for row in page],
+    )
+
+
+@router.get("/tags")
+def list_tags(
+    include_trashed: bool = Query(default=False),
+    session: Session = Depends(get_session),
+) -> dict[str, list[str]]:
+    """Lists the sorted union of tags used on documents and predefined tags.
+
+    Tags of trashed documents are left out unless ``include_trashed`` is set.
+    Predefined tags come from the settings entries' ``value`` field, trimmed,
+    with blank values ignored.
+    """
+
+    statement = select(Document.tags)
+    if not include_trashed:
+        statement = statement.where(Document.status != DocumentStatus.TRASHED)
+
+    collected = set()
+    for tag_list in session.execute(statement).scalars().all():
+        collected.update(tag for tag in tag_list if tag)
+
+    for item in read_predefined_tags_settings():
+        predefined = str(item.get("value", "")).strip()
+        if predefined:
+            collected.add(predefined)
+
+    return {"tags": sorted(collected)}
+
+
+@router.get("/paths")
+def list_paths(
+    include_trashed: bool = Query(default=False),
+    session: Session = Depends(get_session),
+) -> dict[str, list[str]]:
+    """Lists the sorted union of logical paths used on documents and predefined paths.
+
+    Paths of trashed documents are left out unless ``include_trashed`` is set.
+    Predefined paths come from the settings entries' ``value`` field, trimmed,
+    with blank values ignored.
+    """
+
+    statement = select(Document.logical_path)
+    if not include_trashed:
+        statement = statement.where(Document.status != DocumentStatus.TRASHED)
+
+    collected = set()
+    for logical_path in session.execute(statement).scalars().all():
+        if logical_path:
+            collected.add(logical_path)
+
+    for item in read_predefined_paths_settings():
+        predefined = str(item.get("value", "")).strip()
+        if predefined:
+            collected.add(predefined)
+
+    return {"paths": sorted(collected)}
+
+
+@router.get("/types")
+def list_types(
+    include_trashed: bool = Query(default=False),
+    session: Session = Depends(get_session),
+) -> dict[str, list[str]]:
+    """Lists the sorted distinct type values found on documents.
+
+    Values are gathered from the extension, MIME type, and image text type
+    columns, trimmed and lower-cased; non-string and blank values are ignored.
+    Trashed documents are left out unless ``include_trashed`` is set.
+    """
+
+    statement = select(Document.extension, Document.mime_type, Document.image_text_type)
+    if not include_trashed:
+        statement = statement.where(Document.status != DocumentStatus.TRASHED)
+
+    normalized = {
+        str(candidate).strip().lower()
+        for extension, mime_type, image_text_type in session.execute(statement).all()
+        for candidate in (extension, mime_type, image_text_type)
+        if isinstance(candidate, str)
+    }
+    # Whitespace-only values collapse to the empty string; drop it.
+    normalized.discard("")
+    return {"types": sorted(normalized)}
+
+
+@router.post("/content-md/export")
+def export_contents_markdown(
+    payload: ContentExportRequest,
+    session: Session = Depends(get_session),
+) -> StreamingResponse:
+    """Bundles the markdown rendering of the selected documents into a ZIP download.
+
+    Documents are selected by ``document_ids``, by ``path_prefix``, or by both
+    (both conditions must then hold), subject to the payload's trash flags.
+
+    Raises:
+        HTTPException: 400 when neither selector is given; 404 when nothing matches.
+    """
+
+    prefix = payload.path_prefix.strip() if payload.path_prefix else ""
+    by_ids = len(payload.document_ids) > 0
+    by_prefix = bool(prefix)
+    if not (by_ids or by_prefix):
+        raise HTTPException(status_code=400, detail="Provide document_ids or path_prefix for export")
+
+    statement = select(Document)
+    if by_ids:
+        statement = statement.where(Document.id.in_(payload.document_ids))
+    if by_prefix:
+        statement = statement.where(Document.logical_path.ilike(f"{prefix}%"))
+    if payload.only_trashed:
+        statement = statement.where(Document.status == DocumentStatus.TRASHED)
+    elif not payload.include_trashed:
+        statement = statement.where(Document.status != DocumentStatus.TRASHED)
+
+    ordered = statement.order_by(Document.logical_path.asc(), Document.created_at.asc())
+    documents = session.execute(ordered).scalars().all()
+    if not documents:
+        raise HTTPException(status_code=404, detail="No matching documents found for export")
+
+    # The archive is assembled fully in memory before it is returned.
+    archive_buffer = io.BytesIO()
+    taken_entries: set[str] = set()
+    with zipfile.ZipFile(archive_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
+        for document in documents:
+            archive.writestr(_zip_entry_name(document, taken_entries), _markdown_for_document(document))
+    archive_buffer.seek(0)
+
+    return StreamingResponse(
+        archive_buffer,
+        media_type="application/zip",
+        headers={"Content-Disposition": 'attachment; filename="document-contents-md.zip"'},
+    )
+
+
+@router.get("/{document_id}", response_model=DocumentDetailResponse)
+def get_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentDetailResponse:
+    """Fetches the detail view of a single document.
+
+    Raises:
+        HTTPException: 404 when no document has the given id.
+    """
+
+    lookup = select(Document).where(Document.id == document_id)
+    document = session.execute(lookup).scalars().one_or_none()
+    if document is not None:
+        return DocumentDetailResponse.model_validate(document)
+    raise HTTPException(status_code=404, detail="Document not found")
+
+
+@router.get("/{document_id}/download")
+def download_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
+    """Downloads original document bytes for the requested document identifier.
+
+    The response carries the document's original filename and stored MIME type.
+
+    Raises:
+        HTTPException: 404 when the document does not exist or its stored file
+            is missing from storage.
+    """
+
+    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    file_path = absolute_path(document.stored_relative_path)
+    # Answer with 404 for a missing file, as the thumbnail endpoint does,
+    # instead of letting the file response fail while it is being sent.
+    if not file_path.is_file():
+        raise HTTPException(status_code=404, detail="Stored file not found")
+    return FileResponse(path=file_path, filename=document.original_filename, media_type=document.mime_type)
+
+
+@router.get("/{document_id}/preview")
+def preview_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
+    """Streams the original document inline when browser rendering is supported.
+
+    No download filename is set, so the file is served with its stored MIME
+    type only.
+
+    Raises:
+        HTTPException: 404 when the document does not exist or its stored file
+            is missing from storage.
+    """
+
+    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    original_path = absolute_path(document.stored_relative_path)
+    # Answer with 404 for a missing file, as the thumbnail endpoint does,
+    # instead of letting the file response fail while it is being sent.
+    if not original_path.is_file():
+        raise HTTPException(status_code=404, detail="Stored file not found")
+    return FileResponse(path=original_path, media_type=document.mime_type)
+
+
+@router.get("/{document_id}/thumbnail")
+def thumbnail_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
+    """Serves the thumbnail file referenced by the document's metadata.
+
+    The path is read from ``metadata_json["preview_relative_path"]``.
+
+    Raises:
+        HTTPException: 404 when the document is unknown, no thumbnail path is
+            recorded, or the recorded file does not exist.
+    """
+
+    lookup = select(Document).where(Document.id == document_id)
+    document = session.execute(lookup).scalars().one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    relative_preview = document.metadata_json.get("preview_relative_path")
+    if not relative_preview:
+        raise HTTPException(status_code=404, detail="Thumbnail not available")
+
+    thumbnail_file = absolute_path(relative_preview)
+    if thumbnail_file.exists():
+        return FileResponse(path=thumbnail_file)
+    raise HTTPException(status_code=404, detail="Thumbnail file not found")
+
+
+@router.get("/{document_id}/content-md")
+def download_document_content_markdown(document_id: UUID, session: Session = Depends(get_session)) -> Response:
+    """Returns one document's markdown rendering as a file attachment.
+
+    Raises:
+        HTTPException: 404 when no document has the given id.
+    """
+
+    lookup = select(Document).where(Document.id == document_id)
+    document = session.execute(lookup).scalars().one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    attachment_name = _markdown_filename(document)
+    return Response(
+        content=_markdown_for_document(document),
+        media_type="text/markdown; charset=utf-8",
+        headers={"Content-Disposition": f'attachment; filename="{attachment_name}"'},
+    )
+
+
+def _find_document_by_sha256(session: Session, sha256: str) -> Document | None:
+    """Returns the most recently created document with the given content hash, if any."""
+
+    # Uploads in "replace" or "duplicate" mode insert another row with the same
+    # hash, so more than one match is possible. Taking the first row avoids the
+    # MultipleResultsFound error that scalar_one_or_none() raises in that case;
+    # ordering by creation time makes the choice deterministic.
+    statement = (
+        select(Document)
+        .where(Document.sha256 == sha256)
+        .order_by(Document.created_at.desc())
+        .limit(1)
+    )
+    return session.execute(statement).scalars().first()
+
+
+@router.post("/upload", response_model=UploadResponse)
+async def upload_documents(
+    files: Annotated[list[UploadFile], File(description="Files to upload")],
+    relative_paths: Annotated[list[str] | None, Form()] = None,
+    logical_path: Annotated[str, Form()] = "Inbox",
+    tags: Annotated[str | None, Form()] = None,
+    conflict_mode: Annotated[Literal["ask", "replace", "duplicate"], Form()] = "ask",
+    session: Session = Depends(get_session),
+) -> UploadResponse:
+    """Uploads files, records metadata, and enqueues asynchronous extraction tasks.
+
+    Args:
+        files: Uploaded files; each is read fully into memory.
+        relative_paths: Optional source paths matched to ``files`` by position;
+            a file without an entry uses its filename.
+        logical_path: Logical path assigned to every created document.
+        tags: Comma-separated tags assigned to every created document.
+        conflict_mode: ``"ask"`` stores nothing and reports conflicts when any
+            file's hash already exists; ``"replace"`` stores the file and links
+            it to the existing document through ``replaces_document_id``;
+            ``"duplicate"`` stores the file without a link.
+        session: Database session for the request.
+
+    Returns:
+        The created documents, or only the detected conflicts in ``"ask"`` mode.
+    """
+
+    # presumably makes log_processing_event commit each entry on its own —
+    # confirm in app.services.processing_logs
+    set_processing_log_autocommit(session, True)
+    normalized_tags = _normalize_tags(tags)
+    queue = get_processing_queue()
+    uploaded: list[DocumentResponse] = []
+    conflicts: list[UploadConflict] = []
+
+    indexed_relative_paths = relative_paths or []
+    prepared_uploads: list[dict[str, object]] = []
+
+    # Pass 1: read and fingerprint every file, and detect conflicts, before
+    # anything is stored.
+    for idx, file in enumerate(files):
+        filename = file.filename or f"uploaded_{idx}"
+        data = await file.read()
+        sha256 = compute_sha256(data)
+        source_relative_path = indexed_relative_paths[idx] if idx < len(indexed_relative_paths) else filename
+        extension = Path(filename).suffix.lower()
+        detected_mime = sniff_mime(data)
+        log_processing_event(
+            session=session,
+            stage="upload",
+            event="Upload request received",
+            level="info",
+            document_filename=filename,
+            payload_json={
+                "source_relative_path": source_relative_path,
+                "logical_path": logical_path,
+                "tags": normalized_tags,
+                "mime_type": detected_mime,
+                "size_bytes": len(data),
+                "conflict_mode": conflict_mode,
+            },
+        )
+        prepared_uploads.append(
+            {
+                "filename": filename,
+                "data": data,
+                "sha256": sha256,
+                "source_relative_path": source_relative_path,
+                "extension": extension,
+                "mime_type": detected_mime,
+            }
+        )
+
+        # Conflicts are only reported in "ask" mode, so the lookup is skipped otherwise.
+        existing = _find_document_by_sha256(session, sha256) if conflict_mode == "ask" else None
+        if existing is not None:
+            log_processing_event(
+                session=session,
+                stage="upload",
+                event="Upload conflict detected",
+                level="warning",
+                document_id=existing.id,
+                document_filename=filename,
+                payload_json={
+                    "sha256": sha256,
+                    "existing_document_id": str(existing.id),
+                },
+            )
+            conflicts.append(
+                UploadConflict(
+                    original_filename=filename,
+                    sha256=sha256,
+                    existing_document_id=existing.id,
+                )
+            )
+
+    if conflicts and conflict_mode == "ask":
+        session.commit()
+        return UploadResponse(uploaded=[], conflicts=conflicts)
+
+    # Pass 2: store the bytes and create one document row per file.
+    queued_document_ids: list[str] = []
+    for prepared in prepared_uploads:
+        replaces_document_id = None
+        if conflict_mode == "replace":
+            existing = _find_document_by_sha256(session, str(prepared["sha256"]))
+            if existing is not None:
+                replaces_document_id = existing.id
+
+        stored_relative_path = store_bytes(str(prepared["filename"]), bytes(prepared["data"]))
+
+        document = Document(
+            original_filename=str(prepared["filename"]),
+            source_relative_path=str(prepared["source_relative_path"]),
+            stored_relative_path=stored_relative_path,
+            mime_type=str(prepared["mime_type"]),
+            extension=str(prepared["extension"]),
+            sha256=str(prepared["sha256"]),
+            size_bytes=len(bytes(prepared["data"])),
+            logical_path=logical_path,
+            tags=list(normalized_tags),
+            replaces_document_id=replaces_document_id,
+            metadata_json={"upload": "web"},
+        )
+        session.add(document)
+        # Flush so the document id is assigned before it is logged and queued.
+        session.flush()
+        queued_document_ids.append(str(document.id))
+
+        log_processing_event(
+            session=session,
+            stage="upload",
+            event="Document record created and queued",
+            level="info",
+            document=document,
+            payload_json={
+                "source_relative_path": document.source_relative_path,
+                "stored_relative_path": document.stored_relative_path,
+                "logical_path": document.logical_path,
+                "tags": list(document.tags),
+                "replaces_document_id": str(replaces_document_id) if replaces_document_id is not None else None,
+            },
+        )
+        uploaded.append(DocumentResponse.model_validate(document))
+
+    session.commit()
+
+    # Enqueue only after the commit (the same order reprocess_document uses), so
+    # a job is never handed out for a row that has not been committed yet.
+    for queued_id in queued_document_ids:
+        queue.enqueue("app.worker.tasks.process_document_task", queued_id)
+
+    return UploadResponse(uploaded=uploaded, conflicts=conflicts)
+
+
+@router.patch("/{document_id}", response_model=DocumentResponse)
+def update_document(
+    document_id: UUID,
+    payload: DocumentUpdateRequest,
+    session: Session = Depends(get_session),
+) -> DocumentResponse:
+    """Updates document metadata and refreshes semantic index representation.
+
+    Only fields present in the payload are changed: the filename is sanitized,
+    a blank logical path falls back to ``"Inbox"``, and tags are trimmed,
+    de-duplicated, and capped at 50. The index refresh is best effort; a failure
+    is logged and does not fail the request.
+
+    Raises:
+        HTTPException: 404 when no document has the given id.
+    """
+
+    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    if payload.original_filename is not None:
+        document.original_filename = _sanitize_filename(payload.original_filename)
+    if payload.logical_path is not None:
+        document.logical_path = payload.logical_path.strip() or "Inbox"
+    if payload.tags is not None:
+        document.tags = list(dict.fromkeys([tag.strip() for tag in payload.tags if tag.strip()]))[:50]
+
+    try:
+        upsert_document_index(document=document, summary_text=_summary_for_index(document))
+    except Exception:
+        # Best effort: keep the metadata update, but leave a trace of the failure.
+        logging.getLogger(__name__).warning(
+            "Search index update failed for document %s", document_id, exc_info=True
+        )
+
+    session.commit()
+    session.refresh(document)
+    return DocumentResponse.model_validate(document)
+
+
+@router.post("/{document_id}/trash", response_model=DocumentResponse)
+def trash_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
+    """Marks a document as trashed without deleting files from storage.
+
+    The current status is recorded under ``metadata_json["status_before_trash"]``
+    before the status changes. A document that is already trashed is returned
+    unchanged. The index refresh is best effort; a failure is logged and does
+    not fail the request.
+
+    Raises:
+        HTTPException: 404 when no document has the given id.
+    """
+
+    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    if document.status != DocumentStatus.TRASHED:
+        # Assign a new dict rather than mutating the existing one in place.
+        document.metadata_json = {
+            **document.metadata_json,
+            "status_before_trash": document.status.value,
+        }
+        document.status = DocumentStatus.TRASHED
+        try:
+            upsert_document_index(document=document, summary_text=_summary_for_index(document))
+        except Exception:
+            # Best effort: keep the status change, but leave a trace of the failure.
+            logging.getLogger(__name__).warning(
+                "Search index update failed for document %s", document_id, exc_info=True
+            )
+        session.commit()
+        session.refresh(document)
+
+    return DocumentResponse.model_validate(document)
+
+
+@router.post("/{document_id}/restore", response_model=DocumentResponse)
+def restore_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
+    """Restores a trashed document to its previous lifecycle status.
+
+    The status recorded at trash time is used when valid; otherwise the document
+    becomes ``PROCESSED`` if it has a ``processed_at`` value and ``QUEUED`` if
+    not. The ``status_before_trash`` metadata entry is removed. A document that
+    is not trashed is returned unchanged. The index refresh is best effort; a
+    failure is logged and does not fail the request.
+
+    Raises:
+        HTTPException: 404 when no document has the given id.
+    """
+
+    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    if document.status == DocumentStatus.TRASHED:
+        fallback = DocumentStatus.PROCESSED if document.processed_at else DocumentStatus.QUEUED
+        document.status = _resolve_previous_status(document.metadata_json, fallback)
+
+        # Assign a new dict rather than mutating the existing one in place.
+        metadata_json = dict(document.metadata_json)
+        metadata_json.pop("status_before_trash", None)
+        document.metadata_json = metadata_json
+
+        try:
+            upsert_document_index(document=document, summary_text=_summary_for_index(document))
+        except Exception:
+            # Best effort: keep the restore, but leave a trace of the failure.
+            logging.getLogger(__name__).warning(
+                "Search index update failed for document %s", document_id, exc_info=True
+            )
+        session.commit()
+        session.refresh(document)
+
+    return DocumentResponse.model_validate(document)
+
+
+@router.delete("/{document_id}")
+def delete_document(document_id: UUID, session: Session = Depends(get_session)) -> dict[str, int]:
+    """Permanently deletes a document and all descendant archive members including stored files.
+
+    Index and handwriting-style cleanup are best effort; failures are logged.
+    Database rows are deleted and committed first, and files are removed only
+    afterwards, so a failed commit cannot leave rows that point at files which
+    are already gone. A file that cannot be removed is logged and skipped.
+
+    Returns:
+        ``deleted_documents`` (number of rows removed) and ``deleted_files``
+        (number of stored original files removed; thumbnails are not counted).
+
+    Raises:
+        HTTPException: 404 when the document does not exist; 400 when it is not
+            in the trash.
+    """
+
+    root = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
+    if root is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+    if root.status != DocumentStatus.TRASHED:
+        raise HTTPException(status_code=400, detail="Move document to trash before permanent deletion")
+
+    log = logging.getLogger(__name__)
+    document_tree = _collect_document_tree(session=session, root_document_id=document_id)
+    document_ids = [document.id for _, document in document_tree]
+    try:
+        delete_many_documents_index([str(current_id) for current_id in document_ids])
+    except Exception:
+        log.warning("Search index cleanup failed while deleting document %s", document_id, exc_info=True)
+    try:
+        delete_many_handwriting_style_documents([str(current_id) for current_id in document_ids])
+    except Exception:
+        log.warning("Handwriting style cleanup failed while deleting document %s", document_id, exc_info=True)
+
+    # Resolve file paths while the rows are still loaded; the files themselves
+    # are removed only after the commit below has succeeded.
+    stored_paths = []
+    preview_paths = []
+    for _, document in document_tree:
+        stored_paths.append(absolute_path(document.stored_relative_path))
+        preview_relative_path = document.metadata_json.get("preview_relative_path")
+        if isinstance(preview_relative_path, str):
+            preview_paths.append(absolute_path(preview_relative_path))
+        session.delete(document)
+
+    session.commit()
+
+    def remove_file(path) -> bool:
+        """Removes ``path`` when it is a regular file; returns whether it was removed."""
+        try:
+            if path.is_file():
+                path.unlink(missing_ok=True)
+                return True
+        except OSError:
+            log.warning("Could not remove file %s", path, exc_info=True)
+        return False
+
+    deleted_files = sum(1 for stored_path in stored_paths if remove_file(stored_path))
+    for preview_path in preview_paths:
+        remove_file(preview_path)
+
+    return {"deleted_documents": len(document_tree), "deleted_files": deleted_files}
+
+
+@router.post("/{document_id}/reprocess", response_model=DocumentResponse)
+def reprocess_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
+    """Re-enqueues a document for extraction and suggestion processing.
+
+    The status is set to ``QUEUED`` and committed before the job is enqueued.
+    The index refresh is best effort; a failure is logged and does not fail the
+    request.
+
+    Raises:
+        HTTPException: 404 when the document does not exist; 400 when it is
+            trashed.
+    """
+
+    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
+    if document is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+    if document.status == DocumentStatus.TRASHED:
+        raise HTTPException(status_code=400, detail="Restore document before reprocessing")
+
+    queue = get_processing_queue()
+    document.status = DocumentStatus.QUEUED
+    try:
+        upsert_document_index(document=document, summary_text=_summary_for_index(document))
+    except Exception:
+        # Best effort: still requeue the document, but leave a trace of the failure.
+        logging.getLogger(__name__).warning(
+            "Search index update failed for document %s", document_id, exc_info=True
+        )
+    # Commit before enqueueing so the queued status is stored when the job is handed out.
+    session.commit()
+    queue.enqueue("app.worker.tasks.process_document_task", str(document.id))
+    session.refresh(document)
+    return DocumentResponse.model_validate(document)
@@ -0,0 +1,13 @@
+"""Health and readiness endpoints for orchestration and uptime checks."""
+
+from fastapi import APIRouter
+
+
+# Router for the health endpoint; the URL prefix and tag are set here on the router itself.
+router = APIRouter(prefix="/health", tags=["health"])
+
+
+@router.get("")
+def health() -> dict[str, str]:
+    """Returns a fixed ``ok`` status payload without checking any dependency."""
+
+    liveness = {"status": "ok"}
+    return liveness
@@ -0,0 +1,66 @@
+"""API endpoints for listing, trimming, and clearing processing pipeline event logs."""
+
+from uuid import UUID
+
+from fastapi import APIRouter, Depends, Query
+from sqlalchemy.orm import Session
+
+from app.db.base import get_session
+from app.schemas.processing_logs import ProcessingLogEntryResponse, ProcessingLogListResponse
+from app.services.processing_logs import (
+    cleanup_processing_logs,
+    clear_processing_logs,
+    count_processing_logs,
+    list_processing_logs,
+)
+
+
+# Router that collects the processing-log endpoints declared in this module.
+router = APIRouter()
+
+
+@router.get("", response_model=ProcessingLogListResponse)
+def get_processing_logs(
+    offset: int = Query(default=0, ge=0),
+    limit: int = Query(default=120, ge=1, le=400),
+    document_id: UUID | None = Query(default=None),
+    session: Session = Depends(get_session),
+) -> ProcessingLogListResponse:
+    """Lists one page of processing log entries together with the total count.
+
+    When ``document_id`` is given, it is passed to both the listing and the
+    count so they cover the same entries.
+    """
+
+    entries = list_processing_logs(
+        session=session,
+        limit=limit,
+        offset=offset,
+        document_id=document_id,
+    )
+    entry_count = count_processing_logs(session=session, document_id=document_id)
+    serialized = [ProcessingLogEntryResponse.model_validate(entry) for entry in entries]
+    return ProcessingLogListResponse(total=entry_count, items=serialized)
+
+
+@router.post("/trim")
+def trim_processing_logs(
+    keep_document_sessions: int = Query(default=2, ge=0, le=20),
+    keep_unbound_entries: int = Query(default=80, ge=0, le=400),
+    session: Session = Depends(get_session),
+) -> dict[str, int]:
+    """Runs the processing-log cleanup with the given retention limits and commits it.
+
+    Returns the result reported by ``cleanup_processing_logs`` unchanged.
+    """
+
+    retention = {
+        "keep_document_sessions": keep_document_sessions,
+        "keep_unbound_entries": keep_unbound_entries,
+    }
+    summary = cleanup_processing_logs(session=session, **retention)
+    session.commit()
+    return summary
+
+
+@router.post("/clear")
+def clear_all_processing_logs(session: Session = Depends(get_session)) -> dict[str, int]:
+    """Clears the processing logs and commits the change.
+
+    Returns the result reported by ``clear_processing_logs`` unchanged.
+    """
+
+    summary = clear_processing_logs(session=session)
+    session.commit()
+    return summary
@@ -0,0 +1,84 @@
+"""Search endpoints for full-text and metadata document discovery."""
+
+from fastapi import APIRouter, Depends, Query
+from sqlalchemy import Text, cast, func, select
+from sqlalchemy.orm import Session
+
+from app.api.routes_documents import _apply_discovery_filters
+from app.db.base import get_session
+from app.models.document import Document, DocumentStatus
+from app.schemas.documents import DocumentResponse, SearchResponse
+
+
+# Router collecting the document search endpoint declared below in this module.
+router = APIRouter()
+
+
+def _escape_like_term(term: str, escape_char: str = "/") -> str:
+    """Escapes LIKE wildcards in user input so ILIKE matches them as literal characters."""
+
+    # Double the escape character first so the escapes added afterwards are not escaped again.
+    escaped = term.replace(escape_char, escape_char * 2)
+    return escaped.replace("%", f"{escape_char}%").replace("_", f"{escape_char}_")
+
+
+@router.get("", response_model=SearchResponse)
+def search_documents(
+    query: str = Query(min_length=2),
+    offset: int = Query(default=0, ge=0),
+    limit: int = Query(default=50, ge=1, le=200),
+    include_trashed: bool = Query(default=False),
+    only_trashed: bool = Query(default=False),
+    path_filter: str | None = Query(default=None),
+    tag_filter: str | None = Query(default=None),
+    type_filter: str | None = Query(default=None),
+    processed_from: str | None = Query(default=None),
+    processed_to: str | None = Query(default=None),
+    session: Session = Depends(get_session),
+) -> SearchResponse:
+    """Searches documents using PostgreSQL full-text ranking plus metadata matching.
+
+    A document matches when the ``simple`` text-search vector built from its filename, logical
+    path, extracted text, and tags matches the query, or when the filename, logical path, or tags
+    contain the query as a case-insensitive literal substring (``%`` and ``_`` typed by the user
+    are escaped and therefore not treated as wildcards).
+
+    ``only_trashed`` takes precedence over ``include_trashed``. The trash and discovery filters
+    are applied through one shared helper to both the page query and the count query, so
+    ``total`` always describes the same result set that is being paginated. Results are ordered
+    by rank, then by newest ``created_at``.
+    """
+
+    vector = func.to_tsvector(
+        "simple",
+        func.coalesce(Document.original_filename, "")
+        + " "
+        + func.coalesce(Document.logical_path, "")
+        + " "
+        + func.coalesce(Document.extracted_text, "")
+        + " "
+        + func.coalesce(cast(Document.tags, Text), ""),
+    )
+    ts_query = func.plainto_tsquery("simple", query)
+    rank = func.ts_rank_cd(vector, ts_query)
+
+    # Escape user-supplied wildcards; the same escape character is declared on every ILIKE below.
+    like_escape = "/"
+    like_pattern = f"%{_escape_like_term(query, like_escape)}%"
+    search_filter = (
+        vector.op("@@")(ts_query)
+        | Document.original_filename.ilike(like_pattern, escape=like_escape)
+        | Document.logical_path.ilike(like_pattern, escape=like_escape)
+        | cast(Document.tags, Text).ilike(like_pattern, escape=like_escape)
+    )
+
+    def _restrict(statement):
+        """Applies the trash-state and discovery filters shared by the page and count queries."""
+
+        if only_trashed:
+            statement = statement.where(Document.status == DocumentStatus.TRASHED)
+        elif not include_trashed:
+            statement = statement.where(Document.status != DocumentStatus.TRASHED)
+        return _apply_discovery_filters(
+            statement,
+            path_filter=path_filter,
+            tag_filter=tag_filter,
+            type_filter=type_filter,
+            processed_from=processed_from,
+            processed_to=processed_to,
+        )
+
+    page_statement = (
+        _restrict(select(Document).where(search_filter))
+        .order_by(rank.desc(), Document.created_at.desc())
+        .offset(offset)
+        .limit(limit)
+    )
+    items = session.execute(page_statement).scalars().all()
+
+    count_statement = _restrict(select(func.count(Document.id)).where(search_filter))
+    total = session.execute(count_statement).scalar_one()
+
+    return SearchResponse(total=total, items=[DocumentResponse.model_validate(item) for item in items])
@@ -0,0 +1,232 @@
+"""API routes for managing persistent single-user application settings."""
+
+from fastapi import APIRouter
+
+from app.schemas.settings import (
+    AppSettingsUpdateRequest,
+    AppSettingsResponse,
+    DisplaySettingsResponse,
+    HandwritingSettingsResponse,
+    HandwritingStyleSettingsResponse,
+    HandwritingSettingsUpdateRequest,
+    OcrTaskSettingsResponse,
+    ProviderSettingsResponse,
+    RoutingTaskSettingsResponse,
+    SummaryTaskSettingsResponse,
+    TaskSettingsResponse,
+    UploadDefaultsResponse,
+)
+from app.services.app_settings import (
+    TASK_OCR_HANDWRITING,
+    TASK_ROUTING_CLASSIFICATION,
+    TASK_SUMMARY_GENERATION,
+    read_app_settings,
+    reset_app_settings,
+    update_app_settings,
+    update_handwriting_settings,
+)
+
+
+# Router collecting the application settings endpoints declared below in this module.
+router = APIRouter()
+
+
+def _build_response(payload: dict) -> AppSettingsResponse:
+    """Maps the internal settings dictionary onto the typed ``AppSettingsResponse`` tree.
+
+    Missing sections and keys fall back to the literal defaults written in this function. Values
+    are coerced with ``str``/``int``/``float``/``bool`` before being handed to the response models.
+    """
+
+    upload_cfg = payload.get("upload_defaults", {})
+    display_cfg = payload.get("display", {})
+    provider_rows = payload.get("providers", [])
+    task_cfg = payload.get("tasks", {})
+    style_cfg = payload.get("handwriting_style_clustering", {})
+    ocr_cfg = task_cfg.get(TASK_OCR_HANDWRITING, {})
+    summary_cfg = task_cfg.get(TASK_SUMMARY_GENERATION, {})
+    routing_cfg = task_cfg.get(TASK_ROUTING_CLASSIFICATION, {})
+
+    def predefined_entries(section: str) -> list[dict]:
+        """Keeps dict entries of ``section`` that carry a non-blank value, in their original order."""
+
+        kept = []
+        for entry in payload.get(section, []):
+            if not isinstance(entry, dict):
+                continue
+            value = str(entry.get("value", "")).strip()
+            if not value:
+                continue
+            kept.append({"value": value, "global_shared": bool(entry.get("global_shared", False))})
+        return kept
+
+    upload_defaults = UploadDefaultsResponse(
+        logical_path=str(upload_cfg.get("logical_path", "Inbox")),
+        # Only string tags that are non-empty after stripping survive.
+        tags=[
+            str(raw_tag).strip()
+            for raw_tag in upload_cfg.get("tags", [])
+            if isinstance(raw_tag, str) and raw_tag.strip()
+        ],
+    )
+
+    display = DisplaySettingsResponse(
+        cards_per_page=int(display_cfg.get("cards_per_page", 12)),
+        log_typing_animation_enabled=bool(display_cfg.get("log_typing_animation_enabled", True)),
+    )
+
+    handwriting_style = HandwritingStyleSettingsResponse(
+        enabled=bool(style_cfg.get("enabled", True)),
+        embed_model=str(style_cfg.get("embed_model", "ts/clip-vit-b-p32")),
+        neighbor_limit=int(style_cfg.get("neighbor_limit", 8)),
+        match_min_similarity=float(style_cfg.get("match_min_similarity", 0.86)),
+        bootstrap_match_min_similarity=float(style_cfg.get("bootstrap_match_min_similarity", 0.89)),
+        bootstrap_sample_size=int(style_cfg.get("bootstrap_sample_size", 3)),
+        image_max_side=int(style_cfg.get("image_max_side", 1024)),
+    )
+
+    predefined_paths = predefined_entries("predefined_paths")
+    predefined_tags = predefined_entries("predefined_tags")
+
+    providers = [
+        ProviderSettingsResponse(
+            id=str(row.get("id", "")),
+            label=str(row.get("label", "")),
+            provider_type=str(row.get("provider_type", "openai_compatible")),
+            base_url=str(row.get("base_url", "https://api.openai.com/v1")),
+            timeout_seconds=int(row.get("timeout_seconds", 45)),
+            api_key_set=bool(row.get("api_key_set", False)),
+            api_key_masked=str(row.get("api_key_masked", "")),
+        )
+        for row in provider_rows
+    ]
+
+    ocr_task = OcrTaskSettingsResponse(
+        enabled=bool(ocr_cfg.get("enabled", True)),
+        provider_id=str(ocr_cfg.get("provider_id", "openai-default")),
+        model=str(ocr_cfg.get("model", "gpt-4.1-mini")),
+        prompt=str(ocr_cfg.get("prompt", "")),
+    )
+
+    summary_task = SummaryTaskSettingsResponse(
+        enabled=bool(summary_cfg.get("enabled", True)),
+        provider_id=str(summary_cfg.get("provider_id", "openai-default")),
+        model=str(summary_cfg.get("model", "gpt-4.1-mini")),
+        prompt=str(summary_cfg.get("prompt", "")),
+        max_input_tokens=int(summary_cfg.get("max_input_tokens", 8000)),
+    )
+
+    routing_task = RoutingTaskSettingsResponse(
+        enabled=bool(routing_cfg.get("enabled", True)),
+        provider_id=str(routing_cfg.get("provider_id", "openai-default")),
+        model=str(routing_cfg.get("model", "gpt-4.1-mini")),
+        prompt=str(routing_cfg.get("prompt", "")),
+        neighbor_count=int(routing_cfg.get("neighbor_count", 8)),
+        neighbor_min_similarity=float(routing_cfg.get("neighbor_min_similarity", 0.84)),
+        auto_apply_confidence_threshold=float(routing_cfg.get("auto_apply_confidence_threshold", 0.78)),
+        auto_apply_neighbor_similarity_threshold=float(
+            routing_cfg.get("auto_apply_neighbor_similarity_threshold", 0.55)
+        ),
+        neighbor_path_override_enabled=bool(routing_cfg.get("neighbor_path_override_enabled", True)),
+        neighbor_path_override_min_similarity=float(
+            routing_cfg.get("neighbor_path_override_min_similarity", 0.86)
+        ),
+        neighbor_path_override_min_gap=float(routing_cfg.get("neighbor_path_override_min_gap", 0.04)),
+        neighbor_path_override_max_confidence=float(
+            routing_cfg.get("neighbor_path_override_max_confidence", 0.9)
+        ),
+    )
+
+    tasks = TaskSettingsResponse(
+        ocr_handwriting=ocr_task,
+        summary_generation=summary_task,
+        routing_classification=routing_task,
+    )
+
+    return AppSettingsResponse(
+        upload_defaults=upload_defaults,
+        display=display,
+        handwriting_style_clustering=handwriting_style,
+        predefined_paths=predefined_paths,
+        predefined_tags=predefined_tags,
+        providers=providers,
+        tasks=tasks,
+    )
+
+
+@router.get("", response_model=AppSettingsResponse)
+def get_app_settings() -> AppSettingsResponse:
+    """Reads the stored application settings and returns them as the typed response model."""
+
+    stored_settings = read_app_settings()
+    return _build_response(stored_settings)
+
+
+@router.patch("", response_model=AppSettingsResponse)
+def set_app_settings(payload: AppSettingsUpdateRequest) -> AppSettingsResponse:
+    """Applies a partial settings update and returns the configuration reported by the service.
+
+    Sections absent from the request are forwarded as ``None``. Present sections are dumped to
+    plain dictionaries with ``None`` fields omitted, except provider entries, which are dumped
+    in full.
+    """
+
+    def dump_section(section):
+        """Dumps one optional request section, dropping fields that are ``None``."""
+
+        if section is None:
+            return None
+        return section.model_dump(exclude_none=True)
+
+    def dump_entries(entries):
+        """Dumps an optional list of request models, dropping ``None`` fields in each entry."""
+
+        if entries is None:
+            return None
+        return [entry.model_dump(exclude_none=True) for entry in entries]
+
+    # Provider entries are dumped without exclude_none, unlike every other section.
+    if payload.providers is None:
+        provider_dumps = None
+    else:
+        provider_dumps = [entry.model_dump() for entry in payload.providers]
+
+    persisted = update_app_settings(
+        providers=provider_dumps,
+        tasks=dump_section(payload.tasks),
+        upload_defaults=dump_section(payload.upload_defaults),
+        display=dump_section(payload.display),
+        handwriting_style=dump_section(payload.handwriting_style_clustering),
+        predefined_paths=dump_entries(payload.predefined_paths),
+        predefined_tags=dump_entries(payload.predefined_tags),
+    )
+    return _build_response(persisted)
+
+
+@router.post("/reset", response_model=AppSettingsResponse)
+def reset_settings_to_defaults() -> AppSettingsResponse:
+    """Invokes ``reset_app_settings`` and returns the resulting configuration as a response model."""
+
+    defaults = reset_app_settings()
+    return _build_response(defaults)
+
+
+@router.patch("/handwriting", response_model=AppSettingsResponse)
+def set_handwriting_settings(payload: HandwritingSettingsUpdateRequest) -> AppSettingsResponse:
+    """Forwards the handwriting update fields to the settings service and returns the full configuration."""
+
+    # Each request attribute maps one-to-one onto a keyword argument of the same name.
+    forwarded_fields = (
+        "enabled",
+        "openai_base_url",
+        "openai_model",
+        "openai_timeout_seconds",
+        "openai_api_key",
+        "clear_openai_api_key",
+    )
+    changes = {field: getattr(payload, field) for field in forwarded_fields}
+    persisted = update_handwriting_settings(**changes)
+    return _build_response(persisted)
+
+
+@router.get("/handwriting", response_model=HandwritingSettingsResponse)
+def get_handwriting_settings() -> HandwritingSettingsResponse:
+    """Returns the legacy handwriting settings shape derived from the current app settings.
+
+    The provider is the one whose id matches the OCR task's ``provider_id``. If none matches,
+    the first configured provider is used, and if no providers exist a built-in OpenAI default
+    stands in.
+    """
+
+    settings = _build_response(read_app_settings())
+    default_provider = ProviderSettingsResponse(
+        id="openai-default",
+        label="OpenAI Default",
+        provider_type="openai_compatible",
+        base_url="https://api.openai.com/v1",
+        timeout_seconds=45,
+        api_key_set=False,
+        api_key_masked="",
+    )
+    ocr_task = settings.tasks.ocr_handwriting
+
+    for candidate in settings.providers:
+        if candidate.id == ocr_task.provider_id:
+            chosen = candidate
+            break
+    else:
+        # No provider matches the OCR task binding: fall back to the first one, then the default.
+        chosen = settings.providers[0] if settings.providers else default_provider
+
+    return HandwritingSettingsResponse(
+        provider=chosen.provider_type,
+        enabled=ocr_task.enabled,
+        openai_base_url=chosen.base_url,
+        openai_model=ocr_task.model,
+        openai_timeout_seconds=chosen.timeout_seconds,
+        openai_api_key_set=chosen.api_key_set,
+        openai_api_key_masked=chosen.api_key_masked,
+    )
				`@@ -0,0 +1 @@`
				`"""API package containing route modules and router registration."""`