Initial commit

This commit is contained in:
2026-02-21 09:44:18 -03:00
commit 5dfc2cbd85
65 changed files with 11989 additions and 0 deletions

View File

@@ -0,0 +1 @@
"""API package containing route modules and router registration."""

17
backend/app/api/router.py Normal file
View File

@@ -0,0 +1,17 @@
"""API router registration for all HTTP route modules."""
from fastapi import APIRouter
from app.api.routes_documents import router as documents_router
from app.api.routes_health import router as health_router
from app.api.routes_processing_logs import router as processing_logs_router
from app.api.routes_search import router as search_router
from app.api.routes_settings import router as settings_router
def _build_api_router() -> APIRouter:
    """Assemble the top-level router from every HTTP route module.

    The health router is included without extra arguments; each remaining
    module is included under its own URL prefix and OpenAPI tag.
    """
    root = APIRouter()
    root.include_router(health_router)
    mounts = (
        (documents_router, "/documents", "documents"),
        (processing_logs_router, "/processing/logs", "processing-logs"),
        (search_router, "/search", "search"),
        (settings_router, "/settings", "settings"),
    )
    for module_router, url_prefix, tag in mounts:
        root.include_router(module_router, prefix=url_prefix, tags=[tag])
    return root


api_router = _build_api_router()

View File

@@ -0,0 +1,725 @@
"""Document CRUD, lifecycle, metadata, file access, and content export endpoints."""
import io
import logging
import re
import unicodedata
import zipfile
from collections import deque
from datetime import datetime, time
from pathlib import Path
from typing import Annotated, Literal
from uuid import UUID

from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse, Response, StreamingResponse
from sqlalchemy import or_, func, select
from sqlalchemy.orm import Session

from app.db.base import get_session
from app.models.document import Document, DocumentStatus
from app.schemas.documents import (
    ContentExportRequest,
    DocumentDetailResponse,
    DocumentResponse,
    DocumentsListResponse,
    DocumentUpdateRequest,
    UploadConflict,
    UploadResponse,
)
from app.services.app_settings import read_predefined_paths_settings, read_predefined_tags_settings
from app.services.extractor import sniff_mime
from app.services.handwriting_style import delete_many_handwriting_style_documents
from app.services.processing_logs import log_processing_event, set_processing_log_autocommit
from app.services.storage import absolute_path, compute_sha256, store_bytes
from app.services.typesense_index import delete_many_documents_index, upsert_document_index
from app.worker.queue import get_processing_queue
router = APIRouter()
def _parse_csv(value: str | None) -> list[str]:
"""Parses comma-separated query values into a normalized non-empty list."""
if not value:
return []
return [part.strip() for part in value.split(",") if part.strip()]
def _parse_date(value: str | None) -> datetime | None:
"""Parses ISO date strings into UTC-naive midnight datetimes."""
if not value:
return None
try:
parsed = datetime.fromisoformat(value)
return parsed
except ValueError:
pass
try:
date_value = datetime.strptime(value, "%Y-%m-%d").date()
return datetime.combine(date_value, time.min)
except ValueError:
return None
def _apply_discovery_filters(
    statement,
    *,
    path_filter: str | None,
    tag_filter: str | None,
    type_filter: str | None,
    processed_from: str | None,
    processed_to: str | None,
):
    """Apply the optional path/tag/type/date filters to a list or search statement.

    Args:
        statement: SQLAlchemy statement to narrow.
        path_filter: Case-insensitive logical-path prefix, matched literally.
        tag_filter: Comma-separated tags, passed to ``Document.tags.overlap``.
        type_filter: Comma-separated values compared case-insensitively against
            extension, MIME type and image text type.
        processed_from: Inclusive lower bound for ``processed_at``.
        processed_to: Inclusive upper bound for ``processed_at``; a date-only
            value covers that whole day.

    Returns:
        The statement with every applicable filter attached. Blank filters and
        unparseable dates add no clause.
    """
    prefix = path_filter.strip() if path_filter else ""
    if prefix:
        # Escape LIKE metacharacters so "_" and "%" inside a path match literally
        escaped = prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        statement = statement.where(Document.logical_path.ilike(f"{escaped}%", escape="\\"))

    tags = _parse_csv(tag_filter)
    if tags:
        statement = statement.where(Document.tags.overlap(tags))

    types = _parse_csv(type_filter)
    if types:
        type_clauses = []
        for value in types:
            lowered = value.lower()
            type_clauses.append(Document.extension.ilike(lowered))
            type_clauses.append(Document.mime_type.ilike(lowered))
            type_clauses.append(Document.image_text_type.ilike(lowered))
        statement = statement.where(or_(*type_clauses))

    processed_from_dt = _parse_date(processed_from)
    if processed_from_dt is not None:
        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at >= processed_from_dt)

    processed_to_dt = _parse_date(processed_to)
    if processed_to_dt is not None:
        # A date-only bound (at most 10 characters, e.g. "2024-01-31") parses to
        # midnight, which would exclude everything processed during that day.
        # Extend it to the end of the day so the upper bound is inclusive.
        if processed_to is not None and len(processed_to) <= 10 and processed_to_dt.time() == time.min:
            processed_to_dt = datetime.combine(processed_to_dt.date(), time.max)
        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at <= processed_to_dt)
    return statement
def _summary_for_index(document: Document) -> str:
    """Pick the text sent to the semantic index when updating outside the worker pipeline.

    Preference order: a non-blank ``summary_text`` string from
    ``metadata_json``, then the first 12000 characters of the stripped
    extracted text, and finally a filename / MIME type / logical path fallback.
    """
    stored_summary = document.metadata_json.get("summary_text")
    if isinstance(stored_summary, str):
        stored_summary = stored_summary.strip()
        if stored_summary:
            return stored_summary
    body_text = document.extracted_text.strip()
    if not body_text:
        return f"{document.original_filename}\n{document.mime_type}\n{document.logical_path}"
    return body_text[:12000]
def _normalize_tags(raw_tags: str | None) -> list[str]:
"""Parses comma-separated tags into a cleaned unique list."""
if not raw_tags:
return []
tags = [tag.strip() for tag in raw_tags.split(",") if tag.strip()]
return list(dict.fromkeys(tags))[:50]
def _sanitize_filename(filename: str) -> str:
"""Normalizes user-supplied filenames while preserving readability and extensions."""
base = filename.strip().replace("\\", " ").replace("/", " ")
base = re.sub(r"\s+", " ", base)
return base[:512] or "document"
def _slugify_segment(value: str) -> str:
"""Creates a filesystem-safe slug for path segments and markdown file names."""
normalized = unicodedata.normalize("NFKD", value)
ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
cleaned = re.sub(r"[^a-zA-Z0-9._ -]+", "", ascii_text).strip()
compact = re.sub(r"\s+", "-", cleaned)
compact = compact.strip(".-_")
return compact[:120] or "document"
def _markdown_for_document(document: Document) -> str:
    """Render a document's identifying metadata and extracted text as markdown.

    The output is a title heading, a bullet list (ID, logical path, source
    path, tags) and an "Extracted Content" section. A placeholder line is
    used when the extracted text is blank. The result always ends with
    exactly one trailing newline.
    """
    tag_text = ", ".join(document.tags) if document.tags else "(none)"
    if document.extracted_text.strip():
        content = document.extracted_text
    else:
        content = "_No extracted text available for this document._"
    sections = (
        f"# {document.original_filename}",
        "",
        f"- Document ID: `{document.id}`",
        f"- Logical Path: `{document.logical_path}`",
        f"- Source Path: `{document.source_relative_path}`",
        f"- Tags: {tag_text}",
        "",
        "## Extracted Content",
        "",
        content,
    )
    return "\n".join(sections).strip() + "\n"
def _markdown_filename(document: Document) -> str:
    """Derive the markdown export filename for one document.

    The name is the slugified stem of the original filename followed by the
    first eight characters of the document ID and a ``.md`` suffix.
    """
    source_name = document.original_filename
    stem = Path(source_name).stem
    if not stem:
        # Fall back to the whole name when pathlib finds no stem
        stem = source_name
    short_id = str(document.id)[:8]
    return f"{_slugify_segment(stem)}-{short_id}.md"
def _zip_entry_name(document: Document, used_names: set[str]) -> str:
    """Choose a zip entry path for a document that is not yet in ``used_names``.

    The logical path is slugified segment by segment and used as the folder
    part. If the resulting entry is already taken, ``-1``, ``-2``, ... is
    appended to the file stem until it is free. The chosen entry is added to
    ``used_names`` before it is returned.
    """
    folders = [_slugify_segment(part) for part in document.logical_path.split("/") if part]
    filename = _markdown_filename(document)
    name_parts = Path(filename)

    def _in_folder(leaf: str) -> str:
        # With no folders this is just the leaf itself
        return "/".join([*folders, leaf])

    entry = _in_folder(filename)
    counter = 1
    while entry in used_names:
        entry = _in_folder(f"{name_parts.stem}-{counter}{name_parts.suffix}")
        counter += 1
    used_names.add(entry)
    return entry
def _resolve_previous_status(metadata_json: dict, fallback_status: DocumentStatus) -> DocumentStatus:
    """Work out which status a trashed document should return to.

    Uses the ``status_before_trash`` metadata entry when it is a string naming
    a valid status other than ``TRASHED``; otherwise returns ``fallback_status``.
    """
    recorded = metadata_json.get("status_before_trash")
    if not isinstance(recorded, str):
        return fallback_status
    try:
        previous = DocumentStatus(recorded)
    except ValueError:
        return fallback_status
    if previous == DocumentStatus.TRASHED:
        return fallback_status
    return previous
def _build_document_list_statement(
    only_trashed: bool,
    include_trashed: bool,
    path_prefix: str | None,
):
    """Build the base ``select(Document)`` with lifecycle and path-prefix filters.

    Args:
        only_trashed: Restrict to trashed documents; takes precedence over
            ``include_trashed``.
        include_trashed: When false (and ``only_trashed`` is false), trashed
            documents are excluded.
        path_prefix: Optional case-insensitive logical-path prefix, matched
            literally. Blank values are ignored.

    Returns:
        The filtered SQLAlchemy select statement.
    """
    statement = select(Document)
    if only_trashed:
        statement = statement.where(Document.status == DocumentStatus.TRASHED)
    elif not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)

    prefix = path_prefix.strip() if path_prefix else ""
    if prefix:
        # Escape LIKE metacharacters so "_" and "%" inside a path match literally
        escaped = prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        statement = statement.where(Document.logical_path.ilike(f"{escaped}%", escape="\\"))
    return statement
def _collect_document_tree(session: Session, root_document_id: UUID) -> list[tuple[int, Document]]:
    """Collect a document and all of its descendants for recursive deletion.

    Walks ``parent_document_id`` links breadth-first starting at the root.
    IDs that no longer resolve to a row are skipped, and each ID is visited
    at most once.

    Args:
        session: Database session used for the lookups.
        root_document_id: ID of the document at the top of the tree.

    Returns:
        ``(depth, document)`` pairs ordered deepest first, so children can be
        deleted before their parents.
    """
    # deque gives O(1) pops from the left; list.pop(0) shifts the whole list
    pending: deque[tuple[UUID, int]] = deque([(root_document_id, 0)])
    visited: set[UUID] = set()
    collected: list[tuple[int, Document]] = []
    while pending:
        current_id, depth = pending.popleft()
        if current_id in visited:
            continue
        visited.add(current_id)
        document = session.execute(select(Document).where(Document.id == current_id)).scalar_one_or_none()
        if document is None:
            continue
        collected.append((depth, document))
        child_ids = session.execute(
            select(Document.id).where(Document.parent_document_id == current_id)
        ).scalars().all()
        pending.extend((child_id, depth + 1) for child_id in child_ids)
    collected.sort(key=lambda item: item[0], reverse=True)
    return collected
@router.get("", response_model=DocumentsListResponse)
def list_documents(
    offset: int = Query(default=0, ge=0),
    limit: int = Query(default=50, ge=1, le=200),
    include_trashed: bool = Query(default=False),
    only_trashed: bool = Query(default=False),
    path_prefix: str | None = Query(default=None),
    path_filter: str | None = Query(default=None),
    tag_filter: str | None = Query(default=None),
    type_filter: str | None = Query(default=None),
    processed_from: str | None = Query(default=None),
    processed_to: str | None = Query(default=None),
    session: Session = Depends(get_session),
) -> DocumentsListResponse:
    """Return one page of documents, newest ``created_at`` first, plus the total match count.

    Lifecycle and path-prefix filters are applied first, then the shared
    discovery filters. The total is counted over the same filtered statement
    without pagination.
    """
    filtered = _apply_discovery_filters(
        _build_document_list_statement(
            only_trashed=only_trashed,
            include_trashed=include_trashed,
            path_prefix=path_prefix,
        ),
        path_filter=path_filter,
        tag_filter=tag_filter,
        type_filter=type_filter,
        processed_from=processed_from,
        processed_to=processed_to,
    )
    page_statement = filtered.order_by(Document.created_at.desc()).offset(offset).limit(limit)
    page = session.execute(page_statement).scalars().all()
    total = session.execute(select(func.count()).select_from(filtered.subquery())).scalar_one()
    return DocumentsListResponse(
        total=total,
        items=[DocumentResponse.model_validate(row) for row in page],
    )
@router.get("/tags")
def list_tags(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Return the sorted set of tags found on matching documents plus predefined tags.

    Trashed documents are skipped unless ``include_trashed`` is set. Predefined
    tags come from the application settings and are always included.
    """
    query = select(Document.tags)
    if not include_trashed:
        query = query.where(Document.status != DocumentStatus.TRASHED)

    collected: set[str] = set()
    for document_tags in session.execute(query).scalars().all():
        collected.update(tag for tag in document_tags if tag)
    for entry in read_predefined_tags_settings():
        predefined = str(entry.get("value", "")).strip()
        if predefined:
            collected.add(predefined)
    return {"tags": sorted(collected)}
@router.get("/paths")
def list_paths(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Return the sorted set of logical paths on matching documents plus predefined paths.

    Trashed documents are skipped unless ``include_trashed`` is set. Predefined
    paths come from the application settings and are always included.
    """
    query = select(Document.logical_path)
    if not include_trashed:
        query = query.where(Document.status != DocumentStatus.TRASHED)

    collected: set[str] = set()
    for logical_path in session.execute(query).scalars().all():
        if logical_path:
            collected.add(logical_path)
    for entry in read_predefined_paths_settings():
        predefined = str(entry.get("value", "")).strip()
        if predefined:
            collected.add(predefined)
    return {"paths": sorted(collected)}
@router.get("/types")
def list_types(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Return the sorted, lower-cased set of extension, MIME and image-text-type values.

    Non-string and blank column values are ignored. Trashed documents are
    skipped unless ``include_trashed`` is set.
    """
    query = select(Document.extension, Document.mime_type, Document.image_text_type)
    if not include_trashed:
        query = query.where(Document.status != DocumentStatus.TRASHED)

    distinct: set[str] = set()
    for row in session.execute(query).all():
        for raw in row:
            if not isinstance(raw, str):
                continue
            cleaned = raw.strip().lower()
            if cleaned:
                distinct.add(cleaned)
    return {"types": sorted(distinct)}
@router.post("/content-md/export")
def export_contents_markdown(
    payload: ContentExportRequest,
    session: Session = Depends(get_session),
) -> StreamingResponse:
    """Export extracted contents of selected documents as markdown files in a ZIP archive.

    Documents are selected by ``document_ids``, by ``path_prefix`` (a
    case-insensitive, literally matched prefix), or both; trashed documents
    follow ``only_trashed`` / ``include_trashed``. The archive is built in
    memory with one uniquely named markdown entry per document.

    Raises:
        HTTPException: 400 when neither selector is provided, 404 when no
            document matches.
    """
    has_document_ids = len(payload.document_ids) > 0
    prefix = payload.path_prefix.strip() if payload.path_prefix else ""
    if not has_document_ids and not prefix:
        raise HTTPException(status_code=400, detail="Provide document_ids or path_prefix for export")

    statement = select(Document)
    if has_document_ids:
        statement = statement.where(Document.id.in_(payload.document_ids))
    if prefix:
        # Escape LIKE metacharacters so "_" and "%" inside a path match literally
        # instead of widening the export to unrelated folders.
        escaped = prefix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        statement = statement.where(Document.logical_path.ilike(f"{escaped}%", escape="\\"))
    if payload.only_trashed:
        statement = statement.where(Document.status == DocumentStatus.TRASHED)
    elif not payload.include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)

    documents = session.execute(
        statement.order_by(Document.logical_path.asc(), Document.created_at.asc())
    ).scalars().all()
    if not documents:
        raise HTTPException(status_code=404, detail="No matching documents found for export")

    archive_buffer = io.BytesIO()
    used_entries: set[str] = set()
    with zipfile.ZipFile(archive_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for document in documents:
            entry_name = _zip_entry_name(document, used_entries)
            archive.writestr(entry_name, _markdown_for_document(document))
    # Rewind so the response streams the archive from its first byte
    archive_buffer.seek(0)
    headers = {"Content-Disposition": 'attachment; filename="document-contents-md.zip"'}
    return StreamingResponse(archive_buffer, media_type="application/zip", headers=headers)
@router.get("/{document_id}", response_model=DocumentDetailResponse)
def get_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentDetailResponse:
    """Fetch a single document by ID, answering 404 when it does not exist."""
    lookup = select(Document).where(Document.id == document_id)
    found = session.execute(lookup).scalar_one_or_none()
    if found is not None:
        return DocumentDetailResponse.model_validate(found)
    raise HTTPException(status_code=404, detail="Document not found")
@router.get("/{document_id}/download")
def download_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Download the original stored bytes of a document under its original filename.

    Raises:
        HTTPException: 404 when the document row does not exist or its stored
            file is missing on disk.
    """
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    file_path = absolute_path(document.stored_relative_path)
    # Without this check a missing file only fails once the response is being
    # sent, which surfaces as a server error; answer with a clean 404 instead,
    # as the thumbnail endpoint does.
    if not file_path.is_file():
        raise HTTPException(status_code=404, detail="Document file not found")
    return FileResponse(path=file_path, filename=document.original_filename, media_type=document.mime_type)
@router.get("/{document_id}/preview")
def preview_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Serve the original document with its stored MIME type and no download filename.

    Raises:
        HTTPException: 404 when the document row does not exist or its stored
            file is missing on disk.
    """
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    original_path = absolute_path(document.stored_relative_path)
    # Without this check a missing file only fails once the response is being
    # sent, which surfaces as a server error; answer with a clean 404 instead,
    # as the thumbnail endpoint does.
    if not original_path.is_file():
        raise HTTPException(status_code=404, detail="Document file not found")
    return FileResponse(path=original_path, media_type=document.mime_type)
@router.get("/{document_id}/thumbnail")
def thumbnail_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Serve the thumbnail recorded under ``preview_relative_path`` in the document metadata.

    Answers 404 when the document is unknown, when no thumbnail path is
    recorded, or when the recorded file does not exist.
    """
    lookup = select(Document).where(Document.id == document_id)
    record = session.execute(lookup).scalar_one_or_none()
    if record is None:
        raise HTTPException(status_code=404, detail="Document not found")

    relative = record.metadata_json.get("preview_relative_path")
    if not relative:
        raise HTTPException(status_code=404, detail="Thumbnail not available")

    thumbnail_file = absolute_path(relative)
    if thumbnail_file.exists():
        return FileResponse(path=thumbnail_file)
    raise HTTPException(status_code=404, detail="Thumbnail file not found")
@router.get("/{document_id}/content-md")
def download_document_content_markdown(document_id: UUID, session: Session = Depends(get_session)) -> Response:
    """Return one document's extracted content as a markdown attachment (404 if unknown)."""
    lookup = select(Document).where(Document.id == document_id)
    record = session.execute(lookup).scalar_one_or_none()
    if record is None:
        raise HTTPException(status_code=404, detail="Document not found")

    body = _markdown_for_document(record)
    attachment_name = _markdown_filename(record)
    return Response(
        content=body,
        media_type="text/markdown; charset=utf-8",
        headers={"Content-Disposition": f'attachment; filename="{attachment_name}"'},
    )
def _find_document_by_sha256(session: Session, sha256: str) -> Document | None:
    """Return the most recently created document stored with this content hash, if any.

    "replace" and "duplicate" uploads insert another row with the same hash,
    so the lookup must tolerate more than one match instead of raising.
    """
    statement = (
        select(Document)
        .where(Document.sha256 == sha256)
        .order_by(Document.created_at.desc())
        .limit(1)
    )
    return session.execute(statement).scalars().first()


@router.post("/upload", response_model=UploadResponse)
async def upload_documents(
    files: Annotated[list[UploadFile], File(description="Files to upload")],
    relative_paths: Annotated[list[str] | None, Form()] = None,
    logical_path: Annotated[str, Form()] = "Inbox",
    tags: Annotated[str | None, Form()] = None,
    conflict_mode: Annotated[Literal["ask", "replace", "duplicate"], Form()] = "ask",
    session: Session = Depends(get_session),
) -> UploadResponse:
    """Upload files, record their metadata, and enqueue asynchronous processing.

    Args:
        files: Uploaded files.
        relative_paths: Optional source paths, matched to ``files`` by index;
            the filename is used when an index has no entry.
        logical_path: Logical folder assigned to every uploaded document.
        tags: Comma-separated tags assigned to every uploaded document.
        conflict_mode: ``"ask"`` stores nothing and reports conflicts when any
            file's hash already exists; ``"replace"`` links the new document
            to the existing one via ``replaces_document_id``; ``"duplicate"``
            stores the file without a link.
        session: Database session.

    Returns:
        The created documents, or only the detected conflicts in ``"ask"`` mode.
    """
    set_processing_log_autocommit(session, True)
    normalized_tags = _normalize_tags(tags)
    queue = get_processing_queue()
    uploaded: list[DocumentResponse] = []
    conflicts: list[UploadConflict] = []
    indexed_relative_paths = relative_paths or []
    # (filename, data, sha256, source_relative_path, extension, mime_type)
    prepared_uploads: list[tuple[str, bytes, str, str, str, str]] = []

    # Pass 1: read every file, log the request, and detect hash conflicts
    for idx, file in enumerate(files):
        filename = file.filename or f"uploaded_{idx}"
        data = await file.read()
        sha256 = compute_sha256(data)
        source_relative_path = indexed_relative_paths[idx] if idx < len(indexed_relative_paths) else filename
        extension = Path(filename).suffix.lower()
        detected_mime = sniff_mime(data)
        log_processing_event(
            session=session,
            stage="upload",
            event="Upload request received",
            level="info",
            document_filename=filename,
            payload_json={
                "source_relative_path": source_relative_path,
                "logical_path": logical_path,
                "tags": normalized_tags,
                "mime_type": detected_mime,
                "size_bytes": len(data),
                "conflict_mode": conflict_mode,
            },
        )
        prepared_uploads.append((filename, data, sha256, source_relative_path, extension, detected_mime))

        # Conflicts are only reported in "ask" mode, so skip the lookup otherwise
        if conflict_mode != "ask":
            continue
        existing = _find_document_by_sha256(session, sha256)
        if existing is None:
            continue
        log_processing_event(
            session=session,
            stage="upload",
            event="Upload conflict detected",
            level="warning",
            document_id=existing.id,
            document_filename=filename,
            payload_json={
                "sha256": sha256,
                "existing_document_id": str(existing.id),
            },
        )
        conflicts.append(
            UploadConflict(
                original_filename=filename,
                sha256=sha256,
                existing_document_id=existing.id,
            )
        )

    if conflicts:
        session.commit()
        return UploadResponse(uploaded=[], conflicts=conflicts)

    # Pass 2: store bytes, create rows, and queue processing
    for filename, data, sha256, source_relative_path, extension, detected_mime in prepared_uploads:
        replaces_document_id = None
        if conflict_mode == "replace":
            existing = _find_document_by_sha256(session, str(sha256))
            if existing is not None:
                replaces_document_id = existing.id
        stored_relative_path = store_bytes(filename, data)
        document = Document(
            original_filename=filename,
            source_relative_path=str(source_relative_path),
            stored_relative_path=stored_relative_path,
            mime_type=str(detected_mime),
            extension=extension,
            sha256=str(sha256),
            size_bytes=len(data),
            logical_path=logical_path,
            tags=list(normalized_tags),
            replaces_document_id=replaces_document_id,
            metadata_json={"upload": "web"},
        )
        session.add(document)
        # Commit before enqueueing so a worker that picks the job up
        # immediately can always load the row (reprocess_document uses the
        # same commit-then-enqueue order).
        session.commit()
        queue.enqueue("app.worker.tasks.process_document_task", str(document.id))
        log_processing_event(
            session=session,
            stage="upload",
            event="Document record created and queued",
            level="info",
            document=document,
            payload_json={
                "source_relative_path": document.source_relative_path,
                "stored_relative_path": document.stored_relative_path,
                "logical_path": document.logical_path,
                "tags": list(document.tags),
                "replaces_document_id": str(replaces_document_id) if replaces_document_id is not None else None,
            },
        )
        uploaded.append(DocumentResponse.model_validate(document))
    session.commit()
    return UploadResponse(uploaded=uploaded, conflicts=conflicts)
@router.patch("/{document_id}", response_model=DocumentResponse)
def update_document(
    document_id: UUID,
    payload: DocumentUpdateRequest,
    session: Session = Depends(get_session),
) -> DocumentResponse:
    """Update a document's filename, logical path and/or tags, then refresh the search index.

    Only fields present in the payload are changed. A blank logical path
    falls back to ``"Inbox"``; tags are trimmed, de-duplicated in order and
    capped at 50.

    Raises:
        HTTPException: 404 when the document does not exist.
    """
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if payload.original_filename is not None:
        document.original_filename = _sanitize_filename(payload.original_filename)
    if payload.logical_path is not None:
        document.logical_path = payload.logical_path.strip() or "Inbox"
    if payload.tags is not None:
        document.tags = list(dict.fromkeys([tag.strip() for tag in payload.tags if tag.strip()]))[:50]
    try:
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        # Best-effort: an indexing failure must not block the metadata update,
        # but it should be visible in the logs rather than vanish silently.
        logging.getLogger(__name__).warning(
            "Search index update failed for document %s", document.id, exc_info=True
        )
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)
@router.post("/{document_id}/trash", response_model=DocumentResponse)
def trash_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Mark a document as trashed without touching its stored files.

    The status held before trashing is recorded under ``status_before_trash``
    in the metadata so it can be restored later. Trashing an already trashed
    document changes nothing.

    Raises:
        HTTPException: 404 when the document does not exist.
    """
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status != DocumentStatus.TRASHED:
        # Reassign a new dict rather than mutating the existing one in place
        document.metadata_json = {
            **document.metadata_json,
            "status_before_trash": document.status.value,
        }
        document.status = DocumentStatus.TRASHED
    try:
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        # Best-effort: an indexing failure must not block trashing, but it
        # should be visible in the logs rather than vanish silently.
        logging.getLogger(__name__).warning(
            "Search index update failed for document %s", document.id, exc_info=True
        )
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)
@router.post("/{document_id}/restore", response_model=DocumentResponse)
def restore_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Restore a trashed document to the status it had before trashing.

    When no usable previous status is recorded, the document becomes
    ``PROCESSED`` if it has a ``processed_at`` timestamp and ``QUEUED``
    otherwise. Restoring a document that is not trashed changes nothing.

    Raises:
        HTTPException: 404 when the document does not exist.
    """
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status == DocumentStatus.TRASHED:
        fallback = DocumentStatus.PROCESSED if document.processed_at else DocumentStatus.QUEUED
        document.status = _resolve_previous_status(document.metadata_json, fallback)
        # Drop the bookkeeping key on a copy and reassign the metadata
        metadata_json = dict(document.metadata_json)
        metadata_json.pop("status_before_trash", None)
        document.metadata_json = metadata_json
    try:
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        # Best-effort: an indexing failure must not block the restore, but it
        # should be visible in the logs rather than vanish silently.
        logging.getLogger(__name__).warning(
            "Search index update failed for document %s", document.id, exc_info=True
        )
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)
@router.delete("/{document_id}")
def delete_document(document_id: UUID, session: Session = Depends(get_session)) -> dict[str, int]:
    """Permanently delete a trashed document, its descendants, and their stored files.

    Search-index and handwriting-style entries are removed on a best-effort
    basis; failures there are logged and do not stop the deletion.

    Returns:
        ``deleted_documents`` (rows removed) and ``deleted_files`` (original
        files removed from storage; preview files are not counted).

    Raises:
        HTTPException: 404 when the document does not exist, 400 when it has
            not been moved to trash first.
    """
    root = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if root is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if root.status != DocumentStatus.TRASHED:
        raise HTTPException(status_code=400, detail="Move document to trash before permanent deletion")

    document_tree = _collect_document_tree(session=session, root_document_id=document_id)
    index_ids = [str(document.id) for _, document in document_tree]
    log = logging.getLogger(__name__)
    try:
        delete_many_documents_index(index_ids)
    except Exception:
        log.warning("Search index cleanup failed while deleting document %s", document_id, exc_info=True)
    try:
        delete_many_handwriting_style_documents(index_ids)
    except Exception:
        log.warning("Handwriting style cleanup failed while deleting document %s", document_id, exc_info=True)

    deleted_files = 0
    for _, document in document_tree:
        source_path = absolute_path(document.stored_relative_path)
        # is_file() is already false for missing paths
        if source_path.is_file():
            source_path.unlink(missing_ok=True)
            deleted_files += 1
        preview_relative_path = document.metadata_json.get("preview_relative_path")
        if isinstance(preview_relative_path, str):
            preview_path = absolute_path(preview_relative_path)
            if preview_path.is_file():
                preview_path.unlink(missing_ok=True)
        session.delete(document)
    session.commit()
    return {"deleted_documents": len(document_tree), "deleted_files": deleted_files}
@router.post("/{document_id}/reprocess", response_model=DocumentResponse)
def reprocess_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Set a document back to ``QUEUED`` and enqueue it for processing again.

    The status change is committed before the job is enqueued.

    Raises:
        HTTPException: 404 when the document does not exist, 400 when it is
            trashed and must be restored first.
    """
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status == DocumentStatus.TRASHED:
        raise HTTPException(status_code=400, detail="Restore document before reprocessing")
    queue = get_processing_queue()
    document.status = DocumentStatus.QUEUED
    try:
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        # Best-effort: an indexing failure must not block reprocessing, but it
        # should be visible in the logs rather than vanish silently.
        logging.getLogger(__name__).warning(
            "Search index update failed for document %s", document.id, exc_info=True
        )
    session.commit()
    queue.enqueue("app.worker.tasks.process_document_task", str(document.id))
    session.refresh(document)
    return DocumentResponse.model_validate(document)

View File

@@ -0,0 +1,13 @@
"""Health and readiness endpoints for orchestration and uptime checks."""
from fastapi import APIRouter
router = APIRouter(prefix="/health", tags=["health"])
@router.get("")
def health() -> dict[str, str]:
    """Report that the service is up by returning a fixed ``status`` of ``ok``."""
    liveness = {"status": "ok"}
    return liveness

View File

@@ -0,0 +1,66 @@
"""Read-only API endpoints for processing pipeline event logs."""
from uuid import UUID
from fastapi import APIRouter, Depends, Query
from sqlalchemy.orm import Session
from app.db.base import get_session
from app.schemas.processing_logs import ProcessingLogEntryResponse, ProcessingLogListResponse
from app.services.processing_logs import (
cleanup_processing_logs,
clear_processing_logs,
count_processing_logs,
list_processing_logs,
)
router = APIRouter()
@router.get("", response_model=ProcessingLogListResponse)
def get_processing_logs(
    offset: int = Query(default=0, ge=0),
    limit: int = Query(default=120, ge=1, le=400),
    document_id: UUID | None = Query(default=None),
    session: Session = Depends(get_session),
) -> ProcessingLogListResponse:
    """Return one page of processing logs plus the total count.

    Both the page and the total are optionally restricted to a single
    document via ``document_id``.
    """
    entries = list_processing_logs(session=session, limit=limit, offset=offset, document_id=document_id)
    total = count_processing_logs(session=session, document_id=document_id)
    serialized = [ProcessingLogEntryResponse.model_validate(entry) for entry in entries]
    return ProcessingLogListResponse(total=total, items=serialized)
@router.post("/trim")
def trim_processing_logs(
    keep_document_sessions: int = Query(default=2, ge=0, le=20),
    keep_unbound_entries: int = Query(default=80, ge=0, le=400),
    session: Session = Depends(get_session),
) -> dict[str, int]:
    """Run the processing-log cleanup with the given retention limits and commit it.

    Returns whatever summary ``cleanup_processing_logs`` produces.
    """
    retention = {
        "keep_document_sessions": keep_document_sessions,
        "keep_unbound_entries": keep_unbound_entries,
    }
    summary = cleanup_processing_logs(session=session, **retention)
    session.commit()
    return summary
@router.post("/clear")
def clear_all_processing_logs(session: Session = Depends(get_session)) -> dict[str, int]:
    """Clear the processing logs, commit, and return the summary from ``clear_processing_logs``."""
    summary = clear_processing_logs(session=session)
    session.commit()
    return summary

View File

@@ -0,0 +1,84 @@
"""Search endpoints for full-text and metadata document discovery."""
from fastapi import APIRouter, Depends, Query
from sqlalchemy import Text, cast, func, select
from sqlalchemy.orm import Session
from app.api.routes_documents import _apply_discovery_filters
from app.db.base import get_session
from app.models.document import Document, DocumentStatus
from app.schemas.documents import DocumentResponse, SearchResponse
router = APIRouter()
@router.get("", response_model=SearchResponse)
def search_documents(
    query: str = Query(min_length=2),
    offset: int = Query(default=0, ge=0),
    limit: int = Query(default=50, ge=1, le=200),
    include_trashed: bool = Query(default=False),
    only_trashed: bool = Query(default=False),
    path_filter: str | None = Query(default=None),
    tag_filter: str | None = Query(default=None),
    type_filter: str | None = Query(default=None),
    processed_from: str | None = Query(default=None),
    processed_to: str | None = Query(default=None),
    session: Session = Depends(get_session),
) -> SearchResponse:
    """Search documents by full-text match or literal substring match on metadata.

    A document matches when the ``simple`` text-search vector built from
    filename, logical path, extracted text and tags matches the query, or
    when the query occurs (case-insensitively) in the filename, logical path
    or tags. Results are ordered by rank, then newest ``created_at``; the
    total is counted with the same filters but without pagination.
    """
    vector = func.to_tsvector(
        "simple",
        func.coalesce(Document.original_filename, "")
        + " "
        + func.coalesce(Document.logical_path, "")
        + " "
        + func.coalesce(Document.extracted_text, "")
        + " "
        + func.coalesce(cast(Document.tags, Text), ""),
    )
    ts_query = func.plainto_tsquery("simple", query)
    rank = func.ts_rank_cd(vector, ts_query)

    # Escape LIKE metacharacters so "_" and "%" typed by the user match
    # literally instead of acting as wildcards.
    escaped_query = query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    contains = f"%{escaped_query}%"
    search_filter = (
        vector.op("@@")(ts_query)
        | Document.original_filename.ilike(contains, escape="\\")
        | Document.logical_path.ilike(contains, escape="\\")
        | cast(Document.tags, Text).ilike(contains, escape="\\")
    )

    def _restrict(statement):
        """Attach the search, lifecycle and discovery filters to a statement."""
        statement = statement.where(search_filter)
        if only_trashed:
            statement = statement.where(Document.status == DocumentStatus.TRASHED)
        elif not include_trashed:
            statement = statement.where(Document.status != DocumentStatus.TRASHED)
        return _apply_discovery_filters(
            statement,
            path_filter=path_filter,
            tag_filter=tag_filter,
            type_filter=type_filter,
            processed_from=processed_from,
            processed_to=processed_to,
        )

    # One filter chain feeds both the page and the count so they cannot drift apart
    page_statement = (
        _restrict(select(Document))
        .order_by(rank.desc(), Document.created_at.desc())
        .offset(offset)
        .limit(limit)
    )
    items = session.execute(page_statement).scalars().all()
    total = session.execute(_restrict(select(func.count(Document.id)))).scalar_one()
    return SearchResponse(total=total, items=[DocumentResponse.model_validate(item) for item in items])

View File

@@ -0,0 +1,232 @@
"""API routes for managing persistent single-user application settings."""
from fastapi import APIRouter
from app.schemas.settings import (
AppSettingsUpdateRequest,
AppSettingsResponse,
DisplaySettingsResponse,
HandwritingSettingsResponse,
HandwritingStyleSettingsResponse,
HandwritingSettingsUpdateRequest,
OcrTaskSettingsResponse,
ProviderSettingsResponse,
RoutingTaskSettingsResponse,
SummaryTaskSettingsResponse,
TaskSettingsResponse,
UploadDefaultsResponse,
)
from app.services.app_settings import (
TASK_OCR_HANDWRITING,
TASK_ROUTING_CLASSIFICATION,
TASK_SUMMARY_GENERATION,
read_app_settings,
reset_app_settings,
update_app_settings,
update_handwriting_settings,
)
router = APIRouter()
def _predefined_entries(raw_entries) -> list:
    """Keeps dictionary entries with a non-blank value and normalizes their fields."""
    entries = []
    for raw_entry in raw_entries:
        if not isinstance(raw_entry, dict):
            continue
        value = str(raw_entry.get("value", "")).strip()
        if not value:
            continue
        entries.append(
            {
                "value": value,
                "global_shared": bool(raw_entry.get("global_shared", False)),
            }
        )
    return entries


def _build_response(payload: dict) -> AppSettingsResponse:
    """Maps an internal settings dictionary onto the typed API response models.

    Every field is read with ``dict.get`` and coerced with ``str``/``int``/``float``/``bool``;
    keys missing from ``payload`` fall back to the defaults written inline below.
    """
    upload_cfg = payload.get("upload_defaults", {})
    display_cfg = payload.get("display", {})
    provider_cfgs = payload.get("providers", [])
    task_cfgs = payload.get("tasks", {})
    style_cfg = payload.get("handwriting_style_clustering", {})
    ocr_cfg = task_cfgs.get(TASK_OCR_HANDWRITING, {})
    summary_cfg = task_cfgs.get(TASK_SUMMARY_GENERATION, {})
    routing_cfg = task_cfgs.get(TASK_ROUTING_CLASSIFICATION, {})

    # Upload defaults: only non-blank string tags survive, stripped of surrounding whitespace.
    default_logical_path = str(upload_cfg.get("logical_path", "Inbox"))
    default_tags = []
    for tag in upload_cfg.get("tags", []):
        if isinstance(tag, str) and tag.strip():
            default_tags.append(str(tag).strip())
    upload_defaults = UploadDefaultsResponse(
        logical_path=default_logical_path,
        tags=default_tags,
    )

    display = DisplaySettingsResponse(
        cards_per_page=int(display_cfg.get("cards_per_page", 12)),
        log_typing_animation_enabled=bool(display_cfg.get("log_typing_animation_enabled", True)),
    )

    handwriting_style = HandwritingStyleSettingsResponse(
        enabled=bool(style_cfg.get("enabled", True)),
        embed_model=str(style_cfg.get("embed_model", "ts/clip-vit-b-p32")),
        neighbor_limit=int(style_cfg.get("neighbor_limit", 8)),
        match_min_similarity=float(style_cfg.get("match_min_similarity", 0.86)),
        bootstrap_match_min_similarity=float(style_cfg.get("bootstrap_match_min_similarity", 0.89)),
        bootstrap_sample_size=int(style_cfg.get("bootstrap_sample_size", 3)),
        image_max_side=int(style_cfg.get("image_max_side", 1024)),
    )

    predefined_paths = _predefined_entries(payload.get("predefined_paths", []))
    predefined_tags = _predefined_entries(payload.get("predefined_tags", []))

    providers = []
    for provider_cfg in provider_cfgs:
        providers.append(
            ProviderSettingsResponse(
                id=str(provider_cfg.get("id", "")),
                label=str(provider_cfg.get("label", "")),
                provider_type=str(provider_cfg.get("provider_type", "openai_compatible")),
                base_url=str(provider_cfg.get("base_url", "https://api.openai.com/v1")),
                timeout_seconds=int(provider_cfg.get("timeout_seconds", 45)),
                api_key_set=bool(provider_cfg.get("api_key_set", False)),
                api_key_masked=str(provider_cfg.get("api_key_masked", "")),
            )
        )

    ocr_task = OcrTaskSettingsResponse(
        enabled=bool(ocr_cfg.get("enabled", True)),
        provider_id=str(ocr_cfg.get("provider_id", "openai-default")),
        model=str(ocr_cfg.get("model", "gpt-4.1-mini")),
        prompt=str(ocr_cfg.get("prompt", "")),
    )
    summary_task = SummaryTaskSettingsResponse(
        enabled=bool(summary_cfg.get("enabled", True)),
        provider_id=str(summary_cfg.get("provider_id", "openai-default")),
        model=str(summary_cfg.get("model", "gpt-4.1-mini")),
        prompt=str(summary_cfg.get("prompt", "")),
        max_input_tokens=int(summary_cfg.get("max_input_tokens", 8000)),
    )
    routing_task = RoutingTaskSettingsResponse(
        enabled=bool(routing_cfg.get("enabled", True)),
        provider_id=str(routing_cfg.get("provider_id", "openai-default")),
        model=str(routing_cfg.get("model", "gpt-4.1-mini")),
        prompt=str(routing_cfg.get("prompt", "")),
        neighbor_count=int(routing_cfg.get("neighbor_count", 8)),
        neighbor_min_similarity=float(routing_cfg.get("neighbor_min_similarity", 0.84)),
        auto_apply_confidence_threshold=float(routing_cfg.get("auto_apply_confidence_threshold", 0.78)),
        auto_apply_neighbor_similarity_threshold=float(
            routing_cfg.get("auto_apply_neighbor_similarity_threshold", 0.55)
        ),
        neighbor_path_override_enabled=bool(routing_cfg.get("neighbor_path_override_enabled", True)),
        neighbor_path_override_min_similarity=float(
            routing_cfg.get("neighbor_path_override_min_similarity", 0.86)
        ),
        neighbor_path_override_min_gap=float(routing_cfg.get("neighbor_path_override_min_gap", 0.04)),
        neighbor_path_override_max_confidence=float(
            routing_cfg.get("neighbor_path_override_max_confidence", 0.9)
        ),
    )
    tasks = TaskSettingsResponse(
        ocr_handwriting=ocr_task,
        summary_generation=summary_task,
        routing_classification=routing_task,
    )

    return AppSettingsResponse(
        upload_defaults=upload_defaults,
        display=display,
        handwriting_style_clustering=handwriting_style,
        predefined_paths=predefined_paths,
        predefined_tags=predefined_tags,
        providers=providers,
        tasks=tasks,
    )
@router.get("", response_model=AppSettingsResponse)
def get_app_settings() -> AppSettingsResponse:
    """Reads the persisted settings and returns them as the API response model."""
    stored_settings = read_app_settings()
    return _build_response(stored_settings)
@router.patch("", response_model=AppSettingsResponse)
def set_app_settings(payload: AppSettingsUpdateRequest) -> AppSettingsResponse:
    """Applies a partial settings update and returns the resulting configuration.

    Sections left as ``None`` in the request are forwarded as ``None``. Provided
    sections are dumped to plain dictionaries; all of them except ``providers``
    are dumped with ``exclude_none=True``.
    """
    updated_settings = update_app_settings(
        # Providers are dumped in full (no exclude_none), unlike the other sections.
        providers=(
            None
            if payload.providers is None
            else [provider.model_dump() for provider in payload.providers]
        ),
        tasks=(
            None if payload.tasks is None else payload.tasks.model_dump(exclude_none=True)
        ),
        upload_defaults=(
            None
            if payload.upload_defaults is None
            else payload.upload_defaults.model_dump(exclude_none=True)
        ),
        display=(
            None if payload.display is None else payload.display.model_dump(exclude_none=True)
        ),
        handwriting_style=(
            None
            if payload.handwriting_style_clustering is None
            else payload.handwriting_style_clustering.model_dump(exclude_none=True)
        ),
        predefined_paths=(
            None
            if payload.predefined_paths is None
            else [entry.model_dump(exclude_none=True) for entry in payload.predefined_paths]
        ),
        predefined_tags=(
            None
            if payload.predefined_tags is None
            else [entry.model_dump(exclude_none=True) for entry in payload.predefined_tags]
        ),
    )
    return _build_response(updated_settings)
@router.post("/reset", response_model=AppSettingsResponse)
def reset_settings_to_defaults() -> AppSettingsResponse:
    """Resets the persisted settings and returns the configuration that results."""
    restored_settings = reset_app_settings()
    return _build_response(restored_settings)
@router.patch("/handwriting", response_model=AppSettingsResponse)
def set_handwriting_settings(payload: HandwritingSettingsUpdateRequest) -> AppSettingsResponse:
    """Forwards handwriting transcription changes and returns the full resulting configuration."""
    # Request fields map one-to-one onto the keyword arguments of the settings service.
    handwriting_changes = {
        "enabled": payload.enabled,
        "openai_base_url": payload.openai_base_url,
        "openai_model": payload.openai_model,
        "openai_timeout_seconds": payload.openai_timeout_seconds,
        "openai_api_key": payload.openai_api_key,
        "clear_openai_api_key": payload.clear_openai_api_key,
    }
    return _build_response(update_handwriting_settings(**handwriting_changes))
@router.get("/handwriting", response_model=HandwritingSettingsResponse)
def get_handwriting_settings() -> HandwritingSettingsResponse:
    """Returns the legacy handwriting response shape for compatibility with older clients.

    The provider is the one whose id matches the OCR task's ``provider_id``; if none
    matches, the first configured provider is used, and if no providers exist at all
    a built-in default provider is used.
    """
    settings = _build_response(read_app_settings())
    default_provider = ProviderSettingsResponse(
        id="openai-default",
        label="OpenAI Default",
        provider_type="openai_compatible",
        base_url="https://api.openai.com/v1",
        timeout_seconds=45,
        api_key_set=False,
        api_key_masked="",
    )
    ocr_task = settings.tasks.ocr_handwriting

    for candidate in settings.providers:
        if candidate.id == ocr_task.provider_id:
            selected = candidate
            break
    else:
        # No provider matched the OCR task binding.
        if settings.providers:
            selected = settings.providers[0]
        else:
            selected = default_provider

    return HandwritingSettingsResponse(
        provider=selected.provider_type,
        enabled=ocr_task.enabled,
        openai_base_url=selected.base_url,
        openai_model=ocr_task.model,
        openai_timeout_seconds=selected.timeout_seconds,
        openai_api_key_set=selected.api_key_set,
        openai_api_key_masked=selected.api_key_masked,
    )