Harden auth and security controls with session auth and docs
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
"""Authenticated document CRUD, lifecycle, metadata, file access, and content export endpoints."""
|
||||
|
||||
import io
|
||||
import re
|
||||
import tempfile
|
||||
import unicodedata
|
||||
import zipfile
|
||||
from datetime import datetime, time
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Literal
|
||||
from typing import Annotated, BinaryIO, Iterator, Literal
|
||||
from uuid import UUID
|
||||
|
||||
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
|
||||
@@ -14,8 +14,10 @@ from fastapi.responses import FileResponse, Response, StreamingResponse
|
||||
from sqlalchemy import or_, func, select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.api.auth import AuthContext, require_user_or_admin
|
||||
from app.core.config import get_settings, is_inline_preview_mime_type_safe
|
||||
from app.db.base import get_session
|
||||
from app.models.auth import UserRole
|
||||
from app.models.document import Document, DocumentStatus
|
||||
from app.schemas.documents import (
|
||||
ContentExportRequest,
|
||||
@@ -30,6 +32,7 @@ from app.services.app_settings import read_predefined_paths_settings, read_prede
|
||||
from app.services.extractor import sniff_mime
|
||||
from app.services.handwriting_style import delete_many_handwriting_style_documents
|
||||
from app.services.processing_logs import log_processing_event, set_processing_log_autocommit
|
||||
from app.services.rate_limiter import increment_rate_limit
|
||||
from app.services.storage import absolute_path, compute_sha256, store_bytes
|
||||
from app.services.typesense_index import delete_many_documents_index, upsert_document_index
|
||||
from app.worker.queue import get_processing_queue
|
||||
@@ -39,6 +42,59 @@ router = APIRouter()
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
def _scope_document_statement_for_auth_context(statement, auth_context: AuthContext):
    """Limits a document query to rows owned by the caller unless the caller is an admin.

    Admins see every row; any other role only sees documents whose
    ``owner_user_id`` matches the authenticated user.
    """

    caller_is_admin = auth_context.role == UserRole.ADMIN
    if not caller_is_admin:
        statement = statement.where(Document.owner_user_id == auth_context.user_id)
    return statement
|
||||
|
||||
|
||||
def _ensure_document_access(document: Document, auth_context: AuthContext) -> None:
    """Raises a 404 when a non-admin caller references a document they do not own.

    Admins pass unconditionally. The violation is reported as "not found"
    (presumably so document existence is not leaked across owners — matches
    the 404 used by the scoped-query paths).
    """

    caller_is_admin = auth_context.role == UserRole.ADMIN
    caller_owns_document = document.owner_user_id == auth_context.user_id
    if not (caller_is_admin or caller_owns_document):
        raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
|
||||
def _stream_binary_file_chunks(handle: BinaryIO, *, chunk_bytes: int) -> Iterator[bytes]:
|
||||
"""Streams binary file-like content in bounded chunks and closes handle after completion."""
|
||||
|
||||
try:
|
||||
while True:
|
||||
chunk = handle.read(chunk_bytes)
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
finally:
|
||||
handle.close()
|
||||
|
||||
|
||||
def _enforce_content_export_rate_limit(auth_context: AuthContext) -> None:
    """Enforces the per-user, one-minute fixed-window quota on markdown exports.

    Raises:
        HTTPException: 503 when the rate-limiter backend raises ``RuntimeError``,
            429 when the caller's request count exceeds the configured quota.
    """

    try:
        count, quota = increment_rate_limit(
            scope="content-md-export",
            subject=str(auth_context.user_id),
            limit=settings.content_export_rate_limit_per_minute,
            window_seconds=60,
        )
    except RuntimeError as error:
        raise HTTPException(
            status_code=503,
            detail="Rate limiter backend unavailable",
        ) from error

    # A non-positive configured quota disables enforcement entirely.
    if quota > 0 and count > quota:
        raise HTTPException(
            status_code=429,
            detail=f"Export rate limit exceeded ({quota} requests per minute)",
        )
|
||||
|
||||
|
||||
def _parse_csv(value: str | None) -> list[str]:
|
||||
"""Parses comma-separated query values into a normalized non-empty list."""
|
||||
|
||||
@@ -296,6 +352,7 @@ def list_documents(
|
||||
type_filter: str | None = Query(default=None),
|
||||
processed_from: str | None = Query(default=None),
|
||||
processed_to: str | None = Query(default=None),
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> DocumentsListResponse:
|
||||
"""Returns paginated documents ordered by newest upload timestamp."""
|
||||
@@ -305,6 +362,7 @@ def list_documents(
|
||||
include_trashed=include_trashed,
|
||||
path_prefix=path_prefix,
|
||||
)
|
||||
base_statement = _scope_document_statement_for_auth_context(base_statement, auth_context)
|
||||
base_statement = _apply_discovery_filters(
|
||||
base_statement,
|
||||
path_filter=path_filter,
|
||||
@@ -326,11 +384,13 @@ def list_documents(
|
||||
@router.get("/tags")
|
||||
def list_tags(
|
||||
include_trashed: bool = Query(default=False),
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> dict[str, list[str]]:
|
||||
"""Returns distinct tags currently assigned across all matching documents."""
|
||||
|
||||
statement = select(Document.tags)
|
||||
statement = _scope_document_statement_for_auth_context(statement, auth_context)
|
||||
if not include_trashed:
|
||||
statement = statement.where(Document.status != DocumentStatus.TRASHED)
|
||||
|
||||
@@ -348,11 +408,13 @@ def list_tags(
|
||||
@router.get("/paths")
|
||||
def list_paths(
|
||||
include_trashed: bool = Query(default=False),
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> dict[str, list[str]]:
|
||||
"""Returns distinct logical paths currently assigned across all matching documents."""
|
||||
|
||||
statement = select(Document.logical_path)
|
||||
statement = _scope_document_statement_for_auth_context(statement, auth_context)
|
||||
if not include_trashed:
|
||||
statement = statement.where(Document.status != DocumentStatus.TRASHED)
|
||||
|
||||
@@ -370,11 +432,13 @@ def list_paths(
|
||||
@router.get("/types")
|
||||
def list_types(
|
||||
include_trashed: bool = Query(default=False),
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> dict[str, list[str]]:
|
||||
"""Returns distinct document type values from extension, MIME, and image text type."""
|
||||
|
||||
statement = select(Document.extension, Document.mime_type, Document.image_text_type)
|
||||
statement = _scope_document_statement_for_auth_context(statement, auth_context)
|
||||
if not include_trashed:
|
||||
statement = statement.where(Document.status != DocumentStatus.TRASHED)
|
||||
rows = session.execute(statement).all()
|
||||
@@ -390,16 +454,20 @@ def list_types(
|
||||
@router.post("/content-md/export")
|
||||
def export_contents_markdown(
|
||||
payload: ContentExportRequest,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> StreamingResponse:
|
||||
"""Exports extracted contents for selected documents as individual markdown files in a ZIP archive."""
|
||||
|
||||
_enforce_content_export_rate_limit(auth_context)
|
||||
|
||||
has_document_ids = len(payload.document_ids) > 0
|
||||
has_path_prefix = bool(payload.path_prefix and payload.path_prefix.strip())
|
||||
if not has_document_ids and not has_path_prefix:
|
||||
raise HTTPException(status_code=400, detail="Provide document_ids or path_prefix for export")
|
||||
|
||||
statement = select(Document)
|
||||
statement = _scope_document_statement_for_auth_context(statement, auth_context)
|
||||
if has_document_ids:
|
||||
statement = statement.where(Document.id.in_(payload.document_ids))
|
||||
if has_path_prefix:
|
||||
@@ -409,37 +477,82 @@ def export_contents_markdown(
|
||||
elif not payload.include_trashed:
|
||||
statement = statement.where(Document.status != DocumentStatus.TRASHED)
|
||||
|
||||
documents = session.execute(statement.order_by(Document.logical_path.asc(), Document.created_at.asc())).scalars().all()
|
||||
max_documents = max(1, int(settings.content_export_max_documents))
|
||||
ordered_statement = statement.order_by(Document.logical_path.asc(), Document.created_at.asc()).limit(max_documents + 1)
|
||||
documents = session.execute(ordered_statement).scalars().all()
|
||||
if len(documents) > max_documents:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"Export exceeds maximum document count ({len(documents)} > {max_documents})",
|
||||
)
|
||||
if not documents:
|
||||
raise HTTPException(status_code=404, detail="No matching documents found for export")
|
||||
|
||||
archive_buffer = io.BytesIO()
|
||||
max_total_bytes = max(1, int(settings.content_export_max_total_bytes))
|
||||
max_spool_memory = max(64 * 1024, int(settings.content_export_spool_max_memory_bytes))
|
||||
archive_file = tempfile.SpooledTemporaryFile(max_size=max_spool_memory, mode="w+b")
|
||||
total_export_bytes = 0
|
||||
used_entries: set[str] = set()
|
||||
with zipfile.ZipFile(archive_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||
for document in documents:
|
||||
entry_name = _zip_entry_name(document, used_entries)
|
||||
archive.writestr(entry_name, _markdown_for_document(document))
|
||||
try:
|
||||
with zipfile.ZipFile(archive_file, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||
for document in documents:
|
||||
markdown_bytes = _markdown_for_document(document).encode("utf-8")
|
||||
total_export_bytes += len(markdown_bytes)
|
||||
if total_export_bytes > max_total_bytes:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=(
|
||||
"Export exceeds total markdown size limit "
|
||||
f"({total_export_bytes} > {max_total_bytes} bytes)"
|
||||
),
|
||||
)
|
||||
entry_name = _zip_entry_name(document, used_entries)
|
||||
archive.writestr(entry_name, markdown_bytes)
|
||||
archive_file.seek(0)
|
||||
except Exception:
|
||||
archive_file.close()
|
||||
raise
|
||||
|
||||
archive_buffer.seek(0)
|
||||
chunk_bytes = max(4 * 1024, int(settings.content_export_stream_chunk_bytes))
|
||||
headers = {"Content-Disposition": 'attachment; filename="document-contents-md.zip"'}
|
||||
return StreamingResponse(archive_buffer, media_type="application/zip", headers=headers)
|
||||
return StreamingResponse(
|
||||
_stream_binary_file_chunks(archive_file, chunk_bytes=chunk_bytes),
|
||||
media_type="application/zip",
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
|
||||
@router.get("/{document_id}", response_model=DocumentDetailResponse)
def get_document(
    document_id: UUID,
    auth_context: AuthContext = Depends(require_user_or_admin),
    session: Session = Depends(get_session),
) -> DocumentDetailResponse:
    """Returns one document by unique identifier.

    The lookup statement is scoped to the caller, so a non-admin requesting a
    document owned by another user receives the same 404 as a missing id.

    Raises:
        HTTPException: 404 when no accessible document matches ``document_id``.
    """

    statement = _scope_document_statement_for_auth_context(
        select(Document).where(Document.id == document_id),
        auth_context,
    )
    document = session.execute(statement).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    return DocumentDetailResponse.model_validate(document)
|
||||
|
||||
|
||||
@router.get("/{document_id}/download")
|
||||
def download_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
|
||||
def download_document(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> FileResponse:
|
||||
"""Downloads original document bytes for the requested document identifier."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
file_path = absolute_path(document.stored_relative_path)
|
||||
@@ -447,10 +560,18 @@ def download_document(document_id: UUID, session: Session = Depends(get_session)
|
||||
|
||||
|
||||
@router.get("/{document_id}/preview")
|
||||
def preview_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
|
||||
def preview_document(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> FileResponse:
|
||||
"""Streams trusted-safe MIME types inline and forces attachment for active script-capable types."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
@@ -467,10 +588,18 @@ def preview_document(document_id: UUID, session: Session = Depends(get_session))
|
||||
|
||||
|
||||
@router.get("/{document_id}/thumbnail")
|
||||
def thumbnail_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
|
||||
def thumbnail_document(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> FileResponse:
|
||||
"""Returns a generated thumbnail image for dashboard card previews."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
@@ -485,10 +614,18 @@ def thumbnail_document(document_id: UUID, session: Session = Depends(get_session
|
||||
|
||||
|
||||
@router.get("/{document_id}/content-md")
|
||||
def download_document_content_markdown(document_id: UUID, session: Session = Depends(get_session)) -> Response:
|
||||
def download_document_content_markdown(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> Response:
|
||||
"""Downloads extracted content for one document as a markdown file."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
@@ -505,6 +642,7 @@ async def upload_documents(
|
||||
logical_path: Annotated[str, Form()] = "Inbox",
|
||||
tags: Annotated[str | None, Form()] = None,
|
||||
conflict_mode: Annotated[Literal["ask", "replace", "duplicate"], Form()] = "ask",
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> UploadResponse:
|
||||
"""Uploads files, records metadata, and enqueues asynchronous extraction tasks."""
|
||||
@@ -562,7 +700,11 @@ async def upload_documents(
|
||||
}
|
||||
)
|
||||
|
||||
existing = session.execute(select(Document).where(Document.sha256 == sha256)).scalar_one_or_none()
|
||||
existing_statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.sha256 == sha256),
|
||||
auth_context,
|
||||
)
|
||||
existing = session.execute(existing_statement).scalar_one_or_none()
|
||||
if existing and conflict_mode == "ask":
|
||||
log_processing_event(
|
||||
session=session,
|
||||
@@ -589,9 +731,11 @@ async def upload_documents(
|
||||
return UploadResponse(uploaded=[], conflicts=conflicts)
|
||||
|
||||
for prepared in prepared_uploads:
|
||||
existing = session.execute(
|
||||
select(Document).where(Document.sha256 == str(prepared["sha256"]))
|
||||
).scalar_one_or_none()
|
||||
existing_statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.sha256 == str(prepared["sha256"])),
|
||||
auth_context,
|
||||
)
|
||||
existing = session.execute(existing_statement).scalar_one_or_none()
|
||||
replaces_document_id = existing.id if existing and conflict_mode == "replace" else None
|
||||
|
||||
stored_relative_path = store_bytes(str(prepared["filename"]), bytes(prepared["data"]))
|
||||
@@ -606,6 +750,7 @@ async def upload_documents(
|
||||
size_bytes=len(bytes(prepared["data"])),
|
||||
logical_path=logical_path,
|
||||
tags=list(normalized_tags),
|
||||
owner_user_id=auth_context.user_id,
|
||||
replaces_document_id=replaces_document_id,
|
||||
metadata_json={"upload": "web"},
|
||||
)
|
||||
@@ -637,11 +782,16 @@ async def upload_documents(
|
||||
def update_document(
|
||||
document_id: UUID,
|
||||
payload: DocumentUpdateRequest,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> DocumentResponse:
|
||||
"""Updates document metadata and refreshes semantic index representation."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
@@ -663,10 +813,18 @@ def update_document(
|
||||
|
||||
|
||||
@router.post("/{document_id}/trash", response_model=DocumentResponse)
|
||||
def trash_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
|
||||
def trash_document(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> DocumentResponse:
|
||||
"""Marks a document as trashed without deleting files from storage."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
@@ -687,10 +845,18 @@ def trash_document(document_id: UUID, session: Session = Depends(get_session)) -
|
||||
|
||||
|
||||
@router.post("/{document_id}/restore", response_model=DocumentResponse)
|
||||
def restore_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
|
||||
def restore_document(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> DocumentResponse:
|
||||
"""Restores a trashed document to its previous lifecycle status."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
@@ -712,16 +878,27 @@ def restore_document(document_id: UUID, session: Session = Depends(get_session))
|
||||
|
||||
|
||||
@router.delete("/{document_id}")
|
||||
def delete_document(document_id: UUID, session: Session = Depends(get_session)) -> dict[str, int]:
|
||||
def delete_document(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> dict[str, int]:
|
||||
"""Permanently deletes a document and all descendant archive members including stored files."""
|
||||
|
||||
root = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
root_statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
root = session.execute(root_statement).scalar_one_or_none()
|
||||
if root is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
if root.status != DocumentStatus.TRASHED:
|
||||
raise HTTPException(status_code=400, detail="Move document to trash before permanent deletion")
|
||||
|
||||
document_tree = _collect_document_tree(session=session, root_document_id=document_id)
|
||||
if auth_context.role != UserRole.ADMIN:
|
||||
for _, document in document_tree:
|
||||
_ensure_document_access(document, auth_context)
|
||||
document_ids = [document.id for _, document in document_tree]
|
||||
try:
|
||||
delete_many_documents_index([str(current_id) for current_id in document_ids])
|
||||
@@ -752,10 +929,18 @@ def delete_document(document_id: UUID, session: Session = Depends(get_session))
|
||||
|
||||
|
||||
@router.post("/{document_id}/reprocess", response_model=DocumentResponse)
|
||||
def reprocess_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
|
||||
def reprocess_document(
|
||||
document_id: UUID,
|
||||
auth_context: AuthContext = Depends(require_user_or_admin),
|
||||
session: Session = Depends(get_session),
|
||||
) -> DocumentResponse:
|
||||
"""Re-enqueues a document for extraction and suggestion processing."""
|
||||
|
||||
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
|
||||
statement = _scope_document_statement_for_auth_context(
|
||||
select(Document).where(Document.id == document_id),
|
||||
auth_context,
|
||||
)
|
||||
document = session.execute(statement).scalar_one_or_none()
|
||||
if document is None:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
if document.status == DocumentStatus.TRASHED:
|
||||
|
||||
Reference in New Issue
Block a user