Harden auth, redaction, upload size checks, and compose token requirements

2026-02-21 13:48:55 -03:00
parent 5792586a90
commit 3cbad053cc
21 changed files with 1168 additions and 85 deletions

backend/app/api/auth.py (new file, 87 lines)

@@ -0,0 +1,87 @@
"""Token-based authentication and authorization dependencies for privileged API routes."""
import hmac
from typing import Annotated
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from app.core.config import Settings, get_settings
bearer_auth = HTTPBearer(auto_error=False)
class AuthRole:
"""Declares supported authorization roles for privileged API operations."""
ADMIN = "admin"
USER = "user"
def _raise_unauthorized() -> None:
"""Raises an HTTP 401 response with bearer authentication challenge headers."""
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid or missing API token",
headers={"WWW-Authenticate": "Bearer"},
)
def _configured_admin_token(settings: Settings) -> str:
"""Returns required admin token or raises configuration error when unset."""
token = settings.admin_api_token.strip()
if token:
return token
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Admin API token is not configured",
)
def _resolve_token_role(token: str, settings: Settings) -> str:
"""Resolves role from a bearer token using constant-time comparisons."""
admin_token = _configured_admin_token(settings)
if hmac.compare_digest(token, admin_token):
return AuthRole.ADMIN
user_token = settings.user_api_token.strip()
if user_token and hmac.compare_digest(token, user_token):
return AuthRole.USER
_raise_unauthorized()
def get_request_role(
credentials: Annotated[HTTPAuthorizationCredentials | None, Depends(bearer_auth)],
settings: Annotated[Settings, Depends(get_settings)],
) -> str:
"""Authenticates request token and returns its authorization role."""
if credentials is None:
_raise_unauthorized()
token = credentials.credentials.strip()
if not token:
_raise_unauthorized()
return _resolve_token_role(token=token, settings=settings)
def require_user_or_admin(role: Annotated[str, Depends(get_request_role)]) -> str:
"""Requires a valid user or admin token and returns resolved role."""
return role
def require_admin(role: Annotated[str, Depends(get_request_role)]) -> str:
"""Requires admin role and rejects requests authenticated as regular users."""
if role != AuthRole.ADMIN:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Admin token required",
)
return role
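A quick way to see all three outcomes (401, 403, 200) end to end is to mount these dependencies on a throwaway app. A minimal sketch, not part of the commit; it assumes Settings can be constructed with admin_api_token and user_api_token keyword arguments, and the route paths and token values are made up:

from fastapi import Depends, FastAPI
from fastapi.testclient import TestClient

from app.api.auth import require_admin, require_user_or_admin
from app.core.config import Settings, get_settings

app = FastAPI()


@app.get("/admin-only", dependencies=[Depends(require_admin)])
def admin_only() -> dict[str, bool]:
    return {"ok": True}


@app.get("/shared", dependencies=[Depends(require_user_or_admin)])
def shared() -> dict[str, bool]:
    return {"ok": True}


# Hypothetical settings object; the real field names live in app.core.config.
app.dependency_overrides[get_settings] = lambda: Settings(
    admin_api_token="admin-secret", user_api_token="user-secret"
)

client = TestClient(app)
assert client.get("/shared").status_code == 401  # missing token -> 401
assert client.get("/shared", headers={"Authorization": "Bearer user-secret"}).status_code == 200
assert client.get("/admin-only", headers={"Authorization": "Bearer user-secret"}).status_code == 403
assert client.get("/admin-only", headers={"Authorization": "Bearer admin-secret"}).status_code == 200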

backend/app/api (router registration module)

@@ -1,7 +1,8 @@
"""API router registration for all HTTP route modules."""
-from fastapi import APIRouter
+from fastapi import APIRouter, Depends
+from app.api.auth import require_admin, require_user_or_admin
from app.api.routes_documents import router as documents_router
from app.api.routes_health import router as health_router
from app.api.routes_processing_logs import router as processing_logs_router
@@ -11,7 +12,27 @@ from app.api.routes_settings import router as settings_router
api_router = APIRouter()
api_router.include_router(health_router)
-api_router.include_router(documents_router, prefix="/documents", tags=["documents"])
-api_router.include_router(processing_logs_router, prefix="/processing/logs", tags=["processing-logs"])
-api_router.include_router(search_router, prefix="/search", tags=["search"])
-api_router.include_router(settings_router, prefix="/settings", tags=["settings"])
+api_router.include_router(
+    documents_router,
+    prefix="/documents",
+    tags=["documents"],
+    dependencies=[Depends(require_user_or_admin)],
+)
+api_router.include_router(
+    processing_logs_router,
+    prefix="/processing/logs",
+    tags=["processing-logs"],
+    dependencies=[Depends(require_admin)],
+)
+api_router.include_router(
+    search_router,
+    prefix="/search",
+    tags=["search"],
+    dependencies=[Depends(require_user_or_admin)],
+)
+api_router.include_router(
+    settings_router,
+    prefix="/settings",
+    tags=["settings"],
+    dependencies=[Depends(require_admin)],
+)
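Guarding at include_router time is what keeps the route modules themselves auth-free: the dependency runs before every handler in the included router, and its 401/403 short-circuits the request. A small illustration with a hypothetical sub-router:

from fastapi import APIRouter, Depends, FastAPI

from app.api.auth import require_admin

reports_router = APIRouter()  # hypothetical router, for illustration only


@reports_router.get("/summary")
def summary() -> dict[str, str]:
    return {"status": "ok"}  # never reached without a valid admin token


app = FastAPI()
app.include_router(
    reports_router,
    prefix="/reports",
    dependencies=[Depends(require_admin)],  # also guards /reports/summary
)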

backend/app/api/routes_documents.py

@@ -1,4 +1,4 @@
"""Document CRUD, lifecycle, metadata, file access, and content export endpoints."""
"""Authenticated document CRUD, lifecycle, metadata, file access, and content export endpoints."""
import io
import re
@@ -14,7 +14,7 @@ from fastapi.responses import FileResponse, Response, StreamingResponse
from sqlalchemy import or_, func, select
from sqlalchemy.orm import Session
-from app.services.app_settings import read_predefined_paths_settings, read_predefined_tags_settings
+from app.core.config import get_settings
from app.db.base import get_session
from app.models.document import Document, DocumentStatus
from app.schemas.documents import (
@@ -26,6 +26,7 @@ from app.schemas.documents import (
    UploadConflict,
    UploadResponse,
)
+from app.services.app_settings import read_predefined_paths_settings, read_predefined_tags_settings
from app.services.extractor import sniff_mime
from app.services.handwriting_style import delete_many_handwriting_style_documents
from app.services.processing_logs import log_processing_event, set_processing_log_autocommit
@@ -35,6 +36,7 @@ from app.worker.queue import get_processing_queue
router = APIRouter()
+settings = get_settings()


def _parse_csv(value: str | None) -> list[str]:
@@ -227,6 +229,33 @@ def _build_document_list_statement(
    return statement

+def _enforce_upload_shape(files: list[UploadFile]) -> None:
+    """Validates upload request shape against configured file-count bounds."""
+    if not files:
+        raise HTTPException(status_code=400, detail="Upload request must include at least one file")
+    if len(files) > settings.max_upload_files_per_request:
+        raise HTTPException(
+            status_code=413,
+            detail=(
+                "Upload request exceeds file count limit "
+                f"({len(files)} > {settings.max_upload_files_per_request})"
+            ),
+        )
+
+
+async def _read_upload_bytes(file: UploadFile, max_bytes: int) -> bytes:
+    """Reads one upload file while enforcing per-file byte limits."""
+    data = await file.read(max_bytes + 1)
+    if len(data) > max_bytes:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File '{file.filename or 'upload'}' exceeds per-file limit of {max_bytes} bytes",
+        )
+    return data


def _collect_document_tree(session: Session, root_document_id: UUID) -> list[tuple[int, Document]]:
    """Collects a document and all descendants for recursive permanent deletion."""
@@ -472,18 +501,29 @@ async def upload_documents(
) -> UploadResponse:
    """Uploads files, records metadata, and enqueues asynchronous extraction tasks."""
+    _enforce_upload_shape(files)
    set_processing_log_autocommit(session, True)
    normalized_tags = _normalize_tags(tags)
    queue = get_processing_queue()
    uploaded: list[DocumentResponse] = []
    conflicts: list[UploadConflict] = []
+    total_request_bytes = 0
    indexed_relative_paths = relative_paths or []
    prepared_uploads: list[dict[str, object]] = []
    for idx, file in enumerate(files):
        filename = file.filename or f"uploaded_{idx}"
-        data = await file.read()
+        data = await _read_upload_bytes(file, settings.max_upload_file_size_bytes)
+        total_request_bytes += len(data)
+        if total_request_bytes > settings.max_upload_request_size_bytes:
+            raise HTTPException(
+                status_code=413,
+                detail=(
+                    "Upload request exceeds total size limit "
+                    f"({total_request_bytes} > {settings.max_upload_request_size_bytes} bytes)"
+                ),
+            )
        sha256 = compute_sha256(data)
        source_relative_path = indexed_relative_paths[idx] if idx < len(indexed_relative_paths) else filename
        extension = Path(filename).suffix.lower()
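From the client side, an upload now needs a bearer token and fails fast with 413 once any limit trips. A hypothetical call, where the route path, port, and token are placeholders:

import httpx

response = httpx.post(
    "http://localhost:8000/api/documents/upload",
    headers={"Authorization": "Bearer user-secret"},
    files=[("files", ("scan.pdf", b"%PDF-1.7 ...", "application/pdf"))],
)
print(response.status_code)  # 413 if the file or the request exceeds its limit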

backend/app/api/routes_processing_logs.py

@@ -1,10 +1,11 @@
"""Read-only API endpoints for processing pipeline event logs."""
"""Admin-only API endpoints for processing pipeline event logs."""
from uuid import UUID
from fastapi import APIRouter, Depends, Query
from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.db.base import get_session
from app.schemas.processing_logs import ProcessingLogEntryResponse, ProcessingLogListResponse
from app.services.app_settings import read_processing_log_retention_settings
@@ -17,12 +18,13 @@ from app.services.processing_logs import (
router = APIRouter()
+settings = get_settings()


@router.get("", response_model=ProcessingLogListResponse)
def get_processing_logs(
    offset: int = Query(default=0, ge=0),
-    limit: int = Query(default=120, ge=1, le=400),
+    limit: int = Query(default=120, ge=1, le=settings.processing_log_max_unbound_entries),
    document_id: UUID | None = Query(default=None),
    session: Session = Depends(get_session),
) -> ProcessingLogListResponse:
@@ -43,8 +45,8 @@ def get_processing_logs(
@router.post("/trim")
def trim_processing_logs(
keep_document_sessions: int | None = Query(default=None, ge=0, le=20),
keep_unbound_entries: int | None = Query(default=None, ge=0, le=400),
keep_document_sessions: int | None = Query(default=None, ge=0, le=settings.processing_log_max_document_sessions),
keep_unbound_entries: int | None = Query(default=None, ge=0, le=settings.processing_log_max_unbound_entries),
session: Session = Depends(get_session),
) -> dict[str, int]:
"""Deletes old processing logs using query values or persisted retention defaults."""
@@ -61,10 +63,19 @@ def trim_processing_logs(
        else int(retention_defaults.get("keep_unbound_entries", 80))
    )
+    capped_keep_document_sessions = min(
+        settings.processing_log_max_document_sessions,
+        max(0, int(resolved_keep_document_sessions)),
+    )
+    capped_keep_unbound_entries = min(
+        settings.processing_log_max_unbound_entries,
+        max(0, int(resolved_keep_unbound_entries)),
+    )
    result = cleanup_processing_logs(
        session=session,
-        keep_document_sessions=resolved_keep_document_sessions,
-        keep_unbound_entries=resolved_keep_unbound_entries,
+        keep_document_sessions=capped_keep_document_sessions,
+        keep_unbound_entries=capped_keep_unbound_entries,
    )
    session.commit()
    return result
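The trim endpoint now bounds retention twice: Query(ge=0, le=...) rejects out-of-range query values, and the explicit min/max clamp also covers values read from persisted defaults, which bypass query validation. The clamp in isolation (the cap value here is hypothetical):

MAX_SESSIONS = 20  # hypothetical cap; the real one comes from settings


def clamp_retention(requested: int, cap: int = MAX_SESSIONS) -> int:
    return min(cap, max(0, int(requested)))


assert clamp_retention(-5) == 0        # negative input floors at zero
assert clamp_retention(7) == 7         # in-range values pass through
assert clamp_retention(10_000) == 20   # oversized values hit the cap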

backend/app/api/routes_settings.py

@@ -1,6 +1,6 @@
"""API routes for managing persistent single-user application settings."""
"""Admin-only API routes for managing persistent single-user application settings."""
from fastapi import APIRouter
from fastapi import APIRouter, HTTPException
from app.schemas.settings import (
    AppSettingsUpdateRequest,
@@ -18,6 +18,7 @@ from app.schemas.settings import (
    UploadDefaultsResponse,
)
from app.services.app_settings import (
+    AppSettingsValidationError,
    TASK_OCR_HANDWRITING,
    TASK_ROUTING_CLASSIFICATION,
    TASK_SUMMARY_GENERATION,
@@ -179,16 +180,19 @@ def set_app_settings(payload: AppSettingsUpdateRequest) -> AppSettingsResponse:
    if payload.predefined_tags is not None:
        predefined_tags_payload = [item.model_dump(exclude_none=True) for item in payload.predefined_tags]
-    updated = update_app_settings(
-        providers=providers_payload,
-        tasks=tasks_payload,
-        upload_defaults=upload_defaults_payload,
-        display=display_payload,
-        processing_log_retention=processing_log_retention_payload,
-        handwriting_style=handwriting_style_payload,
-        predefined_paths=predefined_paths_payload,
-        predefined_tags=predefined_tags_payload,
-    )
+    try:
+        updated = update_app_settings(
+            providers=providers_payload,
+            tasks=tasks_payload,
+            upload_defaults=upload_defaults_payload,
+            display=display_payload,
+            processing_log_retention=processing_log_retention_payload,
+            handwriting_style=handwriting_style_payload,
+            predefined_paths=predefined_paths_payload,
+            predefined_tags=predefined_tags_payload,
+        )
+    except AppSettingsValidationError as error:
+        raise HTTPException(status_code=400, detail=str(error)) from error
    return _build_response(updated)
@@ -203,14 +207,17 @@ def reset_settings_to_defaults() -> AppSettingsResponse:
def set_handwriting_settings(payload: HandwritingSettingsUpdateRequest) -> AppSettingsResponse:
    """Updates handwriting transcription settings and returns the resulting configuration."""
-    updated = update_handwriting_settings(
-        enabled=payload.enabled,
-        openai_base_url=payload.openai_base_url,
-        openai_model=payload.openai_model,
-        openai_timeout_seconds=payload.openai_timeout_seconds,
-        openai_api_key=payload.openai_api_key,
-        clear_openai_api_key=payload.clear_openai_api_key,
-    )
+    try:
+        updated = update_handwriting_settings(
+            enabled=payload.enabled,
+            openai_base_url=payload.openai_base_url,
+            openai_model=payload.openai_model,
+            openai_timeout_seconds=payload.openai_timeout_seconds,
+            openai_api_key=payload.openai_api_key,
+            clear_openai_api_key=payload.clear_openai_api_key,
+        )
+    except AppSettingsValidationError as error:
+        raise HTTPException(status_code=400, detail=str(error)) from error
    return _build_response(updated)
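Both endpoints translate AppSettingsValidationError into a 400 with the same two-line except clause. If more settings routes grow the pattern, one app-level exception handler could replace the per-endpoint try/except; a sketch of that alternative (not what this commit does):

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

from app.services.app_settings import AppSettingsValidationError

app = FastAPI()


@app.exception_handler(AppSettingsValidationError)
async def handle_settings_validation_error(
    request: Request, error: AppSettingsValidationError
) -> JSONResponse:
    # Mirrors the inline handling: domain validation failures become 400s.
    return JSONResponse(status_code=400, content={"detail": str(error)})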