Harden security controls from REPORT findings

This commit is contained in:
2026-03-01 13:32:08 -03:00
parent da5cbc2c01
commit bdd97d1c62
20 changed files with 1455 additions and 97 deletions

View File

@@ -24,9 +24,9 @@ The default `docker compose` stack includes:
- `frontend` - React UI (`http://localhost:5173`)
- `api` - FastAPI backend (`http://localhost:8000`, docs at `/docs`)
- `worker` - background processing jobs
- `db` - PostgreSQL (`localhost:5432`)
- `redis` - queue backend (`localhost:6379`)
- `typesense` - search index (`localhost:8108`)
- `db` - PostgreSQL (internal service network)
- `redis` - queue backend (internal service network)
- `typesense` - search index (internal service network)
## Requirements
@@ -42,6 +42,19 @@ From repository root:
docker compose up --build -d
```
Before first run, set required secrets and connection values in `.env` (or your shell):
- `POSTGRES_USER`
- `POSTGRES_PASSWORD`
- `POSTGRES_DB`
- `DATABASE_URL`
- `REDIS_PASSWORD`
- `REDIS_URL`
- `ADMIN_API_TOKEN`
- `USER_API_TOKEN`
- `APP_SETTINGS_ENCRYPTION_KEY`
- `TYPESENSE_API_KEY`
Open:
- Frontend: `http://localhost:5173`
@@ -102,13 +115,15 @@ cd frontend && npm run preview
Main runtime variables are defined in `docker-compose.yml`:
- API and worker: `DATABASE_URL`, `REDIS_URL`, `STORAGE_ROOT`, `PUBLIC_BASE_URL`, `CORS_ORIGINS`, `TYPESENSE_*`
- Frontend: `VITE_API_BASE`
- API and worker: `DATABASE_URL`, `REDIS_URL`, `REDIS_SECURITY_MODE`, `REDIS_TLS_MODE`, `STORAGE_ROOT`, `PUBLIC_BASE_URL`, `CORS_ORIGINS`, `ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS`, `TYPESENSE_*`, `APP_SETTINGS_ENCRYPTION_KEY`
- Frontend: `VITE_API_BASE`, optional `VITE_API_TOKEN` compatibility fallback
Application settings saved from the UI persist at:
- `<STORAGE_ROOT>/settings.json` (inside the storage volume)
Provider API keys are persisted encrypted at rest (`api_key_encrypted`) and are no longer written as plaintext values.
Settings endpoints:
- `GET/PUT /api/v1/settings`

View File

@@ -1,15 +1,20 @@
APP_ENV=development
DATABASE_URL=postgresql+psycopg://dcm:dcm@db:5432/dcm
REDIS_URL=redis://redis:6379/0
REDIS_URL=redis://:replace-with-redis-password@redis:6379/0
REDIS_SECURITY_MODE=auto
REDIS_TLS_MODE=auto
ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS=true
STORAGE_ROOT=/data/storage
ADMIN_API_TOKEN=replace-with-random-admin-token
USER_API_TOKEN=replace-with-random-user-token
APP_SETTINGS_ENCRYPTION_KEY=replace-with-random-settings-encryption-key
MAX_UPLOAD_FILES_PER_REQUEST=50
MAX_UPLOAD_FILE_SIZE_BYTES=26214400
MAX_UPLOAD_REQUEST_SIZE_BYTES=104857600
MAX_ZIP_MEMBER_UNCOMPRESSED_BYTES=26214400
MAX_ZIP_TOTAL_UNCOMPRESSED_BYTES=157286400
MAX_ZIP_COMPRESSION_RATIO=120
MAX_ZIP_DESCENDANTS_PER_ROOT=1000
PROVIDER_BASE_URL_ALLOWLIST=["api.openai.com"]
PROVIDER_BASE_URL_ALLOW_HTTP=false
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK=false
@@ -23,6 +28,6 @@ DEFAULT_ROUTING_MODEL=gpt-4.1-mini
TYPESENSE_PROTOCOL=http
TYPESENSE_HOST=typesense
TYPESENSE_PORT=8108
TYPESENSE_API_KEY=dcm-typesense-key
TYPESENSE_API_KEY=replace-with-random-typesense-api-key
TYPESENSE_COLLECTION_NAME=documents
PUBLIC_BASE_URL=http://localhost:8000

View File

@@ -59,13 +59,21 @@ def get_request_role(
credentials: Annotated[HTTPAuthorizationCredentials | None, Depends(bearer_auth)],
settings: Annotated[Settings, Depends(get_settings)],
) -> str:
"""Authenticates request token and returns its authorization role."""
"""Authenticates request token and returns its authorization role.
Development environments can optionally allow tokenless user access for non-admin routes to
preserve local workflow compatibility while production remains token-enforced.
"""
if credentials is None:
if settings.allow_development_anonymous_user_access and settings.app_env.strip().lower() in {"development", "dev"}:
return AuthRole.USER
_raise_unauthorized()
token = credentials.credentials.strip()
if not token:
if settings.allow_development_anonymous_user_access and settings.app_env.strip().lower() in {"development", "dev"}:
return AuthRole.USER
_raise_unauthorized()
return _resolve_token_role(token=token, settings=settings)

View File

@@ -14,7 +14,7 @@ from fastapi.responses import FileResponse, Response, StreamingResponse
from sqlalchemy import or_, func, select
from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.core.config import get_settings, is_inline_preview_mime_type_safe
from app.db.base import get_session
from app.models.document import Document, DocumentStatus
from app.schemas.documents import (
@@ -448,14 +448,22 @@ def download_document(document_id: UUID, session: Session = Depends(get_session)
@router.get("/{document_id}/preview")
def preview_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
"""Streams the original document inline when browser rendering is supported."""
"""Streams trusted-safe MIME types inline and forces attachment for active script-capable types."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
if document is None:
raise HTTPException(status_code=404, detail="Document not found")
original_path = absolute_path(document.stored_relative_path)
return FileResponse(path=original_path, media_type=document.mime_type)
common_headers = {"X-Content-Type-Options": "nosniff"}
if not is_inline_preview_mime_type_safe(document.mime_type):
return FileResponse(
path=original_path,
filename=document.original_filename,
media_type="application/octet-stream",
headers=common_headers,
)
return FileResponse(path=original_path, media_type=document.mime_type, headers=common_headers)
@router.get("/{document_id}/thumbnail")

View File

@@ -19,6 +19,9 @@ class Settings(BaseSettings):
app_env: str = "development"
database_url: str = "postgresql+psycopg://dcm:dcm@db:5432/dcm"
redis_url: str = "redis://redis:6379/0"
redis_security_mode: str = "auto"
redis_tls_mode: str = "auto"
allow_development_anonymous_user_access: bool = True
storage_root: Path = Path("/data/storage")
upload_chunk_size: int = 4 * 1024 * 1024
max_upload_files_per_request: int = 50
@@ -26,6 +29,7 @@ class Settings(BaseSettings):
max_upload_request_size_bytes: int = 100 * 1024 * 1024
max_zip_members: int = 250
max_zip_depth: int = 2
max_zip_descendants_per_root: int = 1000
max_zip_member_uncompressed_bytes: int = 25 * 1024 * 1024
max_zip_total_uncompressed_bytes: int = 150 * 1024 * 1024
max_zip_compression_ratio: float = 120.0
@@ -44,12 +48,13 @@ class Settings(BaseSettings):
default_openai_timeout_seconds: int = 45
default_openai_handwriting_enabled: bool = True
default_openai_api_key: str = ""
app_settings_encryption_key: str = ""
default_summary_model: str = "gpt-4.1-mini"
default_routing_model: str = "gpt-4.1-mini"
typesense_protocol: str = "http"
typesense_host: str = "typesense"
typesense_port: int = 8108
typesense_api_key: str = "dcm-typesense-key"
typesense_api_key: str = ""
typesense_collection_name: str = "documents"
typesense_timeout_seconds: int = 120
typesense_num_retries: int = 0
@@ -58,6 +63,111 @@ class Settings(BaseSettings):
LOCAL_HOSTNAME_SUFFIXES = (".local", ".internal", ".home.arpa")
SCRIPT_CAPABLE_INLINE_MIME_TYPES = frozenset(
{
"application/ecmascript",
"application/javascript",
"application/x-javascript",
"application/xhtml+xml",
"image/svg+xml",
"text/ecmascript",
"text/html",
"text/javascript",
}
)
SCRIPT_CAPABLE_XML_MIME_TYPES = frozenset({"application/xml", "text/xml"})
REDIS_SECURITY_MODES = frozenset({"auto", "strict", "compat"})
REDIS_TLS_MODES = frozenset({"auto", "required", "allow_insecure"})
def _is_production_environment(app_env: str) -> bool:
"""Returns whether the runtime environment should enforce production-only security gates."""
normalized = app_env.strip().lower()
return normalized in {"production", "prod"}
def _normalize_redis_security_mode(raw_mode: str) -> str:
    """Coerces a raw Redis security mode value to a supported mode, defaulting to "auto"."""
    candidate = raw_mode.strip().lower()
    if candidate in REDIS_SECURITY_MODES:
        return candidate
    return "auto"
def _normalize_redis_tls_mode(raw_mode: str) -> str:
    """Coerces a raw Redis TLS mode value to a supported mode, defaulting to "auto"."""
    candidate = raw_mode.strip().lower()
    if candidate in REDIS_TLS_MODES:
        return candidate
    return "auto"
def validate_redis_url_security(
    redis_url: str,
    *,
    app_env: str | None = None,
    security_mode: str | None = None,
    tls_mode: str | None = None,
) -> str:
    """Validates Redis URL security posture with production fail-closed defaults.

    Args:
        redis_url: Candidate Redis connection URL.
        app_env: Optional environment override; defaults to configured settings.
        security_mode: Optional override for the security mode ("auto"/"strict"/"compat").
        tls_mode: Optional override for the TLS mode ("auto"/"required"/"allow_insecure").

    Returns:
        The stripped Redis URL when it satisfies the resolved security policy.

    Raises:
        ValueError: When the URL is empty, has a non-redis scheme, lacks a hostname,
            lacks required authentication, or lacks required TLS.
    """
    # Fetch configured settings lazily: a call supplying every override stays
    # side-effect free and does not depend on global configuration at all.
    settings = None
    if app_env is None or security_mode is None or tls_mode is None:
        settings = get_settings()
    resolved_app_env = app_env if app_env is not None else settings.app_env
    resolved_security_mode = _normalize_redis_security_mode(
        security_mode if security_mode is not None else settings.redis_security_mode
    )
    resolved_tls_mode = _normalize_redis_tls_mode(
        tls_mode if tls_mode is not None else settings.redis_tls_mode
    )
    candidate = redis_url.strip()
    if not candidate:
        raise ValueError("Redis URL must not be empty")
    parsed = urlparse(candidate)
    scheme = parsed.scheme.lower()
    if scheme not in {"redis", "rediss"}:
        raise ValueError("Redis URL must use redis:// or rediss://")
    if not parsed.hostname:
        raise ValueError("Redis URL must include a hostname")
    # "strict" always enforces auth; "auto" enforces it only in production environments.
    strict_security = resolved_security_mode == "strict" or (
        resolved_security_mode == "auto" and _is_production_environment(resolved_app_env)
    )
    # TLS is mandatory when explicitly required, or in "auto" once strict security applies.
    require_tls = resolved_tls_mode == "required" or (
        resolved_tls_mode == "auto" and strict_security
    )
    has_password = bool(parsed.password and parsed.password.strip())
    uses_tls = scheme == "rediss"
    if strict_security and not has_password:
        raise ValueError("Redis URL must include authentication when security mode is strict")
    if require_tls and not uses_tls:
        raise ValueError("Redis URL must use rediss:// when TLS is required")
    return candidate
def is_inline_preview_mime_type_safe(mime_type: str) -> bool:
    """Returns whether a MIME type may be rendered inline from untrusted document uploads."""
    if not mime_type:
        return False
    # Drop any parameters (e.g. "; charset=utf-8") before classification.
    normalized = mime_type.split(";", 1)[0].strip().lower()
    if not normalized:
        return False
    if normalized in SCRIPT_CAPABLE_INLINE_MIME_TYPES:
        return False
    # All XML-flavored types are treated as script-capable and forced to attachment.
    if normalized in SCRIPT_CAPABLE_XML_MIME_TYPES or normalized.endswith("+xml"):
        return False
    return True
def _normalize_allowlist(allowlist: object) -> tuple[str, ...]:

View File

@@ -1,7 +1,13 @@
"""Persistent single-user application settings service backed by host-mounted storage."""
import base64
import binascii
import hashlib
import hmac
import json
import os
import re
import secrets
from pathlib import Path
from typing import Any
@@ -57,6 +63,172 @@ DEFAULT_ROUTING_PROMPT = (
"Confidence must be between 0 and 1."
)
PROVIDER_API_KEY_CIPHERTEXT_PREFIX = "enc-v1"
PROVIDER_API_KEY_KEYFILE_NAME = ".settings-api-key"
PROVIDER_API_KEY_STREAM_CONTEXT = b"dcm-provider-api-key-stream"
PROVIDER_API_KEY_AUTH_CONTEXT = b"dcm-provider-api-key-auth"
PROVIDER_API_KEY_NONCE_BYTES = 16
PROVIDER_API_KEY_TAG_BYTES = 32
def _settings_api_key_path() -> Path:
"""Returns the storage path used for local symmetric encryption key persistence."""
return settings.storage_root / PROVIDER_API_KEY_KEYFILE_NAME
def _write_private_text_file(path: Path, content: str) -> None:
"""Writes text files with restrictive owner-only permissions for local secret material."""
path.parent.mkdir(parents=True, exist_ok=True)
file_descriptor = os.open(str(path), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(file_descriptor, "w", encoding="utf-8") as handle:
handle.write(content)
os.chmod(path, 0o600)
def _urlsafe_b64encode_no_padding(data: bytes) -> str:
"""Encodes bytes to URL-safe base64 without padding for compact JSON persistence."""
return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=")
def _urlsafe_b64decode_no_padding(data: str) -> bytes:
"""Decodes URL-safe base64 values that may omit trailing padding characters."""
padded = data + "=" * (-len(data) % 4)
return base64.urlsafe_b64decode(padded.encode("ascii"))
def _derive_provider_api_key_key() -> bytes:
    """Resolves the 32-byte master key used to encrypt provider API keys at rest.

    Resolution order: configured ``app_settings_encryption_key`` (base64 or
    SHA-256-stretched), then a previously persisted key file, then a freshly
    generated key that is written back with owner-only permissions.
    """
    configured = settings.app_settings_encryption_key.strip()
    if configured:
        try:
            raw_key = _urlsafe_b64decode_no_padding(configured)
            if len(raw_key) >= 32:
                return raw_key[:32]
        except (binascii.Error, ValueError):
            pass
        # Non-base64 (or too-short) configured values are stretched via SHA-256.
        return hashlib.sha256(configured.encode("utf-8")).digest()
    persisted_path = _settings_api_key_path()
    if persisted_path.exists():
        try:
            stored_text = persisted_path.read_text(encoding="utf-8").strip()
            raw_key = _urlsafe_b64decode_no_padding(stored_text)
            if len(raw_key) >= 32:
                return raw_key[:32]
        except (OSError, UnicodeDecodeError, binascii.Error, ValueError):
            # Unreadable/corrupt key files fall through to fresh key generation.
            pass
    fresh_key = secrets.token_bytes(32)
    _write_private_text_file(persisted_path, _urlsafe_b64encode_no_padding(fresh_key))
    return fresh_key
def _xor_bytes(left: bytes, right: bytes) -> bytes:
"""Applies byte-wise XOR for equal-length byte sequences."""
return bytes(first ^ second for first, second in zip(left, right))
def _derive_stream_cipher_bytes(master_key: bytes, nonce: bytes, length: int) -> bytes:
    """Derives a deterministic keystream of ``length`` bytes from counter-mode HMAC-SHA256 blocks."""
    keystream = bytearray()
    block_index = 0
    while len(keystream) < length:
        # Each block mixes the stream context, nonce, and a 4-byte big-endian counter.
        message = PROVIDER_API_KEY_STREAM_CONTEXT + nonce + block_index.to_bytes(4, "big")
        keystream.extend(hmac.new(master_key, message, hashlib.sha256).digest())
        block_index += 1
    return bytes(keystream[:length])
def _encrypt_provider_api_key(value: str) -> str:
    """Encrypts one provider API key for at-rest JSON persistence.

    Empty/whitespace-only input yields an empty string. The output format is
    ``enc-v1:<base64(nonce || ciphertext || hmac-tag)>``.
    """
    stripped = value.strip()
    if not stripped:
        return ""
    plaintext = stripped.encode("utf-8")
    master_key = _derive_provider_api_key_key()
    nonce = secrets.token_bytes(PROVIDER_API_KEY_NONCE_BYTES)
    keystream = _derive_stream_cipher_bytes(master_key, nonce, len(plaintext))
    ciphertext = _xor_bytes(plaintext, keystream)
    # Authenticate nonce + ciphertext so tampering is detectable at decrypt time.
    tag = hmac.new(
        master_key,
        PROVIDER_API_KEY_AUTH_CONTEXT + nonce + ciphertext,
        hashlib.sha256,
    ).digest()
    encoded = _urlsafe_b64encode_no_padding(nonce + ciphertext + tag)
    return f"{PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:{encoded}"
def _decrypt_provider_api_key(value: str) -> str:
    """Decrypts provider API key ciphertext while rejecting tampered payloads.

    Values lacking the ciphertext prefix are treated as legacy plaintext and
    returned unchanged; empty input yields an empty string.

    Raises:
        AppSettingsValidationError: On malformed base64, truncated payloads,
            integrity-check failure, or non-UTF-8 plaintext.
    """
    stripped = value.strip()
    if not stripped:
        return ""
    if not stripped.startswith(f"{PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:"):
        return stripped
    encoded_payload = stripped.split(":", 1)[1]
    if not encoded_payload:
        raise AppSettingsValidationError("Provider API key ciphertext is missing payload bytes")
    try:
        payload = _urlsafe_b64decode_no_padding(encoded_payload)
    except (binascii.Error, ValueError) as error:
        raise AppSettingsValidationError("Provider API key ciphertext is not valid base64") from error
    if len(payload) < PROVIDER_API_KEY_NONCE_BYTES + PROVIDER_API_KEY_TAG_BYTES:
        raise AppSettingsValidationError("Provider API key ciphertext payload is truncated")
    nonce = payload[:PROVIDER_API_KEY_NONCE_BYTES]
    ciphertext = payload[PROVIDER_API_KEY_NONCE_BYTES:-PROVIDER_API_KEY_TAG_BYTES]
    received_tag = payload[-PROVIDER_API_KEY_TAG_BYTES:]
    master_key = _derive_provider_api_key_key()
    expected_tag = hmac.new(
        master_key,
        PROVIDER_API_KEY_AUTH_CONTEXT + nonce + ciphertext,
        hashlib.sha256,
    ).digest()
    # Constant-time comparison avoids timing side channels on the auth tag.
    if not hmac.compare_digest(received_tag, expected_tag):
        raise AppSettingsValidationError("Provider API key ciphertext integrity check failed")
    keystream = _derive_stream_cipher_bytes(master_key, nonce, len(ciphertext))
    plaintext = _xor_bytes(ciphertext, keystream)
    try:
        return plaintext.decode("utf-8").strip()
    except UnicodeDecodeError as error:
        raise AppSettingsValidationError("Provider API key ciphertext is not valid UTF-8") from error
def _read_provider_api_key(provider_payload: dict[str, Any]) -> str:
    """Reads a provider API key from encrypted or legacy plaintext settings payloads."""
    encrypted_value = provider_payload.get("api_key_encrypted")
    if isinstance(encrypted_value, str) and encrypted_value.strip():
        try:
            return _decrypt_provider_api_key(encrypted_value)
        except AppSettingsValidationError:
            # Corrupt ciphertext degrades to "no key" rather than failing settings reads.
            return ""
    plaintext_value = provider_payload.get("api_key")
    return "" if plaintext_value is None else str(plaintext_value).strip()
def _default_settings() -> dict[str, Any]:
"""Builds default settings including providers and model task bindings."""
@@ -243,8 +415,17 @@ def _normalize_provider(
if provider_type != "openai_compatible":
provider_type = "openai_compatible"
api_key_value = payload.get("api_key", fallback_values.get("api_key", defaults["api_key"]))
api_key = str(api_key_value).strip() if api_key_value is not None else ""
payload_api_key = _read_provider_api_key(payload)
fallback_api_key = _read_provider_api_key(fallback_values)
default_api_key = _read_provider_api_key(defaults)
if "api_key" in payload and payload.get("api_key") is not None:
api_key = str(payload.get("api_key")).strip()
elif payload_api_key:
api_key = payload_api_key
elif fallback_api_key:
api_key = fallback_api_key
else:
api_key = default_api_key
raw_base_url = str(payload.get("base_url", fallback_values.get("base_url", defaults["base_url"]))).strip()
if not raw_base_url:
@@ -266,6 +447,7 @@ def _normalize_provider(
)
),
"api_key": api_key,
"api_key_encrypted": _encrypt_provider_api_key(api_key),
}
@@ -653,6 +835,26 @@ def _sanitize_settings(payload: dict[str, Any]) -> dict[str, Any]:
}
def _serialize_settings_for_storage(payload: dict[str, Any]) -> dict[str, Any]:
    """Converts a sanitized runtime payload into storage form with no plaintext provider keys."""
    storage_payload = dict(payload)
    stored_providers: list[dict[str, Any]] = []
    for provider in payload.get("providers", []):
        if not isinstance(provider, dict):
            continue
        provider_copy = dict(provider)
        # The plaintext key is removed from the stored shape in all cases.
        plaintext_key = str(provider_copy.pop("api_key", "")).strip()
        encrypted_key = str(provider_copy.get("api_key_encrypted", "")).strip()
        if plaintext_key:
            # A present plaintext key wins and is re-encrypted before persistence.
            encrypted_key = _encrypt_provider_api_key(plaintext_key)
        provider_copy["api_key_encrypted"] = encrypted_key
        stored_providers.append(provider_copy)
    storage_payload["providers"] = stored_providers
    return storage_payload
def ensure_app_settings() -> None:
"""Creates a settings file with defaults when no persisted settings are present."""
@@ -662,7 +864,7 @@ def ensure_app_settings() -> None:
return
defaults = _sanitize_settings(_default_settings())
path.write_text(json.dumps(defaults, indent=2), encoding="utf-8")
_write_private_text_file(path, json.dumps(_serialize_settings_for_storage(defaults), indent=2))
def _read_raw_settings() -> dict[str, Any]:
@@ -682,7 +884,8 @@ def _write_settings(payload: dict[str, Any]) -> None:
path = _settings_path()
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
storage_payload = _serialize_settings_for_storage(payload)
_write_private_text_file(path, json.dumps(storage_payload, indent=2))
def read_app_settings() -> dict[str, Any]:
@@ -879,16 +1082,21 @@ def update_app_settings(
def read_handwriting_provider_settings() -> dict[str, Any]:
"""Returns OCR settings in legacy shape for the handwriting transcription service."""
"""Returns OCR settings in legacy shape with DNS-revalidated provider base URL safety checks."""
runtime = read_task_runtime_settings(TASK_OCR_HANDWRITING)
provider = runtime["provider"]
task = runtime["task"]
raw_base_url = str(provider.get("base_url", settings.default_openai_base_url))
try:
normalized_base_url = normalize_and_validate_provider_base_url(raw_base_url, resolve_dns=True)
except ValueError as error:
raise AppSettingsValidationError(str(error)) from error
return {
"provider": provider["provider_type"],
"enabled": bool(task.get("enabled", True)),
"openai_base_url": str(provider.get("base_url", settings.default_openai_base_url)),
"openai_base_url": normalized_base_url,
"openai_model": str(task.get("model", settings.default_openai_model)),
"openai_timeout_seconds": int(provider.get("timeout_seconds", settings.default_openai_timeout_seconds)),
"openai_api_key": str(provider.get("api_key", "")),

View File

@@ -299,17 +299,24 @@ def extract_text_content(filename: str, data: bytes, mime_type: str) -> Extracti
)
def extract_archive_members(data: bytes, depth: int = 0) -> list[ArchiveMember]:
"""Extracts processable ZIP members within configured decompression safety budgets."""
def extract_archive_members(data: bytes, depth: int = 0, max_members: int | None = None) -> list[ArchiveMember]:
"""Extracts processable ZIP members with depth-aware and decompression safety guardrails."""
members: list[ArchiveMember] = []
if depth > settings.max_zip_depth:
normalized_depth = max(0, depth)
if normalized_depth >= settings.max_zip_depth:
return members
member_limit = settings.max_zip_members
if max_members is not None:
member_limit = max(0, min(settings.max_zip_members, int(max_members)))
if member_limit <= 0:
return members
total_uncompressed_bytes = 0
try:
with zipfile.ZipFile(io.BytesIO(data)) as archive:
infos = [info for info in archive.infolist() if not info.is_dir()][: settings.max_zip_members]
infos = [info for info in archive.infolist() if not info.is_dir()][:member_limit]
for info in infos:
if info.file_size <= 0:
continue

View File

@@ -10,6 +10,7 @@ from typing import Any
from openai import APIConnectionError, APIError, APITimeoutError, OpenAI
from PIL import Image, ImageOps
from app.core.config import normalize_and_validate_provider_base_url
from app.services.app_settings import DEFAULT_OCR_PROMPT, read_handwriting_provider_settings
MAX_IMAGE_SIDE = 2000
@@ -151,12 +152,17 @@ def _normalize_image_bytes(image_data: bytes) -> tuple[bytes, str]:
def _create_client(provider_settings: dict[str, Any]) -> OpenAI:
"""Creates an OpenAI client configured for compatible endpoints and timeouts."""
"""Creates an OpenAI client configured with DNS-revalidated endpoint and request timeout controls."""
api_key = str(provider_settings.get("openai_api_key", "")).strip() or "no-key-required"
raw_base_url = str(provider_settings.get("openai_base_url", "")).strip()
try:
normalized_base_url = normalize_and_validate_provider_base_url(raw_base_url, resolve_dns=True)
except ValueError as error:
raise HandwritingTranscriptionError(f"invalid_provider_base_url:{error}") from error
return OpenAI(
api_key=api_key,
base_url=str(provider_settings["openai_base_url"]),
base_url=normalized_base_url,
timeout=int(provider_settings["openai_timeout_seconds"]),
)

View File

@@ -3,16 +3,17 @@
from redis import Redis
from rq import Queue
from app.core.config import get_settings
from app.core.config import get_settings, validate_redis_url_security
settings = get_settings()
def get_redis() -> Redis:
"""Creates a Redis connection from configured URL."""
"""Creates a Redis connection after enforcing URL security policy checks."""
return Redis.from_url(settings.redis_url)
secure_redis_url = validate_redis_url_security(settings.redis_url)
return Redis.from_url(secure_redis_url)
def get_processing_queue() -> Queue:

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.db.base import SessionLocal
from app.models.document import Document, DocumentStatus
from app.services.app_settings import (
@@ -37,6 +38,13 @@ from app.services.storage import absolute_path, compute_sha256, store_bytes, wri
from app.worker.queue import get_processing_queue
settings = get_settings()
ARCHIVE_ROOT_ID_METADATA_KEY = "archive_root_document_id"
ARCHIVE_DEPTH_METADATA_KEY = "archive_depth"
ARCHIVE_DESCENDANT_COUNT_METADATA_KEY = "archive_descendant_count"
def _cleanup_processing_logs_with_settings(session: Session) -> None:
"""Applies configured processing log retention while trimming old log entries."""
@@ -48,13 +56,80 @@ def _cleanup_processing_logs_with_settings(session: Session) -> None:
)
def _metadata_non_negative_int(value: object, fallback: int = 0) -> int:
"""Parses metadata values as non-negative integers with safe fallback behavior."""
try:
parsed = int(value)
except (TypeError, ValueError):
return fallback
return max(0, parsed)
def _metadata_uuid(value: object) -> uuid.UUID | None:
"""Parses metadata values as UUIDs while tolerating malformed legacy values."""
if not isinstance(value, str) or not value.strip():
return None
try:
return uuid.UUID(value.strip())
except ValueError:
return None
def _resolve_archive_lineage(session: Session, document: Document) -> tuple[uuid.UUID, int]:
    """Resolves the archive root document id and depth for metadata propagation.

    Prefers lineage recorded in document metadata; otherwise walks the parent
    chain (cycle-safe via a visited set) to recover root and depth for legacy rows.
    """
    metadata_json = dict(document.metadata_json)
    recorded_root = _metadata_uuid(metadata_json.get(ARCHIVE_ROOT_ID_METADATA_KEY))
    recorded_depth = _metadata_non_negative_int(metadata_json.get(ARCHIVE_DEPTH_METADATA_KEY), fallback=0)
    if recorded_root is not None:
        return recorded_root, recorded_depth
    if not document.is_archive_member:
        # Non-member documents are their own archive root at depth zero.
        return document.id, 0
    depth = 0
    root_document_id = document.id
    current_parent_id = document.parent_document_id
    visited: set[uuid.UUID] = {document.id}
    while current_parent_id is not None and current_parent_id not in visited:
        visited.add(current_parent_id)
        parent_row = session.execute(
            select(Document).where(Document.id == current_parent_id)
        ).scalar_one_or_none()
        if parent_row is None:
            break
        depth += 1
        root_document_id = parent_row.id
        current_parent_id = parent_row.parent_document_id
    return root_document_id, depth
def _merge_archive_metadata(document: Document, **updates: object) -> None:
    """Applies archive metadata updates without clobbering unrelated metadata keys."""
    # Copy-then-reassign so ORM change tracking observes a new dict instance.
    merged = dict(document.metadata_json)
    merged.update(updates)
    document.metadata_json = merged
def _load_archive_root_for_update(session: Session, root_document_id: uuid.UUID) -> Document | None:
    """Loads the archive root row with a write lock to serialize descendant-budget updates."""
    statement = select(Document).where(Document.id == root_document_id).with_for_update()
    return session.execute(statement).scalar_one_or_none()
def _create_archive_member_document(
parent: Document,
member_name: str,
member_data: bytes,
mime_type: str,
archive_root_document_id: uuid.UUID,
archive_depth: int,
) -> Document:
"""Creates a child document entity for a file extracted from an uploaded archive."""
"""Creates child document entities with lineage metadata for recursive archive processing."""
extension = Path(member_name).suffix.lower()
stored_relative_path = store_bytes(member_name, member_data)
@@ -68,7 +143,12 @@ def _create_archive_member_document(
size_bytes=len(member_data),
logical_path=parent.logical_path,
tags=list(parent.tags),
metadata_json={"origin": "archive", "parent": str(parent.id)},
metadata_json={
"origin": "archive",
"parent": str(parent.id),
ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
},
is_archive_member=True,
archived_member_path=member_name,
parent_document_id=parent.id,
@@ -110,16 +190,46 @@ def process_document_task(document_id: str) -> None:
if document.extension == ".zip":
child_ids: list[str] = []
archive_root_document_id, archive_depth = _resolve_archive_lineage(session=session, document=document)
_merge_archive_metadata(
document,
**{
ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
},
)
root_document = _load_archive_root_for_update(session=session, root_document_id=archive_root_document_id)
if root_document is None:
root_document = document
root_metadata_json = dict(root_document.metadata_json)
existing_descendant_count = _metadata_non_negative_int(
root_metadata_json.get(ARCHIVE_DESCENDANT_COUNT_METADATA_KEY),
fallback=0,
)
max_descendants_per_root = max(0, int(settings.max_zip_descendants_per_root))
remaining_descendant_budget = max(0, max_descendants_per_root - existing_descendant_count)
extraction_member_cap = remaining_descendant_budget
log_processing_event(
session=session,
stage="archive",
event="Archive extraction started",
level="info",
document=document,
payload_json={"size_bytes": len(data)},
payload_json={
"size_bytes": len(data),
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth,
"remaining_descendant_budget": remaining_descendant_budget,
},
)
try:
members = extract_archive_members(data)
members = extract_archive_members(
data,
depth=archive_depth,
max_members=extraction_member_cap,
)
for member in members:
mime_type = sniff_mime(member.data)
child = _create_archive_member_document(
@@ -127,6 +237,8 @@ def process_document_task(document_id: str) -> None:
member_name=member.name,
member_data=member.data,
mime_type=mime_type,
archive_root_document_id=archive_root_document_id,
archive_depth=archive_depth + 1,
)
session.add(child)
session.flush()
@@ -142,8 +254,27 @@ def process_document_task(document_id: str) -> None:
"member_name": member.name,
"member_size_bytes": len(member.data),
"mime_type": mime_type,
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth + 1,
},
)
updated_root_metadata = dict(root_document.metadata_json)
updated_root_metadata[ARCHIVE_ROOT_ID_METADATA_KEY] = str(archive_root_document_id)
updated_root_metadata[ARCHIVE_DEPTH_METADATA_KEY] = 0
updated_root_metadata[ARCHIVE_DESCENDANT_COUNT_METADATA_KEY] = existing_descendant_count + len(child_ids)
root_document.metadata_json = updated_root_metadata
limit_flags: dict[str, object] = {}
if archive_depth >= settings.max_zip_depth:
limit_flags["max_depth_reached"] = True
if remaining_descendant_budget <= 0:
limit_flags["max_descendants_reached"] = True
elif len(child_ids) >= remaining_descendant_budget:
limit_flags["max_descendants_reached"] = True
if limit_flags:
_merge_archive_metadata(document, **limit_flags)
document.status = DocumentStatus.PROCESSED
document.extracted_text = f"archive with {len(members)} files"
log_processing_event(
@@ -152,7 +283,13 @@ def process_document_task(document_id: str) -> None:
event="Archive extraction completed",
level="info",
document=document,
payload_json={"member_count": len(members)},
payload_json={
"member_count": len(members),
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth,
"descendant_count": existing_descendant_count + len(child_ids),
"remaining_descendant_budget": max(0, remaining_descendant_budget - len(child_ids)),
},
)
except Exception as exc:
document.status = DocumentStatus.ERROR
@@ -231,7 +368,10 @@ def process_document_task(document_id: str) -> None:
event="Archive child job enqueued",
level="info",
document_id=uuid.UUID(child_id),
payload_json={"parent_document_id": str(document.id)},
payload_json={
"parent_document_id": str(document.id),
"archive_root_document_id": str(archive_root_document_id),
},
)
session.commit()
return

View File

@@ -144,6 +144,87 @@ class AppSettingsProviderResilienceTests(unittest.TestCase):
app_settings.update_app_settings(providers=[provider_update])
write_settings_mock.assert_not_called()
def test_sanitize_settings_migrates_legacy_plaintext_api_key_to_encrypted_field(self) -> None:
    """Legacy plaintext API keys remain readable and gain an encrypted storage representation."""
    legacy_payload = {
        "providers": [
            {
                "id": "secure-provider",
                "label": "Secure Provider",
                "provider_type": "openai_compatible",
                "base_url": "https://api.openai.com/v1",
                "timeout_seconds": 45,
                "api_key": "legacy-plaintext-secret",
            }
        ],
        "tasks": {
            task_name: {"provider_id": "secure-provider"}
            for task_name in (
                app_settings.TASK_OCR_HANDWRITING,
                app_settings.TASK_SUMMARY_GENERATION,
                app_settings.TASK_ROUTING_CLASSIFICATION,
            )
        },
    }
    # Pin the master key so encryption is deterministic for the assertion below.
    with patch.object(app_settings, "_derive_provider_api_key_key", return_value=b"k" * 32):
        sanitized = app_settings._sanitize_settings(legacy_payload)
    provider = sanitized["providers"][0]
    self.assertEqual(provider["api_key"], "legacy-plaintext-secret")
    expected_prefix = f"{app_settings.PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:"
    self.assertTrue(str(provider.get("api_key_encrypted", "")).startswith(expected_prefix))
def test_serialize_settings_for_storage_excludes_plaintext_api_key(self) -> None:
    """Storage payload serialization persists encrypted provider API keys only."""
    payload = _sample_current_payload()
    # Force the "plaintext present, ciphertext missing" state the serializer
    # must repair before writing to disk.
    payload["providers"][0]["api_key"] = "storage-secret"
    payload["providers"][0]["api_key_encrypted"] = ""
    # Deterministic derived key keeps ciphertext generation reproducible.
    with patch.object(app_settings, "_derive_provider_api_key_key", return_value=b"s" * 32):
        storage_payload = app_settings._serialize_settings_for_storage(payload)
    provider_storage = storage_payload["providers"][0]
    # Plaintext must never reach the persisted payload ...
    self.assertNotIn("api_key", provider_storage)
    # ... only the prefixed encrypted representation is written.
    self.assertTrue(
        str(provider_storage.get("api_key_encrypted", "")).startswith(
            f"{app_settings.PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:"
        )
    )
def test_read_handwriting_provider_settings_revalidates_dns(self) -> None:
    """OCR runtime provider settings enforce DNS revalidation before creating outbound clients."""
    # Runtime settings payload as produced by read_task_runtime_settings for
    # the handwriting OCR task.
    runtime_payload = {
        "provider": {
            "id": "openai-default",
            "provider_type": "openai_compatible",
            "base_url": "https://api.openai.com/v1",
            "timeout_seconds": 45,
            "api_key": "runtime-secret",
        },
        "task": {
            "enabled": True,
            "model": "gpt-4.1-mini",
            "prompt": "prompt",
        },
    }
    with (
        patch.object(app_settings, "read_task_runtime_settings", return_value=runtime_payload),
        patch.object(
            app_settings,
            "normalize_and_validate_provider_base_url",
            return_value="https://api.openai.com/v1",
        ) as normalize_mock,
    ):
        runtime_settings = app_settings.read_handwriting_provider_settings()
    # DNS revalidation must be requested explicitly (resolve_dns=True) for
    # outbound runtime calls, not just at settings-save time.
    normalize_mock.assert_called_once_with("https://api.openai.com/v1", resolve_dns=True)
    self.assertEqual(runtime_settings["openai_base_url"], "https://api.openai.com/v1")
    self.assertEqual(runtime_settings["openai_api_key"], "runtime-secret")
# Allows running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -3,12 +3,15 @@
from __future__ import annotations
from datetime import UTC, datetime
import io
import socket
import sys
import uuid
from pathlib import Path
from types import ModuleType, SimpleNamespace
import unittest
from unittest.mock import patch
import zipfile
BACKEND_ROOT = Path(__file__).resolve().parents[1]
@@ -83,6 +86,191 @@ if "fastapi.security" not in sys.modules:
fastapi_security_stub.HTTPBearer = _HTTPBearer
sys.modules["fastapi.security"] = fastapi_security_stub
# Dependency-light import stubs: each guarded block installs a minimal fake
# module into sys.modules so backend modules import cleanly without the real
# third-party packages (python-magic, python-docx, openpyxl, Pillow, pypdf,
# PyMuPDF) or heavyweight internal services being importable in this test
# environment. Guards keep real packages intact when they are installed.
if "magic" not in sys.modules:
    magic_stub = ModuleType("magic")

    def _from_buffer(_data: bytes, mime: bool = True) -> str:
        """Returns deterministic fallback MIME values for extractor import stubs."""
        return "application/octet-stream" if mime else ""

    magic_stub.from_buffer = _from_buffer
    sys.modules["magic"] = magic_stub

if "docx" not in sys.modules:
    docx_stub = ModuleType("docx")

    class _DocxDocument:
        """Minimal docx document stub for extractor import compatibility."""

        def __init__(self, *_args: object, **_kwargs: object) -> None:
            # Empty paragraph list mirrors the attribute extractor code reads.
            self.paragraphs: list[SimpleNamespace] = []

    docx_stub.Document = _DocxDocument
    sys.modules["docx"] = docx_stub

if "openpyxl" not in sys.modules:
    openpyxl_stub = ModuleType("openpyxl")

    class _Workbook:
        """Minimal workbook stub for extractor import compatibility."""

        worksheets: list[SimpleNamespace] = []

    def _load_workbook(*_args: object, **_kwargs: object) -> _Workbook:
        """Returns deterministic workbook placeholder for extractor import stubs."""
        return _Workbook()

    openpyxl_stub.load_workbook = _load_workbook
    sys.modules["openpyxl"] = openpyxl_stub

if "PIL" not in sys.modules:
    pil_stub = ModuleType("PIL")

    class _Image:
        """Minimal PIL.Image replacement for extractor and handwriting import stubs."""

        class Resampling:
            """Minimal enum-like namespace used by handwriting image resize path."""

            LANCZOS = 1

        @staticmethod
        def open(*_args: object, **_kwargs: object) -> "_Image":
            """Raises for unsupported image operations in dependency-light tests."""
            raise RuntimeError("Image.open is not available in stub")

    class _ImageOps:
        """Minimal PIL.ImageOps replacement for import compatibility."""

        @staticmethod
        def exif_transpose(image: object) -> object:
            """Returns original image object unchanged in dependency-light tests."""
            return image

    pil_stub.Image = _Image
    pil_stub.ImageOps = _ImageOps
    sys.modules["PIL"] = pil_stub

if "pypdf" not in sys.modules:
    pypdf_stub = ModuleType("pypdf")

    class _PdfReader:
        """Minimal PdfReader replacement for extractor import compatibility."""

        def __init__(self, *_args: object, **_kwargs: object) -> None:
            # Empty pages list mirrors the attribute extractor code reads.
            self.pages: list[SimpleNamespace] = []

    pypdf_stub.PdfReader = _PdfReader
    sys.modules["pypdf"] = pypdf_stub

if "pymupdf" not in sys.modules:
    pymupdf_stub = ModuleType("pymupdf")

    class _Matrix:
        """Minimal matrix placeholder for extractor import compatibility."""

        def __init__(self, *_args: object, **_kwargs: object) -> None:
            pass

    def _open(*_args: object, **_kwargs: object) -> object:
        """Raises when preview rendering is invoked in dependency-light tests."""
        raise RuntimeError("pymupdf is not available in stub")

    pymupdf_stub.Matrix = _Matrix
    pymupdf_stub.open = _open
    sys.modules["pymupdf"] = pymupdf_stub

if "app.services.handwriting" not in sys.modules:
    handwriting_stub = ModuleType("app.services.handwriting")

    class _HandwritingError(Exception):
        """Minimal base error class for extractor import compatibility."""

    class _HandwritingNotConfiguredError(_HandwritingError):
        """Minimal not-configured error class for extractor import compatibility."""

    class _HandwritingTimeoutError(_HandwritingError):
        """Minimal timeout error class for extractor import compatibility."""

    def _classify_image_text_bytes(*_args: object, **_kwargs: object) -> SimpleNamespace:
        """Returns deterministic image text classification fallback."""
        return SimpleNamespace(label="unknown", confidence=0.0, provider="stub", model="stub")

    def _transcribe_handwriting_bytes(*_args: object, **_kwargs: object) -> SimpleNamespace:
        """Returns deterministic handwriting transcription fallback."""
        return SimpleNamespace(text="", uncertainties=[], provider="stub", model="stub")

    # Re-export the module-level constants and public names extractor imports.
    handwriting_stub.IMAGE_TEXT_TYPE_NO_TEXT = "no_text"
    handwriting_stub.IMAGE_TEXT_TYPE_UNKNOWN = "unknown"
    handwriting_stub.IMAGE_TEXT_TYPE_HANDWRITING = "handwriting"
    handwriting_stub.HandwritingTranscriptionError = _HandwritingError
    handwriting_stub.HandwritingTranscriptionNotConfiguredError = _HandwritingNotConfiguredError
    handwriting_stub.HandwritingTranscriptionTimeoutError = _HandwritingTimeoutError
    handwriting_stub.classify_image_text_bytes = _classify_image_text_bytes
    handwriting_stub.transcribe_handwriting_bytes = _transcribe_handwriting_bytes
    sys.modules["app.services.handwriting"] = handwriting_stub

if "app.services.handwriting_style" not in sys.modules:
    handwriting_style_stub = ModuleType("app.services.handwriting_style")

    def _assign_handwriting_style(*_args: object, **_kwargs: object) -> SimpleNamespace:
        """Returns deterministic style assignment payload for worker import compatibility."""
        return SimpleNamespace(
            style_cluster_id="cluster-1",
            matched_existing=False,
            similarity=0.0,
            vector_distance=0.0,
            compared_neighbors=0,
            match_min_similarity=0.0,
            bootstrap_match_min_similarity=0.0,
        )

    def _delete_handwriting_style_document(*_args: object, **_kwargs: object) -> None:
        """No-op style document delete stub for worker import compatibility."""
        return None

    handwriting_style_stub.assign_handwriting_style = _assign_handwriting_style
    handwriting_style_stub.delete_handwriting_style_document = _delete_handwriting_style_document
    sys.modules["app.services.handwriting_style"] = handwriting_style_stub

if "app.services.routing_pipeline" not in sys.modules:
    routing_pipeline_stub = ModuleType("app.services.routing_pipeline")

    def _apply_routing_decision(*_args: object, **_kwargs: object) -> None:
        """No-op routing application stub for worker import compatibility."""
        return None

    def _classify_document_routing(*_args: object, **_kwargs: object) -> dict[str, object]:
        """Returns deterministic routing decision payload for worker import compatibility."""
        return {"chosen_path": None, "chosen_tags": []}

    def _summarize_document(*_args: object, **_kwargs: object) -> str:
        """Returns deterministic summary text for worker import compatibility."""
        return "summary"

    def _upsert_semantic_index(*_args: object, **_kwargs: object) -> None:
        """No-op semantic index update stub for worker import compatibility."""
        return None

    routing_pipeline_stub.apply_routing_decision = _apply_routing_decision
    routing_pipeline_stub.classify_document_routing = _classify_document_routing
    routing_pipeline_stub.summarize_document = _summarize_document
    routing_pipeline_stub.upsert_semantic_index = _upsert_semantic_index
    sys.modules["app.services.routing_pipeline"] = routing_pipeline_stub
from fastapi import HTTPException
from fastapi.security import HTTPAuthorizationCredentials
@@ -90,6 +278,8 @@ from app.api.auth import AuthRole, get_request_role, require_admin
from app.core import config as config_module
from app.models.processing_log import sanitize_processing_log_payload_value, sanitize_processing_log_text
from app.schemas.processing_logs import ProcessingLogEntryResponse
from app.services import extractor as extractor_module
from app.worker import tasks as worker_tasks_module
def _security_settings(
@@ -113,7 +303,12 @@ class AuthDependencyTests(unittest.TestCase):
def test_get_request_role_accepts_admin_token(self) -> None:
"""Admin token resolves admin role."""
settings = SimpleNamespace(admin_api_token="admin-token", user_api_token="user-token")
settings = SimpleNamespace(
admin_api_token="admin-token",
user_api_token="user-token",
allow_development_anonymous_user_access=False,
app_env="production",
)
credentials = HTTPAuthorizationCredentials(scheme="Bearer", credentials="admin-token")
role = get_request_role(credentials=credentials, settings=settings)
self.assertEqual(role, AuthRole.ADMIN)
@@ -121,11 +316,28 @@ class AuthDependencyTests(unittest.TestCase):
def test_get_request_role_rejects_missing_credentials(self) -> None:
"""Missing bearer credentials return 401."""
settings = SimpleNamespace(admin_api_token="admin-token", user_api_token="user-token")
settings = SimpleNamespace(
admin_api_token="admin-token",
user_api_token="user-token",
allow_development_anonymous_user_access=False,
app_env="production",
)
with self.assertRaises(HTTPException) as context:
get_request_role(credentials=None, settings=settings)
self.assertEqual(context.exception.status_code, 401)
def test_get_request_role_allows_tokenless_user_access_in_development(self) -> None:
    """Development mode can allow tokenless user role for compatibility."""
    # Development settings with the anonymous-access escape hatch enabled.
    dev_flags = {
        "admin_api_token": "admin-token",
        "user_api_token": "user-token",
        "allow_development_anonymous_user_access": True,
        "app_env": "development",
    }
    resolved_role = get_request_role(credentials=None, settings=SimpleNamespace(**dev_flags))
    self.assertEqual(resolved_role, AuthRole.USER)
def test_require_admin_rejects_user_role(self) -> None:
"""User role cannot access admin-only endpoints."""
@@ -202,6 +414,239 @@ class ProviderBaseUrlValidationTests(unittest.TestCase):
self.assertEqual(getaddrinfo_mock.call_count, 2)
class RedisQueueSecurityTests(unittest.TestCase):
    """Verifies Redis URL security policy behavior for compatibility and strict environments."""

    def test_auto_mode_allows_insecure_redis_url_in_development(self) -> None:
        """Development mode stays backward-compatible with local unauthenticated redis URLs."""
        normalized = config_module.validate_redis_url_security(
            "redis://redis:6379/0",
            app_env="development",
            security_mode="auto",
            tls_mode="auto",
        )
        # URL is passed through unchanged when the development auto policy accepts it.
        self.assertEqual(normalized, "redis://redis:6379/0")

    def test_auto_mode_rejects_missing_auth_in_production(self) -> None:
        """Production auto mode fails closed when Redis URL omits authentication."""
        # TLS scheme alone (rediss://) is not enough: credentials are required too.
        with self.assertRaises(ValueError):
            config_module.validate_redis_url_security(
                "rediss://redis:6379/0",
                app_env="production",
                security_mode="auto",
                tls_mode="auto",
            )

    def test_auto_mode_rejects_plaintext_redis_in_production(self) -> None:
        """Production auto mode requires TLS transport for Redis URLs."""
        # Authenticated but plaintext redis:// is still rejected in production.
        with self.assertRaises(ValueError):
            config_module.validate_redis_url_security(
                "redis://:password@redis:6379/0",
                app_env="production",
                security_mode="auto",
                tls_mode="auto",
            )

    def test_strict_mode_enforces_auth_and_tls_outside_production(self) -> None:
        """Strict mode enforces production-grade Redis controls in all environments."""
        # Unauthenticated plaintext URL fails even in development under strict mode.
        with self.assertRaises(ValueError):
            config_module.validate_redis_url_security(
                "redis://redis:6379/0",
                app_env="development",
                security_mode="strict",
                tls_mode="auto",
            )
        # Authenticated TLS URL passes validation unchanged.
        normalized = config_module.validate_redis_url_security(
            "rediss://:password@redis:6379/0",
            app_env="development",
            security_mode="strict",
            tls_mode="auto",
        )
        self.assertEqual(normalized, "rediss://:password@redis:6379/0")

    def test_compat_mode_allows_insecure_redis_in_production_for_safe_migration(self) -> None:
        """Compatibility mode keeps legacy production Redis URLs usable during migration windows."""
        # Explicit compat + allow_insecure opt-outs keep the legacy URL working.
        normalized = config_module.validate_redis_url_security(
            "redis://redis:6379/0",
            app_env="production",
            security_mode="compat",
            tls_mode="allow_insecure",
        )
        self.assertEqual(normalized, "redis://redis:6379/0")
class PreviewMimeSafetyTests(unittest.TestCase):
    """Verifies inline preview MIME safety checks for uploaded document responses."""

    def test_preview_blocks_script_capable_html_and_svg_types(self) -> None:
        """HTML and SVG MIME types are rejected for inline preview responses."""
        for blocked_mime in ("text/html", "image/svg+xml"):
            self.assertFalse(config_module.is_inline_preview_mime_type_safe(blocked_mime))

    def test_preview_allows_pdf_and_safe_image_types(self) -> None:
        """PDF and raster image MIME types stay eligible for inline preview responses."""
        for allowed_mime in ("application/pdf", "image/png"):
            self.assertTrue(config_module.is_inline_preview_mime_type_safe(allowed_mime))
def _build_zip_bytes(entries: dict[str, bytes]) -> bytes:
"""Builds in-memory ZIP bytes for archive extraction guardrail tests."""
output = io.BytesIO()
with zipfile.ZipFile(output, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
for filename, payload in entries.items():
archive.writestr(filename, payload)
return output.getvalue()
class ArchiveExtractionGuardrailTests(unittest.TestCase):
    """Verifies depth-aware archive extraction and per-call member cap enforcement."""

    def test_extract_archive_members_rejects_depth_at_configured_limit(self) -> None:
        """Archive member extraction is disabled at or beyond configured depth ceiling."""
        archive_bytes = _build_zip_bytes({"sample.txt": b"sample"})
        # max_zip_depth=2 combined with depth=2 puts the call exactly at the ceiling.
        patched_settings = SimpleNamespace(
            max_zip_depth=2,
            max_zip_members=250,
            max_zip_member_uncompressed_bytes=25 * 1024 * 1024,
            max_zip_total_uncompressed_bytes=150 * 1024 * 1024,
            max_zip_compression_ratio=120.0,
        )
        with patch.object(extractor_module, "settings", patched_settings):
            members = extractor_module.extract_archive_members(archive_bytes, depth=2)
        # At the limit no members are extracted at all.
        self.assertEqual(members, [])

    def test_extract_archive_members_respects_member_cap_argument(self) -> None:
        """Archive extraction truncates results when caller-provided member cap is lower than archive size."""
        archive_bytes = _build_zip_bytes(
            {
                "one.txt": b"1",
                "two.txt": b"2",
                "three.txt": b"3",
            }
        )
        patched_settings = SimpleNamespace(
            max_zip_depth=3,
            max_zip_members=250,
            max_zip_member_uncompressed_bytes=25 * 1024 * 1024,
            max_zip_total_uncompressed_bytes=150 * 1024 * 1024,
            max_zip_compression_ratio=120.0,
        )
        with patch.object(extractor_module, "settings", patched_settings):
            members = extractor_module.extract_archive_members(archive_bytes, depth=0, max_members=1)
        # The caller-supplied cap (1) wins over the global member limit (250).
        self.assertEqual(len(members), 1)
class ArchiveLineagePropagationTests(unittest.TestCase):
    """Verifies archive lineage metadata propagation helpers used by worker descendant queueing."""

    def test_create_archive_member_document_persists_lineage_metadata(self) -> None:
        """Child archive documents include root id and incremented depth metadata."""
        parent_id = uuid.uuid4()
        parent = SimpleNamespace(
            id=parent_id,
            source_relative_path="uploads/root.zip",
            logical_path="Inbox",
            tags=["finance"],
        )
        # Stub storage and hashing side effects so the helper runs without disk I/O.
        with (
            patch.object(worker_tasks_module, "store_bytes", return_value="stored/path/child.zip"),
            patch.object(worker_tasks_module, "compute_sha256", return_value="deadbeef"),
        ):
            child = worker_tasks_module._create_archive_member_document(
                parent=parent,
                member_name="nested/child.zip",
                member_data=b"zip-bytes",
                mime_type="application/zip",
                archive_root_document_id=parent_id,
                archive_depth=1,
            )
        self.assertEqual(child.parent_document_id, parent_id)
        # Lineage metadata: root id is persisted as a string, depth as an int.
        self.assertEqual(child.metadata_json.get(worker_tasks_module.ARCHIVE_ROOT_ID_METADATA_KEY), str(parent_id))
        self.assertEqual(child.metadata_json.get(worker_tasks_module.ARCHIVE_DEPTH_METADATA_KEY), 1)
        self.assertTrue(child.is_archive_member)

    def test_resolve_archive_lineage_prefers_existing_metadata(self) -> None:
        """Existing archive lineage metadata is reused without traversing parent relationships."""
        root_id = uuid.uuid4()
        document = SimpleNamespace(
            id=uuid.uuid4(),
            metadata_json={
                worker_tasks_module.ARCHIVE_ROOT_ID_METADATA_KEY: str(root_id),
                worker_tasks_module.ARCHIVE_DEPTH_METADATA_KEY: 3,
            },
            is_archive_member=True,
            parent_document_id=uuid.uuid4(),
        )

        class _SessionShouldNotBeUsed:
            """Fails test if lineage helper performs unnecessary parent traversals."""

            def execute(self, _statement: object) -> object:
                # Any DB access here means the helper ignored the cached metadata.
                raise AssertionError("session.execute should not be called when metadata is present")

        resolved_root, resolved_depth = worker_tasks_module._resolve_archive_lineage(
            session=_SessionShouldNotBeUsed(),
            document=document,
        )
        self.assertEqual(resolved_root, root_id)
        self.assertEqual(resolved_depth, 3)

    def test_resolve_archive_lineage_walks_parent_chain_when_metadata_missing(self) -> None:
        """Lineage fallback traverses parent references to recover root id and depth."""
        # Chain under test: document -> parent_document -> root_document (no metadata cached).
        root_id = uuid.uuid4()
        parent_id = uuid.uuid4()
        root_document = SimpleNamespace(id=root_id, parent_document_id=None)
        parent_document = SimpleNamespace(id=parent_id, parent_document_id=root_id)
        document = SimpleNamespace(
            id=uuid.uuid4(),
            metadata_json={},
            is_archive_member=True,
            parent_document_id=parent_id,
        )

        class _ScalarResult:
            """Wraps scalar ORM results for deterministic worker helper tests."""

            def __init__(self, value: object) -> None:
                self._value = value

            def scalar_one_or_none(self) -> object:
                return self._value

        class _SequenceSession:
            """Returns predetermined parent rows in traversal order."""

            def __init__(self, values: list[object]) -> None:
                self._values = values

            def execute(self, _statement: object) -> _ScalarResult:
                # Pop in FIFO order so each lookup yields the next ancestor.
                next_value = self._values.pop(0) if self._values else None
                return _ScalarResult(next_value)

        resolved_root, resolved_depth = worker_tasks_module._resolve_archive_lineage(
            session=_SequenceSession([parent_document, root_document]),
            document=document,
        )
        self.assertEqual(resolved_root, root_id)
        # Two hops from document to root gives depth 2.
        self.assertEqual(resolved_depth, 2)
class ProcessingLogRedactionTests(unittest.TestCase):
"""Verifies sensitive processing-log values are redacted for persistence and responses."""

View File

@@ -12,7 +12,8 @@ Primary implementation modules:
## Authentication And Authorization
- Protected endpoints require `Authorization: Bearer <token>` in production.
- Development deployments can allow tokenless user-role access for `documents/*` and `search/*` when `ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS=true`.
- `ADMIN_API_TOKEN` is required for all privileged access and acts as fail-closed root credential.
- `USER_API_TOKEN` is optional and, when configured, grants access to document endpoints only.
- Authorization matrix:
@@ -29,7 +30,8 @@ Primary implementation modules:
## Documents
- Access: admin or user token required
- Access: admin or user token required (production)
- Access: admin or user token, or development tokenless user fallback when enabled
### Collection and metadata helpers
@@ -56,7 +58,8 @@ Primary implementation modules:
- `GET /documents/{document_id}/download`
- Response: original file bytes
- `GET /documents/{document_id}/preview`
- Response: inline preview stream only for safe MIME types
- Behavior: script-capable MIME types are forced to attachment responses with `X-Content-Type-Options: nosniff`
- `GET /documents/{document_id}/thumbnail`
- Response: generated thumbnail image when available
- `GET /documents/{document_id}/content-md`
@@ -126,10 +129,12 @@ Primary implementation modules:
- `GET /settings`
- Response model: `AppSettingsResponse`
- persisted providers with invalid base URLs are ignored during read sanitization; response falls back to remaining valid providers or secure defaults
- provider API keys are exposed only as `api_key_set` and `api_key_masked`
- `PATCH /settings`
- Body model: `AppSettingsUpdateRequest`
- Response model: `AppSettingsResponse`
- rejects invalid provider base URLs with `400` when scheme, allowlist, or network safety checks fail
- provider API keys are persisted encrypted at rest (`api_key_encrypted`) and plaintext keys are not written to storage
- `POST /settings/reset`
- Response model: `AppSettingsResponse`
- `PATCH /settings/handwriting`

View File

@@ -6,9 +6,9 @@ DMS runs as a multi-service application defined in `docker-compose.yml`:
- `frontend` serves the React UI on port `5173`
- `api` serves FastAPI on port `8000`
- `worker` executes asynchronous extraction and indexing jobs
- `db` provides PostgreSQL persistence on port `5432`
- `redis` backs queueing on port `6379`
- `typesense` stores search index and vector-adjacent metadata on port `8108`
- `db` provides PostgreSQL persistence on the internal compose network
- `redis` backs queueing on the internal compose network
- `typesense` stores search index and vector-adjacent metadata on the internal compose network
## Backend Architecture
@@ -64,3 +64,8 @@ Persistent data:
Transient runtime state:
- Redis queues processing tasks and worker execution state
- frontend local component state drives active filters, selection, and modal flows
Security-sensitive runtime behavior:
- Redis connection URLs are validated by backend queue helpers with environment-aware auth and TLS policy enforcement.
- Inline preview is limited to safe MIME types and script-capable content is served as attachment-only.
- Archive fan-out processing propagates root and depth lineage metadata and enforces depth and per-root descendant caps.

View File

@@ -52,9 +52,11 @@ Do not hardcode new palette or spacing values in component styles when a token a
## Authenticated Media Delivery
- Document previews and thumbnails must load through authenticated fetch flows in `frontend/src/lib/api.ts`, then render via temporary object URLs.
- Runtime auth should prefer per-user token resolution (`setApiTokenResolver` and `setRuntimeApiToken`) rather than static build-time token distribution, with `VITE_API_TOKEN` used only as fallback compatibility.
- Direct `window.open` calls for protected media endpoints are not allowed because browser navigation requests do not include the API token header.
- Download actions for original files and markdown exports must use authenticated blob fetches plus controlled browser download triggers.
- Revoke all temporary object URLs after replacement, unmount, or completion to prevent browser memory leaks.
- `DocumentViewer` iframe previews must be restricted to safe MIME types and rendered with `sandbox`, restrictive `allow`, and `referrerPolicy="no-referrer"` attributes. Active or script-capable formats must not be embedded inline.
## Extension Checklist

View File

@@ -3,12 +3,12 @@
## Runtime Services
`docker-compose.yml` defines the runtime stack:
- `db` (Postgres 16, localhost-bound port `5432`)
- `redis` (Redis 7, localhost-bound port `6379`)
- `typesense` (Typesense 29, localhost-bound port `8108`)
- `api` (FastAPI backend, localhost-bound port `8000`)
- `db` (Postgres 16, internal network only)
- `redis` (Redis 7, internal network only, password-protected)
- `typesense` (Typesense 29, internal network only)
- `api` (FastAPI backend, host-bound port `8000`)
- `worker` (RQ background worker)
- `frontend` (Vite UI, localhost-bound port `5173`)
- `frontend` (Vite UI, host-bound port `5173`)
## Named Volumes
@@ -44,14 +44,22 @@ Tail logs:
docker compose logs -f
```
Before running compose, provide explicit API tokens in your shell or project `.env` file:
Before running compose, provide required credentials in your shell or project `.env` file:
```bash
export POSTGRES_USER="dcm"
export POSTGRES_PASSWORD="<random-postgres-password>"
export POSTGRES_DB="dcm"
export DATABASE_URL="postgresql+psycopg://<user>:<password>@db:5432/<db>"
export REDIS_PASSWORD="<random-redis-password>"
export REDIS_URL="redis://:<password>@redis:6379/0"
export ADMIN_API_TOKEN="<random-admin-token>"
export USER_API_TOKEN="<random-user-token>"
export APP_SETTINGS_ENCRYPTION_KEY="<random-settings-encryption-key>"
export TYPESENSE_API_KEY="<random-typesense-key>"
```
Compose fails fast when required credential variables are missing.
## Backend Configuration
@@ -63,9 +71,13 @@ Key environment variables used by `api` and `worker` in compose:
- `APP_ENV`
- `DATABASE_URL`
- `REDIS_URL`
- `REDIS_SECURITY_MODE`
- `REDIS_TLS_MODE`
- `ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS`
- `STORAGE_ROOT`
- `ADMIN_API_TOKEN`
- `USER_API_TOKEN`
- `APP_SETTINGS_ENCRYPTION_KEY`
- `PUBLIC_BASE_URL`
- `CORS_ORIGINS` (API service)
- `PROVIDER_BASE_URL_ALLOWLIST`
@@ -84,6 +96,7 @@ Selected defaults from `Settings` (`backend/app/core/config.py`):
- `max_upload_request_size_bytes = 104857600`
- `max_zip_members = 250`
- `max_zip_depth = 2`
- `max_zip_descendants_per_root = 1000`
- `max_zip_member_uncompressed_bytes = 26214400`
- `max_zip_total_uncompressed_bytes = 157286400`
- `max_zip_compression_ratio = 120.0`
@@ -101,11 +114,15 @@ Selected defaults from `Settings` (`backend/app/core/config.py`):
Frontend runtime API target:
- `VITE_API_BASE` in `docker-compose.yml` frontend service
- `VITE_API_TOKEN` in `docker-compose.yml` frontend service (defaults to `USER_API_TOKEN` in compose, override to `ADMIN_API_TOKEN` when admin-only routes are needed)
- `VITE_API_TOKEN` in `docker-compose.yml` frontend service (optional compatibility fallback only)
Frontend API authentication behavior:
- `frontend/src/lib/api.ts` adds `Authorization: Bearer <VITE_API_TOKEN>` for all API requests only when `VITE_API_TOKEN` is non-empty
- requests are still sent without authorization when `VITE_API_TOKEN` is unset, which keeps unauthenticated endpoints such as `/api/v1/health` backward-compatible
- `frontend/src/lib/api.ts` resolves bearer tokens at request time in this order:
- custom runtime resolver (`setApiTokenResolver`)
- runtime global token (`window.__DCM_API_TOKEN__`)
- session token (`setRuntimeApiToken`)
- legacy `VITE_API_TOKEN` fallback
- requests are sent without authorization only when no runtime or fallback token source is available
Frontend container runtime behavior:
- the container runs as non-root `node`
@@ -136,6 +153,8 @@ Settings include:
Read sanitization is resilient to corrupt persisted provider rows. If a persisted provider entry fails URL validation, the entry is skipped and defaults are used when no valid provider remains. This prevents unrelated read endpoints from failing due to stale invalid provider data.
Provider API keys are persisted as encrypted payloads (`api_key_encrypted`) and plaintext `api_key` values are no longer written to disk.
Retention settings are used by worker cleanup and by `POST /api/v1/processing/logs/trim` when trim query values are not provided.
## Security Controls
@@ -143,18 +162,21 @@ Retention settings are used by worker cleanup and by `POST /api/v1/processing/lo
- Privileged APIs are token-gated with bearer auth:
- `documents` endpoints: user token or admin token
- `settings` and `processing/logs` endpoints: admin token only
- Authentication fails closed when `ADMIN_API_TOKEN` is not configured.
- Development environments can allow tokenless user-role access for document/search routes via `ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS=true`; production remains token-enforced.
- Authentication fails closed when `ADMIN_API_TOKEN` is not configured and admin access is requested.
- Document preview endpoint blocks inline rendering for script-capable MIME types and forces attachment responses for active content.
- Provider base URLs are validated on settings updates and before outbound model calls:
- allowlist enforcement (`PROVIDER_BASE_URL_ALLOWLIST`)
- scheme restrictions (`https` by default)
- local/private-network blocking and per-request DNS revalidation checks for outbound runtime calls
- local/private-network blocking and per-request DNS revalidation checks for outbound runtime calls, including OCR provider path
- Upload and archive safety guards are enforced:
- `POST /api/v1/documents/upload` requires `Content-Length` and enforces file-count, per-file size, and total request size limits
- `OPTIONS /api/v1/documents/upload` CORS preflight is excluded from `Content-Length` enforcement
- ZIP member count, per-member uncompressed size, total decompressed size, and compression-ratio guards
- ZIP member count, per-member uncompressed size, total decompressed size, compression-ratio guards, max depth, and per-root descendant fan-out cap
- Redis queue security checks enforce URL scheme/auth/TLS policy at runtime with production fail-closed defaults.
- Processing logs redact sensitive payload and text fields, and trim endpoints enforce retention caps from runtime config.
- Compose hardening defaults:
- host ports bind to `127.0.0.1` unless `HOST_BIND_IP` override is set
- only `api` and `frontend` publish host ports; `db`, `redis`, and `typesense` stay internal-only
- `api`, `worker`, and `frontend` drop all Linux capabilities and set `no-new-privileges`
- backend and frontend containers run as non-root users by default

View File

@@ -2,23 +2,25 @@ services:
db:
image: postgres:16-alpine
environment:
POSTGRES_USER: dcm
POSTGRES_PASSWORD: dcm
POSTGRES_DB: dcm
ports:
- "${HOST_BIND_IP:-127.0.0.1}:5432:5432"
POSTGRES_USER: ${POSTGRES_USER:?POSTGRES_USER must be set}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}
POSTGRES_DB: ${POSTGRES_DB:?POSTGRES_DB must be set}
volumes:
- db-data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U dcm -d dcm"]
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:?POSTGRES_USER must be set} -d ${POSTGRES_DB:?POSTGRES_DB must be set}"]
interval: 10s
timeout: 5s
retries: 10
redis:
image: redis:7-alpine
ports:
- "${HOST_BIND_IP:-127.0.0.1}:6379:6379"
command:
- "redis-server"
- "--appendonly"
- "yes"
- "--requirepass"
- "${REDIS_PASSWORD:?REDIS_PASSWORD must be set}"
volumes:
- redis-data:/data
@@ -26,10 +28,8 @@ services:
image: typesense/typesense:29.0
command:
- "--data-dir=/data"
- "--api-key=dcm-typesense-key"
- "--api-key=${TYPESENSE_API_KEY:?TYPESENSE_API_KEY must be set}"
- "--enable-cors"
ports:
- "${HOST_BIND_IP:-127.0.0.1}:8108:8108"
volumes:
- typesense-data:/data
@@ -37,12 +37,16 @@ services:
build:
context: ./backend
environment:
APP_ENV: development
DATABASE_URL: postgresql+psycopg://dcm:dcm@db:5432/dcm
REDIS_URL: redis://redis:6379/0
APP_ENV: ${APP_ENV:-development}
DATABASE_URL: ${DATABASE_URL:?DATABASE_URL must be set}
REDIS_URL: ${REDIS_URL:?REDIS_URL must be set}
REDIS_SECURITY_MODE: ${REDIS_SECURITY_MODE:-auto}
REDIS_TLS_MODE: ${REDIS_TLS_MODE:-auto}
ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS: ${ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS:-true}
STORAGE_ROOT: /data/storage
ADMIN_API_TOKEN: ${ADMIN_API_TOKEN:?ADMIN_API_TOKEN must be set}
USER_API_TOKEN: ${USER_API_TOKEN:?USER_API_TOKEN must be set}
APP_SETTINGS_ENCRYPTION_KEY: ${APP_SETTINGS_ENCRYPTION_KEY:?APP_SETTINGS_ENCRYPTION_KEY must be set}
PROVIDER_BASE_URL_ALLOWLIST: '${PROVIDER_BASE_URL_ALLOWLIST:-["api.openai.com"]}'
PROVIDER_BASE_URL_ALLOW_HTTP: ${PROVIDER_BASE_URL_ALLOW_HTTP:-false}
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK: ${PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK:-false}
@@ -52,7 +56,7 @@ services:
TYPESENSE_PROTOCOL: http
TYPESENSE_HOST: typesense
TYPESENSE_PORT: 8108
TYPESENSE_API_KEY: dcm-typesense-key
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:?TYPESENSE_API_KEY must be set}
TYPESENSE_COLLECTION_NAME: documents
ports:
- "${HOST_BIND_IP:-127.0.0.1}:8000:8000"
@@ -74,14 +78,18 @@ services:
worker:
build:
context: ./backend
command: ["rq", "worker", "dcm", "--url", "redis://redis:6379/0"]
command: ["sh", "-c", "rq worker dcm --url \"$REDIS_URL\""]
environment:
APP_ENV: development
DATABASE_URL: postgresql+psycopg://dcm:dcm@db:5432/dcm
REDIS_URL: redis://redis:6379/0
APP_ENV: ${APP_ENV:-development}
DATABASE_URL: ${DATABASE_URL:?DATABASE_URL must be set}
REDIS_URL: ${REDIS_URL:?REDIS_URL must be set}
REDIS_SECURITY_MODE: ${REDIS_SECURITY_MODE:-auto}
REDIS_TLS_MODE: ${REDIS_TLS_MODE:-auto}
ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS: ${ALLOW_DEVELOPMENT_ANONYMOUS_USER_ACCESS:-true}
STORAGE_ROOT: /data/storage
ADMIN_API_TOKEN: ${ADMIN_API_TOKEN:?ADMIN_API_TOKEN must be set}
USER_API_TOKEN: ${USER_API_TOKEN:?USER_API_TOKEN must be set}
APP_SETTINGS_ENCRYPTION_KEY: ${APP_SETTINGS_ENCRYPTION_KEY:?APP_SETTINGS_ENCRYPTION_KEY must be set}
PROVIDER_BASE_URL_ALLOWLIST: '${PROVIDER_BASE_URL_ALLOWLIST:-["api.openai.com"]}'
PROVIDER_BASE_URL_ALLOW_HTTP: ${PROVIDER_BASE_URL_ALLOW_HTTP:-false}
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK: ${PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK:-false}
@@ -90,7 +98,7 @@ services:
TYPESENSE_PROTOCOL: http
TYPESENSE_HOST: typesense
TYPESENSE_PORT: 8108
TYPESENSE_API_KEY: dcm-typesense-key
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:?TYPESENSE_API_KEY must be set}
TYPESENSE_COLLECTION_NAME: documents
volumes:
- ./backend/app:/app/app
@@ -112,7 +120,7 @@ services:
context: ./frontend
environment:
VITE_API_BASE: ${VITE_API_BASE:-http://localhost:8000/api/v1}
VITE_API_TOKEN: ${VITE_API_TOKEN:-${USER_API_TOKEN:-}}
VITE_API_TOKEN: ${VITE_API_TOKEN:-}
ports:
- "${HOST_BIND_IP:-127.0.0.1}:5173:5173"
volumes:

View File

@@ -19,6 +19,47 @@ import type { DmsDocument, DmsDocumentDetail } from '../types';
import PathInput from './PathInput';
import TagInput from './TagInput';
/** MIME types allowed to render directly as an inline <img> preview. */
const SAFE_IMAGE_PREVIEW_MIME_TYPES = new Set<string>([
  'image/bmp',
  'image/gif',
  'image/jpeg',
  'image/jpg',
  'image/png',
  'image/webp',
]);

/** MIME types allowed to render inside the sandboxed <iframe> preview. */
const SAFE_IFRAME_PREVIEW_MIME_TYPES = new Set<string>([
  'application/json',
  'application/pdf',
  'text/csv',
  'text/markdown',
  'text/plain',
]);

/**
 * Normalizes a raw MIME value for stable comparison: drops any parameters
 * (everything from the first ';'), trims surrounding whitespace, and
 * lowercases. Nullish or empty input normalizes to the empty string.
 */
function normalizeMimeType(mimeType: string | null | undefined): string {
  if (mimeType == null || mimeType === '') {
    return '';
  }
  const separatorIndex = mimeType.indexOf(';');
  const baseType = separatorIndex === -1 ? mimeType : mimeType.slice(0, separatorIndex);
  return baseType.trim().toLowerCase();
}

/**
 * Reports whether a normalized MIME type may be shown as an inline image.
 * Note: scriptable formats such as image/svg+xml are not on the allowlist.
 */
function isSafeImagePreviewMimeType(mimeType: string): boolean {
  return SAFE_IMAGE_PREVIEW_MIME_TYPES.has(mimeType);
}

/**
 * Reports whether a normalized MIME type may be shown in a sandboxed iframe.
 */
function isSafeIframePreviewMimeType(mimeType: string): boolean {
  return SAFE_IFRAME_PREVIEW_MIME_TYPES.has(mimeType);
}
/**
* Defines props for the selected document viewer panel.
*/
@@ -60,6 +101,30 @@ export default function DocumentViewer({
const [error, setError] = useState<string | null>(null);
const previewObjectUrlRef = useRef<string | null>(null);
/**
* Resolves normalized MIME type used by preview safety checks.
*/
const previewMimeType = useMemo(() => normalizeMimeType(document?.mime_type), [document?.mime_type]);
/**
* Resolves whether selected document should render as a safe image element in preview.
*/
const isImageDocument = useMemo(() => {
return isSafeImagePreviewMimeType(previewMimeType);
}, [previewMimeType]);
/**
* Resolves whether selected document should render in sandboxed iframe preview.
*/
const canRenderIframePreview = useMemo(() => {
return isSafeIframePreviewMimeType(previewMimeType);
}, [previewMimeType]);
/**
* Resolves whether selected document supports any inline preview mode.
*/
const canRenderInlinePreview = isImageDocument || canRenderIframePreview;
/**
* Syncs editable metadata fields whenever selection changes.
*/
@@ -100,6 +165,12 @@ export default function DocumentViewer({
setIsLoadingPreview(false);
return;
}
if (!canRenderInlinePreview) {
revokePreviewObjectUrl();
setPreviewObjectUrl(null);
setIsLoadingPreview(false);
return;
}
let cancelled = false;
setIsLoadingPreview(true);
@@ -131,7 +202,7 @@ export default function DocumentViewer({
cancelled = true;
revokePreviewObjectUrl();
};
}, [document?.id]);
}, [document?.id, canRenderInlinePreview]);
/**
* Refreshes editable metadata from list updates only while form is clean.
@@ -183,16 +254,6 @@ export default function DocumentViewer({
};
}, [document?.id]);
/**
* Resolves whether selected document should render as an image element in preview.
*/
const isImageDocument = useMemo(() => {
if (!document) {
return false;
}
return document.mime_type.startsWith('image/');
}, [document]);
/**
* Extracts provider/transcription errors from document metadata for user visibility.
*/
@@ -482,11 +543,22 @@ export default function DocumentViewer({
{previewObjectUrl ? (
isImageDocument ? (
<img src={previewObjectUrl} alt={document.original_filename} />
) : canRenderIframePreview ? (
<iframe
src={previewObjectUrl}
title={document.original_filename}
sandbox=""
referrerPolicy="no-referrer"
allow="clipboard-read 'none'; clipboard-write 'none'; geolocation 'none'; microphone 'none'; camera 'none'; payment 'none'; usb 'none'; fullscreen 'none'"
loading="lazy"
/>
) : (
<iframe src={previewObjectUrl} title={document.original_filename} />
<p className="small">Preview blocked for this file type. Download to inspect safely.</p>
)
) : isLoadingPreview ? (
<p className="small">Loading preview...</p>
) : !canRenderInlinePreview ? (
<p className="small">Preview blocked for this file type. Download to inspect safely.</p>
) : (
<p className="small">Preview unavailable for this document.</p>
)}

View File

@@ -1,5 +1,5 @@
// @ts-expect-error Node strip-types runtime requires explicit .ts extension in ESM imports.
import { downloadDocumentContentMarkdown, downloadDocumentFile, getDocumentPreviewBlob, getDocumentThumbnailBlob } from './api.ts';
import { API_TOKEN_RUNTIME_GLOBAL_KEY, downloadDocumentContentMarkdown, downloadDocumentFile, getDocumentPreviewBlob, getDocumentThumbnailBlob, setApiTokenResolver, setRuntimeApiToken, updateDocumentMetadata } from './api.ts';
/**
* Throws when a test condition is false.
@@ -24,16 +24,70 @@ async function assertRejects(action: () => Promise<unknown>, expectedMessage: st
throw new Error(`Expected rejection containing "${expectedMessage}"`);
}
/**
 * Converts any fetch input (string URL, URL object, or Request-like value)
 * into its URL string so tests can assert on the requested address.
 */
function toRequestUrl(input: RequestInfo | URL): string {
  if (input instanceof URL) {
    return input.toString();
  }
  return typeof input === 'string' ? input : input.url;
}
/**
 * Builds an in-memory Storage stand-in for Node-based tests, mirroring the
 * browser sessionStorage contract (length, key ordering, string coercion).
 */
function createMemorySessionStorage(): Storage {
  const backing = new Map<string, string>();
  const storage: Storage = {
    // `length` mirrors the live entry count, like real Storage.
    get length(): number {
      return backing.size;
    },
    clear(): void {
      backing.clear();
    },
    getItem(key: string): string | null {
      const value = backing.get(key);
      return value === undefined ? null : value;
    },
    key(index: number): string | null {
      const keys = [...backing.keys()];
      return keys[index] ?? null;
    },
    removeItem(key: string): void {
      backing.delete(key);
    },
    setItem(key: string, value: string): void {
      // Real Storage stringifies values; keep that behavior for parity.
      backing.set(key, String(value));
    },
  };
  return storage;
}
/**
* Runs API helper tests for authenticated media and download flows.
*/
async function runApiTests(): Promise<void> {
const originalFetch = globalThis.fetch;
const runtimeGlobalSource = globalThis as typeof globalThis & Record<string, unknown>;
const originalRuntimeGlobalToken = runtimeGlobalSource[API_TOKEN_RUNTIME_GLOBAL_KEY];
const sessionStorageDescriptor = Object.getOwnPropertyDescriptor(globalThis, 'sessionStorage');
try {
Object.defineProperty(globalThis, 'sessionStorage', {
configurable: true,
writable: true,
value: createMemorySessionStorage(),
});
setApiTokenResolver(null);
setRuntimeApiToken(null);
delete runtimeGlobalSource[API_TOKEN_RUNTIME_GLOBAL_KEY];
const requestUrls: string[] = [];
globalThis.fetch = (async (input: RequestInfo | URL): Promise<Response> => {
requestUrls.push(typeof input === 'string' ? input : input.toString());
const requestAuthHeaders: Array<string | null> = [];
globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
requestUrls.push(toRequestUrl(input));
requestAuthHeaders.push(new Headers(init?.headers).get('Authorization'));
return new Response('preview-bytes', { status: 200 });
}) as typeof fetch;
@@ -50,6 +104,50 @@ async function runApiTests(): Promise<void> {
requestUrls[1] === 'http://localhost:8000/api/v1/documents/doc-1/preview',
`Unexpected preview URL ${requestUrls[1]}`,
);
assert(requestAuthHeaders[0] === null, `Expected no auth header for thumbnail request, got "${requestAuthHeaders[0]}"`);
assert(requestAuthHeaders[1] === null, `Expected no auth header for preview request, got "${requestAuthHeaders[1]}"`);
setRuntimeApiToken('session-user-token');
globalThis.fetch = (async (_input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
const authHeader = new Headers(init?.headers).get('Authorization');
assert(authHeader === 'Bearer session-user-token', `Expected session token auth header, got "${authHeader}"`);
return new Response('preview-bytes', { status: 200 });
}) as typeof fetch;
await getDocumentPreviewBlob('doc-session-auth');
setRuntimeApiToken('session-user-token');
runtimeGlobalSource[API_TOKEN_RUNTIME_GLOBAL_KEY] = 'runtime-global-token';
globalThis.fetch = (async (_input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
const authHeader = new Headers(init?.headers).get('Authorization');
assert(authHeader === 'Bearer runtime-global-token', `Expected global runtime token auth header, got "${authHeader}"`);
return new Response('preview-bytes', { status: 200 });
}) as typeof fetch;
await getDocumentPreviewBlob('doc-global-auth');
setApiTokenResolver(() => 'resolver-token');
let mergedContentType: string | null = null;
let mergedAuthorization: string | null = null;
globalThis.fetch = (async (_input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
const headers = new Headers(init?.headers);
mergedContentType = headers.get('Content-Type');
mergedAuthorization = headers.get('Authorization');
return new Response('{}', { status: 200 });
}) as typeof fetch;
await updateDocumentMetadata('doc-headers', { original_filename: 'renamed.pdf' });
assert(mergedContentType === 'application/json', `Expected JSON content type to be preserved, got "${mergedContentType}"`);
assert(mergedAuthorization === 'Bearer resolver-token', `Expected resolver token auth header, got "${mergedAuthorization}"`);
setApiTokenResolver(() => ' ');
globalThis.fetch = (async (_input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
const authHeader = new Headers(init?.headers).get('Authorization');
assert(authHeader === 'Bearer runtime-global-token', `Expected fallback runtime global token auth header, got "${authHeader}"`);
return new Response('preview-bytes', { status: 200 });
}) as typeof fetch;
await getDocumentPreviewBlob('doc-resolver-fallback');
setApiTokenResolver(null);
setRuntimeApiToken(null);
delete runtimeGlobalSource[API_TOKEN_RUNTIME_GLOBAL_KEY];
globalThis.fetch = (async (): Promise<Response> => {
return new Response('file-bytes', {
@@ -78,6 +176,18 @@ async function runApiTests(): Promise<void> {
await assertRejects(async () => downloadDocumentContentMarkdown('doc-4'), 'Failed to download document markdown');
} finally {
setApiTokenResolver(null);
setRuntimeApiToken(null);
if (originalRuntimeGlobalToken === undefined) {
delete runtimeGlobalSource[API_TOKEN_RUNTIME_GLOBAL_KEY];
} else {
runtimeGlobalSource[API_TOKEN_RUNTIME_GLOBAL_KEY] = originalRuntimeGlobalToken;
}
if (sessionStorageDescriptor) {
Object.defineProperty(globalThis, 'sessionStorage', sessionStorageDescriptor);
} else {
delete (globalThis as { sessionStorage?: Storage }).sessionStorage;
}
globalThis.fetch = originalFetch;
}
}

View File

@@ -19,23 +19,123 @@ import type {
const API_BASE = import.meta.env?.VITE_API_BASE ?? 'http://localhost:8000/api/v1';
/**
* Optional bearer token used for authenticated backend routes.
* Legacy environment token fallback used only when no runtime token source is available.
*/
const API_TOKEN = import.meta.env?.VITE_API_TOKEN?.trim();
const LEGACY_API_TOKEN = normalizeBearerToken(import.meta.env?.VITE_API_TOKEN);
/**
* Global property name used for runtime token injection.
*/
export const API_TOKEN_RUNTIME_GLOBAL_KEY = '__DCM_API_TOKEN__';
/**
* Session storage key used for per-user runtime token persistence.
*/
export const API_TOKEN_RUNTIME_STORAGE_KEY = 'dcm.api_token';
/**
* Resolves a bearer token dynamically at request time.
*/
export type ApiTokenResolver = () => string | null | undefined;
let runtimeTokenResolver: ApiTokenResolver | null = null;
type ApiRequestInit = Omit<RequestInit, 'headers'> & { headers?: HeadersInit };
/**
* Merges request headers and appends bearer authorization when configured.
* Normalizes candidate token values by trimming whitespace and filtering non-string values.
*/
/**
 * Normalizes a candidate bearer token: non-strings are rejected, whitespace
 * is trimmed, and blank results become `undefined` so callers can cascade.
 */
function normalizeBearerToken(candidate: unknown): string | undefined {
  if (typeof candidate === 'string') {
    const trimmed = candidate.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }
  return undefined;
}
/**
 * Reads a token injected at runtime onto globalThis (e.g. by a host page),
 * returning it only when it normalizes to a non-empty string.
 */
function resolveGlobalRuntimeToken(): string | undefined {
  const globalBag = globalThis as typeof globalThis & Record<string, unknown>;
  const injected = globalBag[API_TOKEN_RUNTIME_GLOBAL_KEY];
  return normalizeBearerToken(injected);
}
/**
 * Reads the per-session token from sessionStorage when the environment
 * exposes storage and access is permitted; any storage error yields no token.
 */
function resolveSessionStorageToken(): string | undefined {
  if (typeof globalThis.sessionStorage === 'undefined') {
    return undefined;
  }
  let stored: string | null = null;
  try {
    stored = globalThis.sessionStorage.getItem(API_TOKEN_RUNTIME_STORAGE_KEY);
  } catch {
    // Storage access can throw (e.g. privacy mode); treat as "no token".
    return undefined;
  }
  return normalizeBearerToken(stored);
}
/**
* Resolves bearer token using runtime sources first, then legacy environment fallback for compatibility.
*/
function resolveApiToken(): string | undefined {
const resolverToken = normalizeBearerToken(runtimeTokenResolver?.());
if (resolverToken) {
return resolverToken;
}
const globalRuntimeToken = resolveGlobalRuntimeToken();
if (globalRuntimeToken) {
return globalRuntimeToken;
}
const sessionStorageToken = resolveSessionStorageToken();
if (sessionStorageToken) {
return sessionStorageToken;
}
return LEGACY_API_TOKEN;
}
/**
 * Registers or clears a request-time bearer token resolver used by API helpers.
 *
 * The resolver takes highest priority: it is consulted before the globalThis
 * injection point, sessionStorage, and the legacy environment fallback.
 *
 * @param resolver Function returning a token for each request, or `null` to remove custom resolution.
 */
export function setApiTokenResolver(resolver: ApiTokenResolver | null): void {
  // Module-level state read by resolveApiToken() on every request.
  runtimeTokenResolver = resolver;
}
/**
 * Persists or clears the per-user API token in session storage.
 *
 * A blank or nullish token removes any stored value; storage failures are
 * silently ignored so the app keeps working when persistence is unavailable.
 *
 * @param token Token value to persist for this browser session; clears persisted token when empty.
 */
export function setRuntimeApiToken(token: string | null | undefined): void {
  if (typeof globalThis.sessionStorage === 'undefined') {
    return;
  }
  // normalizeBearerToken never throws, so it can safely run outside the try.
  const normalized = normalizeBearerToken(token);
  try {
    if (normalized === undefined) {
      globalThis.sessionStorage.removeItem(API_TOKEN_RUNTIME_STORAGE_KEY);
    } else {
      globalThis.sessionStorage.setItem(API_TOKEN_RUNTIME_STORAGE_KEY, normalized);
    }
  } catch {
    // Best effort: storage may deny access (private mode, sandbox); skip persistence.
  }
}
/**
* Merges request headers and appends bearer authorization when a token can be resolved.
*/
function buildRequestHeaders(headers?: HeadersInit): Headers | undefined {
if (!API_TOKEN && !headers) {
const apiToken = resolveApiToken();
if (!apiToken && !headers) {
return undefined;
}
const requestHeaders = new Headers(headers);
if (API_TOKEN) {
requestHeaders.set('Authorization', `Bearer ${API_TOKEN}`);
if (apiToken) {
requestHeaders.set('Authorization', `Bearer ${apiToken}`);
}
return requestHeaders;
}