Compare commits

...

49 Commits

Author SHA1 Message Date
60ce69e115 Try a unified api endpoint 2026-03-17 17:27:22 -03:00
d6d0735ff8 Fix cookie not accepted in safari 2026-03-17 16:57:51 -03:00
72088dba9a Fix folder permissions 2026-03-17 16:37:59 -03:00
6f1fffd6e8 Update Typesense 2026-03-17 16:23:14 -03:00
490cbbb812 Normalize compose host bind mount paths 2026-03-02 22:11:33 -03:00
4fe22e3539 Document bind-mount permissions and ignore runtime data tree 2026-03-02 18:58:19 -03:00
3f7cdee995 Update cookie 2026-03-02 18:23:48 -03:00
1a04b23e89 Fix CSRF validation for duplicate cookie values on PATCH 2026-03-02 18:09:27 -03:00
2a5dfc3713 flush 2026-03-02 17:57:59 -03:00
1cd7d6541d update dockerfile 2026-03-02 17:53:26 -03:00
ec6a20ebd1 Stabilize auth cookies for proxied split-domain deployments 2026-03-02 17:50:16 -03:00
83d6a4f367 Remove frontend npm tuning and keep standard install path 2026-03-02 17:31:34 -03:00
8cf3748015 Revert "Harden frontend npm install against transient registry timeouts"
This reverts commit daa11cb768.
2026-03-02 16:58:01 -03:00
daa11cb768 Harden frontend npm install against transient registry timeouts 2026-03-02 16:57:25 -03:00
8f2c357bfc Run production frontend Nginx unprivileged under dropped caps 2026-03-02 16:41:20 -03:00
d50169b883 Serve production frontend via Nginx static build 2026-03-02 15:50:34 -03:00
b5b74845f2 Switch frontend container to production-aware runtime mode 2026-03-02 15:41:39 -03:00
0acce2e260 Wire Vite allowed hosts to env for Docker frontend 2026-03-02 15:37:39 -03:00
b86223f943 update docker compose 2026-03-02 15:24:21 -03:00
8dc4013e76 update docker compose 2026-03-02 15:18:12 -03:00
668c22f692 update docker compose 2026-03-02 15:16:14 -03:00
89ec3584f9 update docker-compose.yml 2026-03-02 15:03:45 -03:00
8dded6383e Use node 22 slim for frontend npm network compatibility 2026-03-02 15:00:02 -03:00
c47fc48533 Harden frontend Docker npm fetch resilience 2026-03-02 14:38:26 -03:00
b6d470590e Update docker compose 2026-03-02 14:26:52 -03:00
41bbe87b4c Update changelog 2026-03-02 13:41:16 -03:00
6fba581865 Rewrite README for end-user Docker setup and env guidance 2026-03-02 13:40:29 -03:00
4b34d6153c Remove report 2026-03-01 21:55:53 -03:00
700f0d6d79 Use version-safe FastAPI CSRF dependency params 2026-03-01 21:44:59 -03:00
3cccf2e0e8 Fix auth route response injection crash 2026-03-01 21:43:09 -03:00
26eae1a09b Fix auth session persistence with HttpOnly cookies and CSRF 2026-03-01 21:39:22 -03:00
a9333ec973 Harden frontend auth token handling in runtime memory 2026-03-01 21:29:11 -03:00
8eaaa01186 update report 2026-03-01 21:25:37 -03:00
eae7afd36e docs: refresh production security assessment report 2026-03-01 21:22:25 -03:00
874597e40b Fix predefined catalog visibility and port security must-know guidance 2026-03-01 21:15:12 -03:00
32b4589b28 docs: update security production readiness report 2026-03-01 21:07:49 -03:00
4c27fd6483 Harden auth login against brute-force and refresh security docs 2026-03-01 18:24:26 -03:00
9cbbd80f47 update report 2026-03-01 18:15:14 -03:00
aba320b617 docs: refresh security production readiness report 2026-03-01 18:03:45 -03:00
74d91eb4b1 Update header styles 2026-03-01 17:55:51 -03:00
1c57084ebf Hardcode CORS credentials disabled and remove env toggle 2026-03-01 17:16:13 -03:00
bfc89fe5ce Revert "Allow private-network CORS origins in development"
This reverts commit 1b2e0cb8af.
2026-03-01 17:12:06 -03:00
1b2e0cb8af Allow private-network CORS origins in development 2026-03-01 17:08:50 -03:00
0242e061c2 Harden auth and security controls with session auth and docs 2026-03-01 15:29:09 -03:00
7a19f22f41 Replace REPORT.md with production security readiness assessment 2026-03-01 14:56:26 -03:00
c5423fc9c3 Stabilize API routing, CORS, and settings save behavior 2026-03-01 14:27:19 -03:00
3d280396ae Fix LAN API access and dev proxy routing 2026-03-01 14:08:48 -03:00
48cfc79b5f Fix LAN API base and development CORS regression 2026-03-01 13:56:25 -03:00
bdd97d1c62 Harden security controls from REPORT findings 2026-03-01 13:32:08 -03:00
51 changed files with 4857 additions and 692 deletions

72
.env.example Normal file
View File

@@ -0,0 +1,72 @@
# LedgerDock environment template
# Copy to .env and adjust all secret values before first run.
# Development defaults (HTTP local stack)
APP_ENV=development
HOST_BIND_IP=127.0.0.1
# Optional host directory for persistent bind mounts in docker-compose.yml.
# Defaults to ./data when unset.
# DCM_DATA_DIR=./data
POSTGRES_USER=dcm
POSTGRES_PASSWORD=ChangeMe-Postgres-Secret
POSTGRES_DB=dcm
DATABASE_URL=postgresql+psycopg://dcm:ChangeMe-Postgres-Secret@db:5432/dcm
REDIS_PASSWORD=ChangeMe-Redis-Secret
REDIS_URL=redis://:ChangeMe-Redis-Secret@redis:6379/0
REDIS_SECURITY_MODE=compat
REDIS_TLS_MODE=allow_insecure
AUTH_BOOTSTRAP_ADMIN_USERNAME=admin
AUTH_BOOTSTRAP_ADMIN_PASSWORD=ChangeMe-Admin-Password
AUTH_BOOTSTRAP_USER_USERNAME=user
AUTH_BOOTSTRAP_USER_PASSWORD=ChangeMe-User-Password
AUTH_LOGIN_FAILURE_LIMIT=5
AUTH_LOGIN_FAILURE_WINDOW_SECONDS=900
AUTH_LOGIN_LOCKOUT_BASE_SECONDS=30
AUTH_LOGIN_LOCKOUT_MAX_SECONDS=900
# Optional cookie controls for split frontend/api hosts:
# Leave AUTH_COOKIE_DOMAIN empty unless you explicitly need a parent-domain CSRF cookie mirror.
# Host-only auth cookies are issued automatically for the API host.
# AUTH_COOKIE_DOMAIN=docs.lan
# AUTH_COOKIE_SAMESITE=auto
APP_SETTINGS_ENCRYPTION_KEY=ChangeMe-Settings-Encryption-Key
TYPESENSE_API_KEY=ChangeMe-Typesense-Key
PROCESSING_LOG_STORE_MODEL_IO_TEXT=false
PROCESSING_LOG_STORE_PAYLOAD_TEXT=false
CONTENT_EXPORT_MAX_DOCUMENTS=250
CONTENT_EXPORT_MAX_TOTAL_BYTES=52428800
CONTENT_EXPORT_RATE_LIMIT_PER_MINUTE=6
PROVIDER_BASE_URL_ALLOW_HTTP=true
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK=true
PROVIDER_BASE_URL_ALLOWLIST=[]
PUBLIC_BASE_URL=http://localhost:8000
CORS_ORIGINS=["http://localhost:5173","http://localhost:3000"]
# Leave empty to use same-origin /api/v1 through the frontend proxy.
# Set an absolute URL only when you intentionally want split-origin frontend/API traffic.
VITE_API_BASE=
# Development-only Vite proxy target. Docker compose sets this to http://api:8000 automatically.
VITE_API_PROXY_TARGET=http://localhost:8000
# Development-only Vite host allowlist override.
VITE_ALLOWED_HOSTS=
# Production baseline overrides (set explicitly for live deployments):
# APP_ENV=production
# HOST_BIND_IP=127.0.0.1
# REDIS_URL=rediss://:<strong-password>@redis.example.internal:6379/0
# REDIS_SECURITY_MODE=strict
# REDIS_TLS_MODE=required
# AUTH_COOKIE_DOMAIN=example.com
# AUTH_COOKIE_SAMESITE=none
# PROVIDER_BASE_URL_ALLOW_HTTP=false
# PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK=false
# PROVIDER_BASE_URL_ALLOWLIST=["api.openai.com"]
# PUBLIC_BASE_URL=https://api.example.com
# CORS_ORIGINS=["https://app.example.com"]
# VITE_API_BASE=https://api.example.com/api/v1
# VITE_ALLOWED_HOSTS=app.example.com

5
.gitignore vendored
View File

@@ -20,9 +20,8 @@ build/
!.env.example !.env.example
# Data and generated artifacts (runtime only) # Data and generated artifacts (runtime only)
data/postgres/ data/
data/redis/ typesense-data/
data/storage/
# OS / IDE # OS / IDE
.DS_Store .DS_Store

View File

@@ -3,7 +3,7 @@
## Stack Snapshot ## Stack Snapshot
- DMS monorepo with FastAPI API + RQ worker (`backend/`) and React + Vite + TypeScript frontend (`frontend/`). - DMS monorepo with FastAPI API + RQ worker (`backend/`) and React + Vite + TypeScript frontend (`frontend/`).
- Services in `docker-compose.yml`: `api`, `worker`, `frontend`, `db` (Postgres), `redis`, `typesense`. - Services in `docker-compose.yml`: `api`, `worker`, `frontend`, `db` (Postgres), `redis`, `typesense`.
- Runtime persistence uses Docker named volumes (`db-data`, `redis-data`, `dcm-storage`, `typesense-data`). - Runtime persistence uses host bind mounts under `${DCM_DATA_DIR:-./data}` (`db-data`, `redis-data`, `storage`, `typesense-data`).
## Project Layout ## Project Layout
- Backend app code: `backend/app/` (`api/`, `services/`, `db/`, `models/`, `schemas/`, `worker/`). - Backend app code: `backend/app/` (`api/`, `services/`, `db/`, `models/`, `schemas/`, `worker/`).

View File

@@ -3,18 +3,5 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added ### Added
- Initialized `CHANGELOG.md` with Keep a Changelog structure for ongoing release-note tracking. - Initial release
### Changed
- Refreshed `README.md` with current stack details, runtime services, setup commands, configuration notes, and manual validation guidance.
### Deprecated
### Removed
### Fixed
### Security

190
README.md
View File

@@ -1,60 +1,91 @@
# LedgerDock # LedgerDock
LedgerDock is a self-hosted document management system (DMS) for ingesting, processing, organizing, and searching files. LedgerDock is a private document workspace you can run on your own computer or server.
It helps teams collect files, process text from documents, and find information quickly with search.
## Core Capabilities ## What LedgerDock Is For
- Drag and drop upload from anywhere in the UI - Upload files and folders from one place
- File and folder upload with path preservation - Keep documents organized and searchable
- Asynchronous extraction and OCR for PDF, images, DOCX, XLSX, TXT, and ZIP - Extract text from scans and images (OCR)
- Metadata and full-text search - Download originals or extracted text
- Routing suggestions based on previous decisions
- Original file download and extracted markdown export
## Technology Stack ## Before You Start
- Backend: FastAPI, SQLAlchemy, RQ worker (`backend/`) You need:
- Frontend: React, Vite, TypeScript (`frontend/`)
- Infrastructure: PostgreSQL, Redis, Typesense (`docker-compose.yml`)
## Runtime Services - Docker Desktop (Windows or macOS) or Docker Engine + Docker Compose (Linux)
- A terminal app
- The project folder on your machine
- Internet access the first time you build containers
The default `docker compose` stack includes: ## Install With Docker Compose
- `frontend` - React UI (`http://localhost:5173`) Follow these steps from the project folder (where `docker-compose.yml` is located).
- `api` - FastAPI backend (`http://localhost:8000`, docs at `/docs`)
- `worker` - background processing jobs
- `db` - PostgreSQL (`localhost:5432`)
- `redis` - queue backend (`localhost:6379`)
- `typesense` - search index (`localhost:8108`)
## Requirements 1. Create your local settings file from the template.
- Docker Engine ```bash
- Docker Compose plugin cp .env.example .env
- Internet access for first-time image build ```
## Quick Start 2. Open `.env` in a text editor and set your own passwords and keys.
3. Start LedgerDock.
From repository root:
```bash ```bash
docker compose up --build -d docker compose up --build -d
``` ```
Open: 4. Wait until startup is complete, then open the app:
- LedgerDock web app: `http://localhost:5173`
- Health check: `http://localhost:8000/api/v1/health`
5. Sign in with the admin username and password you set in `.env`.
- Frontend: `http://localhost:5173` ## `.env` Settings Explained In Plain Language
- API docs: `http://localhost:8000/docs`
- Health: `http://localhost:8000/api/v1/health`
Stop the stack: LedgerDock reads settings from `.env`. Some values are required and some are optional.
```bash ### Required: Change These Before First Use
docker compose down
```
## Common Operations - `POSTGRES_PASSWORD`: Password for the internal database.
- `REDIS_PASSWORD`: Password for the internal queue service.
- `AUTH_BOOTSTRAP_ADMIN_PASSWORD`: First admin login password.
- `APP_SETTINGS_ENCRYPTION_KEY`: Secret used to protect saved app settings.
- `TYPESENSE_API_KEY`: Secret key for the search engine.
Use long, unique values for each one. Do not reuse personal passwords.
### Required: Usually Keep Defaults Unless You Know You Need Changes
- `POSTGRES_USER`: Database username.
- `POSTGRES_DB`: Database name.
- `DATABASE_URL`: Connection string to the database service.
- `REDIS_URL`: Connection string to the Redis service.
- `AUTH_BOOTSTRAP_ADMIN_USERNAME`: First admin username (default `admin`).
If you change `POSTGRES_PASSWORD` or `REDIS_PASSWORD`, also update `DATABASE_URL` and `REDIS_URL` so they use the same new passwords.
### Optional User Account (Can Be Left Empty)
- `AUTH_BOOTSTRAP_USER_USERNAME`
- `AUTH_BOOTSTRAP_USER_PASSWORD`
These create an extra non-admin account on first startup.
### Network and Access Settings
- `HOST_BIND_IP`: Where services listen. Keep `127.0.0.1` for local-only access.
- `PUBLIC_BASE_URL`: Backend base URL. Local default is `http://localhost:8000`.
- `CORS_ORIGINS`: Allowed frontend origins. Keep local defaults for single-machine use.
- `VITE_API_BASE`: Frontend API URL override. Leave empty unless you know you need it.
### Environment Mode
- `APP_ENV=development`: Local mode (default).
- `APP_ENV=production`: Use when running as a real shared deployment with HTTPS and tighter security settings.
- Frontend runtime switches to a static build served by Nginx in this mode.
## Daily Use Commands
Start or rebuild: Start or rebuild:
@@ -68,87 +99,50 @@ Stop:
docker compose down docker compose down
``` ```
Tail logs: View logs:
```bash ```bash
docker compose logs -f docker compose logs -f
``` ```
Tail API and worker logs only: View backend logs only:
```bash ```bash
docker compose logs -f api worker docker compose logs -f api worker
``` ```
Reset all runtime data (destructive): ## Where Your Data Is Stored
LedgerDock stores persistent runtime data in host bind mounts. By default the host root is `./data`, or set `DCM_DATA_DIR` to move it:
- `${DCM_DATA_DIR:-./data}/db-data` for PostgreSQL data
- `${DCM_DATA_DIR:-./data}/redis-data` for Redis data
- `${DCM_DATA_DIR:-./data}/storage` for uploaded files and app storage
- `${DCM_DATA_DIR:-./data}/typesense-data` for the search index
On startup, Compose runs a one-shot `storage-init` service that creates the storage tree and applies write access for the backend runtime user `uid=10001`. If you want to inspect or repair it manually, use:
```bash ```bash
docker compose down -v mkdir -p ${DCM_DATA_DIR:-./data}/storage
sudo chown -R 10001:10001 ${DCM_DATA_DIR:-./data}/storage
sudo chmod -R u+rwX,g+rwX ${DCM_DATA_DIR:-./data}/storage
``` ```
## Frontend-Only Local Workflow To remove everything, including data:
If backend services are already running, you can run frontend tooling locally:
```bash ```bash
cd frontend && npm run dev docker compose down
cd frontend && npm run build rm -rf ${DCM_DATA_DIR:-./data}
cd frontend && npm run preview
``` ```
`npm run preview` serves the built app on port `4173`. Warning: this permanently deletes your LedgerDock data on this machine.
## Configuration ## First Checks After Install
Main runtime variables are defined in `docker-compose.yml`: - Open `http://localhost:5173` and confirm the login page appears.
- Open `http://localhost:8000/api/v1/health` and confirm you get `{"status":"ok"}`.
- Upload one sample file and confirm it appears in search.
- API and worker: `DATABASE_URL`, `REDIS_URL`, `STORAGE_ROOT`, `PUBLIC_BASE_URL`, `CORS_ORIGINS`, `TYPESENSE_*` ## Need Technical Documentation?
- Frontend: `VITE_API_BASE`
Application settings saved from the UI persist at: Developer and operator docs are in `doc/`, starting at `doc/README.md`.
- `<STORAGE_ROOT>/settings.json` (inside the storage volume)
Settings endpoints:
- `GET/PUT /api/v1/settings`
- `POST /api/v1/settings/reset`
- `POST /api/v1/settings/handwriting`
- `POST /api/v1/processing/logs/trim`
Note: the compose file currently includes host-specific URL values (for example `PUBLIC_BASE_URL` and `VITE_API_BASE`). Adjust these for your environment when needed.
## Data Persistence
Docker named volumes used by the stack:
- `db-data`
- `redis-data`
- `dcm-storage`
- `typesense-data`
## Validation Checklist
After setup or config changes, verify:
- `GET /api/v1/health` returns `{"status":"ok"}`
- Upload and processing complete successfully
- Search returns expected results
- Preview and download work for uploaded documents
- `docker compose logs -f api worker` has no failures
## Repository Layout
- `backend/` - FastAPI API, services, models, worker
- `frontend/` - React application
- `doc/` - technical documentation for architecture, API, data model, and operations
- `docker-compose.yml` - local runtime topology
## Documentation Index
- `doc/README.md` - technical documentation entrypoint
- `doc/architecture-overview.md` - service and runtime architecture
- `doc/api-contract.md` - endpoint and payload contract
- `doc/data-model-reference.md` - persistence model reference
- `doc/operations-and-configuration.md` - runtime operations and configuration
- `doc/frontend-design-foundation.md` - frontend design rules

109
REPORT.md
View File

@@ -1,109 +0,0 @@
# Security Production Readiness Report
Date: 2026-03-01
Repository: /Users/bedas/Developer/GitHub/dcm
Review Type: Static security review for production readiness
## Scope
- Backend: FastAPI API, worker queue, settings and model runtime services
- Frontend: React and Vite API client and document preview rendering
- Infrastructure: docker-compose service exposure and secret configuration
## Findings
### Critical
1. Redis queue is exposed without authentication and can be abused for worker job injection.
- Impact: If Redis is reachable by an attacker, queued job payloads can be injected and executed by the worker process, leading to remote code execution and data compromise.
- Exploit path: Reach Redis on port 6379, enqueue crafted RQ jobs into queue dcm, wait for worker consumption.
- Evidence:
- docker-compose publishes Redis host port: `docker-compose.yml:21`
- worker consumes from Redis queue directly: `docker-compose.yml:77`
- queue connection uses bare Redis URL with no auth/TLS: `backend/app/worker/queue.py:15`, `backend/app/worker/queue.py:21`
- current environment binds services to all interfaces: `.env:1`
- Remediation:
- Do not publish Redis externally in production.
- Enforce Redis authentication and TLS.
- Place Redis on a private network segment with strict ACLs.
- Treat queue producers as privileged components only.
2. Untrusted uploaded content is previewed in an unsandboxed iframe.
- Impact: Stored XSS and active content execution in preview context can enable account action abuse and data exfiltration in the browser.
- Exploit path: Upload active content (for example HTML), open preview, script executes in iframe without sandbox constraints.
- Evidence:
- upload endpoint accepts generic uploaded files: `backend/app/api/routes_documents.py:493`
- MIME type is derived from bytes and persisted: `backend/app/api/routes_documents.py:530`
- preview endpoint returns original bytes inline with stored media type: `backend/app/api/routes_documents.py:449`, `backend/app/api/routes_documents.py:457`
- frontend renders preview in iframe without sandbox attribute: `frontend/src/components/DocumentViewer.tsx:486`
- preview source is a blob URL created from fetched content: `frontend/src/components/DocumentViewer.tsx:108`, `frontend/src/components/DocumentViewer.tsx:113`
- Remediation:
- Block inline preview for script-capable MIME types.
- Add strict iframe sandboxing if iframe preview remains required.
- Prefer force-download for active formats.
- Serve untrusted preview content from an isolated origin with restrictive CSP.
### High
1. Frontend distributes a bearer token to all clients.
- Impact: Any user with browser access can extract the token and replay authenticated calls, preventing per-user accountability and increasing blast radius.
- Exploit path: Read token from frontend runtime environment or request headers, replay API requests with Authorization header.
- Evidence:
- frontend consumes token from public Vite env: `frontend/src/lib/api.ts:24`
- token is attached to every request when present: `frontend/src/lib/api.ts:38`
- compose passes `VITE_API_TOKEN` from user token: `docker-compose.yml:115`
- privileged routes rely on static token role checks: `backend/app/api/router.py:19`, `backend/app/api/auth.py:47`, `backend/app/api/auth.py:51`
- Remediation:
- Replace shared static token model with per-user authentication.
- Keep secrets server-side only.
- Use short-lived credentials with rotation and revocation.
2. Default and static service secrets are present in deploy config.
- Impact: If service ports are exposed, predictable credentials and keys allow unauthorized access to data services.
- Exploit path: Connect to published Postgres or Typesense ports and authenticate with known static values.
- Evidence:
- static Postgres credentials: `docker-compose.yml:5`, `docker-compose.yml:6`
- static Typesense key in compose and runtime env: `docker-compose.yml:29`, `docker-compose.yml:55`, `docker-compose.yml:93`
- database and Typesense ports are published to host: `docker-compose.yml:9`, `docker-compose.yml:32`
- current environment uses placeholder tokens: `.env:2`, `.env:3`, `.env:4`
- Remediation:
- Use high-entropy secrets managed outside repository configuration.
- Remove unnecessary host port publications in production.
- Restrict service network access to trusted internal components.
3. ZIP recursion depth control is not enforced across queued descendants.
- Impact: Nested archives can create uncontrolled fan-out, causing CPU, queue, and storage exhaustion.
- Exploit path: Upload ZIP containing ZIPs; children are queued as independent documents without inherited depth, repeating recursively.
- Evidence:
- configured depth limit exists: `backend/app/core/config.py:28`
- extractor takes a depth argument but is called without propagation: `backend/app/services/extractor.py:302`, `backend/app/services/extractor.py:306`
- worker invokes extractor without depth context: `backend/app/worker/tasks.py:122`
- worker enqueues child archive jobs recursively: `backend/app/worker/tasks.py:225`, `backend/app/worker/tasks.py:226`
- Remediation:
- Persist and propagate archive depth per document lineage.
- Enforce absolute descendant and fan-out limits per root upload.
- Reject nested archives beyond configured depth.
### Medium
1. OCR provider path does not apply DNS revalidation equivalent to model runtime path.
- Impact: Under permissive network flags, SSRF defenses can be weakened by DNS rebinding on OCR traffic.
- Exploit path: Persist provider URL that passes initial checks, then rebind DNS to private target before OCR requests.
- Evidence:
- task model runtime enforces `resolve_dns=True`: `backend/app/services/model_runtime.py:41`
- provider normalization in app settings does not pass DNS revalidation flag: `backend/app/services/app_settings.py:253`
- OCR runtime uses persisted URL for client base URL: `backend/app/services/app_settings.py:891`, `backend/app/services/handwriting.py:159`
- Remediation:
- Apply DNS revalidation before outbound OCR requests or on every runtime load.
- Disallow private network egress by default and require explicit controlled exceptions.
2. Provider API keys are persisted in plaintext settings on storage volume.
- Impact: File system or backup compromise reveals upstream provider secrets.
- Exploit path: Read persisted settings file from storage volume or backup artifact.
- Evidence:
- settings file location under storage root: `backend/app/services/app_settings.py:133`
- provider payload includes plaintext `api_key`: `backend/app/services/app_settings.py:268`
- settings payload is written to disk as JSON: `backend/app/services/app_settings.py:680`, `backend/app/services/app_settings.py:685`
- OCR settings read returns stored API key value for runtime: `backend/app/services/app_settings.py:894`
- Remediation:
- Move provider secrets to dedicated secret management.
- If local persistence is unavoidable, encrypt sensitive fields at rest and restrict file permissions.

View File

@@ -1,15 +1,30 @@
APP_ENV=development APP_ENV=development
DATABASE_URL=postgresql+psycopg://dcm:dcm@db:5432/dcm DATABASE_URL=postgresql+psycopg://dcm:dcm@db:5432/dcm
REDIS_URL=redis://redis:6379/0 REDIS_URL=redis://:replace-with-redis-password@redis:6379/0
REDIS_SECURITY_MODE=auto
REDIS_TLS_MODE=auto
STORAGE_ROOT=/data/storage STORAGE_ROOT=/data/storage
ADMIN_API_TOKEN=replace-with-random-admin-token AUTH_BOOTSTRAP_ADMIN_USERNAME=admin
USER_API_TOKEN=replace-with-random-user-token AUTH_BOOTSTRAP_ADMIN_PASSWORD=replace-with-random-admin-password
AUTH_BOOTSTRAP_USER_USERNAME=user
AUTH_BOOTSTRAP_USER_PASSWORD=replace-with-random-user-password
AUTH_LOGIN_FAILURE_LIMIT=5
AUTH_LOGIN_FAILURE_WINDOW_SECONDS=900
AUTH_LOGIN_LOCKOUT_BASE_SECONDS=30
AUTH_LOGIN_LOCKOUT_MAX_SECONDS=900
APP_SETTINGS_ENCRYPTION_KEY=replace-with-random-settings-encryption-key
PROCESSING_LOG_STORE_MODEL_IO_TEXT=false
PROCESSING_LOG_STORE_PAYLOAD_TEXT=false
CONTENT_EXPORT_MAX_DOCUMENTS=250
CONTENT_EXPORT_MAX_TOTAL_BYTES=52428800
CONTENT_EXPORT_RATE_LIMIT_PER_MINUTE=6
MAX_UPLOAD_FILES_PER_REQUEST=50 MAX_UPLOAD_FILES_PER_REQUEST=50
MAX_UPLOAD_FILE_SIZE_BYTES=26214400 MAX_UPLOAD_FILE_SIZE_BYTES=26214400
MAX_UPLOAD_REQUEST_SIZE_BYTES=104857600 MAX_UPLOAD_REQUEST_SIZE_BYTES=104857600
MAX_ZIP_MEMBER_UNCOMPRESSED_BYTES=26214400 MAX_ZIP_MEMBER_UNCOMPRESSED_BYTES=26214400
MAX_ZIP_TOTAL_UNCOMPRESSED_BYTES=157286400 MAX_ZIP_TOTAL_UNCOMPRESSED_BYTES=157286400
MAX_ZIP_COMPRESSION_RATIO=120 MAX_ZIP_COMPRESSION_RATIO=120
MAX_ZIP_DESCENDANTS_PER_ROOT=1000
PROVIDER_BASE_URL_ALLOWLIST=["api.openai.com"] PROVIDER_BASE_URL_ALLOWLIST=["api.openai.com"]
PROVIDER_BASE_URL_ALLOW_HTTP=false PROVIDER_BASE_URL_ALLOW_HTTP=false
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK=false PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK=false
@@ -23,6 +38,6 @@ DEFAULT_ROUTING_MODEL=gpt-4.1-mini
TYPESENSE_PROTOCOL=http TYPESENSE_PROTOCOL=http
TYPESENSE_HOST=typesense TYPESENSE_HOST=typesense
TYPESENSE_PORT=8108 TYPESENSE_PORT=8108
TYPESENSE_API_KEY=dcm-typesense-key TYPESENSE_API_KEY=replace-with-random-typesense-api-key
TYPESENSE_COLLECTION_NAME=documents TYPESENSE_COLLECTION_NAME=documents
PUBLIC_BASE_URL=http://localhost:8000 PUBLIC_BASE_URL=http://localhost:8000

View File

@@ -1,87 +1,169 @@
"""Token-based authentication and authorization dependencies for privileged API routes.""" """Authentication and authorization dependencies for protected API routes."""
from dataclasses import dataclass
from datetime import datetime
from typing import Annotated
from uuid import UUID
import hmac import hmac
from typing import Annotated from fastapi import Depends, HTTPException, Request, status
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from sqlalchemy.orm import Session
from app.core.config import Settings, get_settings from app.db.base import get_session
from app.models.auth import UserRole
from app.services.authentication import resolve_auth_session
try:
from fastapi import Cookie, Header
except (ImportError, AttributeError):
def Cookie(_default=None, **_kwargs): # type: ignore[no-untyped-def]
"""Compatibility fallback for environments that stub fastapi without request params."""
return None
def Header(_default=None, **_kwargs): # type: ignore[no-untyped-def]
"""Compatibility fallback for environments that stub fastapi without request params."""
return None
bearer_auth = HTTPBearer(auto_error=False) bearer_auth = HTTPBearer(auto_error=False)
SESSION_COOKIE_NAME = "dcm_session"
CSRF_COOKIE_NAME = "dcm_csrf"
CSRF_HEADER_NAME = "x-csrf-token"
CSRF_PROTECTED_METHODS = frozenset({"POST", "PATCH", "PUT", "DELETE"})
class AuthRole: @dataclass(frozen=True)
"""Declares supported authorization roles for privileged API operations.""" class AuthContext:
"""Carries authenticated identity and role details for one request."""
ADMIN = "admin" user_id: UUID
USER = "user" username: str
role: UserRole
session_id: UUID
expires_at: datetime
def _requires_csrf_validation(method: str) -> bool:
"""Returns whether an HTTP method should be protected by cookie CSRF validation."""
return method.upper() in CSRF_PROTECTED_METHODS
def _extract_cookie_values(request: Request, cookie_name: str) -> tuple[str, ...]:
"""Extracts all values for one cookie name from raw Cookie header order."""
request_headers = getattr(request, "headers", None)
raw_cookie_header = request_headers.get("cookie", "") if request_headers is not None else ""
if not raw_cookie_header:
return ()
extracted_values: list[str] = []
for cookie_pair in raw_cookie_header.split(";"):
normalized_pair = cookie_pair.strip()
if not normalized_pair or "=" not in normalized_pair:
continue
key, value = normalized_pair.split("=", 1)
if key.strip() != cookie_name:
continue
normalized_value = value.strip()
if normalized_value:
extracted_values.append(normalized_value)
return tuple(extracted_values)
def _raise_unauthorized() -> None: def _raise_unauthorized() -> None:
"""Raises an HTTP 401 response with bearer authentication challenge headers.""" """Raises a 401 challenge response for missing or invalid auth sessions."""
raise HTTPException( raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED, status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid or missing API token", detail="Invalid or expired authentication session",
headers={"WWW-Authenticate": "Bearer"}, headers={"WWW-Authenticate": "Bearer"},
) )
def _configured_admin_token(settings: Settings) -> str: def _raise_csrf_rejected() -> None:
"""Returns required admin token or raises configuration error when unset.""" """Raises a forbidden response for CSRF validation failure."""
token = settings.admin_api_token.strip()
if token:
return token
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Admin API token is not configured",
)
def _resolve_token_role(token: str, settings: Settings) -> str:
"""Resolves role from a bearer token using constant-time comparisons."""
admin_token = _configured_admin_token(settings)
if hmac.compare_digest(token, admin_token):
return AuthRole.ADMIN
user_token = settings.user_api_token.strip()
if user_token and hmac.compare_digest(token, user_token):
return AuthRole.USER
_raise_unauthorized()
def get_request_role(
credentials: Annotated[HTTPAuthorizationCredentials | None, Depends(bearer_auth)],
settings: Annotated[Settings, Depends(get_settings)],
) -> str:
"""Authenticates request token and returns its authorization role."""
if credentials is None:
_raise_unauthorized()
token = credentials.credentials.strip()
if not token:
_raise_unauthorized()
return _resolve_token_role(token=token, settings=settings)
def require_user_or_admin(role: Annotated[str, Depends(get_request_role)]) -> str:
"""Requires a valid user or admin token and returns resolved role."""
return role
def require_admin(role: Annotated[str, Depends(get_request_role)]) -> str:
"""Requires admin role and rejects requests authenticated as regular users."""
if role != AuthRole.ADMIN:
raise HTTPException( raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN, status_code=status.HTTP_403_FORBIDDEN,
detail="Admin token required", detail="Invalid CSRF token",
) )
return role
def get_request_auth_context(
    request: Request,
    credentials: HTTPAuthorizationCredentials | None = Depends(bearer_auth),
    csrf_header: str | None = Header(None, alias=CSRF_HEADER_NAME),
    csrf_cookie: str | None = Cookie(None, alias=CSRF_COOKIE_NAME),
    session_cookie: str | None = Cookie(None, alias=SESSION_COOKIE_NAME),
    session: Session = Depends(get_session),
) -> AuthContext:
    """Authenticates the request via bearer token or session cookie.

    Cookie-based sessions on state-changing methods must additionally present
    a CSRF header matching one of the CSRF cookie values (double-submit
    pattern). Raises 401 for missing/invalid sessions and 403 on CSRF failure.
    """

    def _candidate_values(cookie_name: str, direct_value: str | None) -> list[str]:
        # Gather every non-empty cookie value; browsers/proxies may send
        # duplicate cookies, and the directly-bound value can be missing
        # from the raw-header extraction.
        values = [value for value in _extract_cookie_values(request, cookie_name) if value]
        normalized_direct = (direct_value or "").strip()
        if normalized_direct and normalized_direct not in values:
            values.append(normalized_direct)
        return values

    bearer_token = ""
    if credentials is not None and credentials.credentials:
        bearer_token = credentials.credentials.strip()

    cookie_mode = not bearer_token
    session_candidates: list[str] = []
    if cookie_mode:
        session_candidates = _candidate_values(SESSION_COOKIE_NAME, session_cookie)
        if not session_candidates:
            _raise_unauthorized()

    if _requires_csrf_validation(request.method) and cookie_mode:
        header_value = (csrf_header or "").strip()
        csrf_candidates = _candidate_values(CSRF_COOKIE_NAME, csrf_cookie)
        # Constant-time comparison against every candidate value.
        header_matches = header_value and any(
            hmac.compare_digest(candidate, header_value) for candidate in csrf_candidates
        )
        if not header_matches:
            _raise_csrf_rejected()

    resolved_session = None
    if bearer_token:
        resolved_session = resolve_auth_session(session, token=bearer_token)
    else:
        for candidate in session_candidates:
            resolved_session = resolve_auth_session(session, token=candidate)
            if resolved_session is not None and resolved_session.user is not None:
                break

    if resolved_session is None or resolved_session.user is None:
        _raise_unauthorized()
    return AuthContext(
        user_id=resolved_session.user.id,
        username=resolved_session.user.username,
        role=resolved_session.user.role,
        session_id=resolved_session.id,
        expires_at=resolved_session.expires_at,
    )
def require_user_or_admin(context: Annotated[AuthContext, Depends(get_request_auth_context)]) -> AuthContext:
    """Requires any authenticated user session and returns its request identity context."""
    # No role filtering here: both USER and ADMIN sessions are accepted.
    return context
def require_admin(context: Annotated[AuthContext, Depends(get_request_auth_context)]) -> AuthContext:
    """Dependency gate that only admits administrator sessions.

    Raises:
        HTTPException: 403 when the authenticated session belongs to a
            standard (non-admin) user.
    """
    if context.role == UserRole.ADMIN:
        return context
    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Administrator role required",
    )

View File

@@ -2,7 +2,8 @@
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from app.api.auth import require_admin, require_user_or_admin from app.api.auth import require_admin
from app.api.routes_auth import router as auth_router
from app.api.routes_documents import router as documents_router from app.api.routes_documents import router as documents_router
from app.api.routes_health import router as health_router from app.api.routes_health import router as health_router
from app.api.routes_processing_logs import router as processing_logs_router from app.api.routes_processing_logs import router as processing_logs_router
@@ -12,11 +13,11 @@ from app.api.routes_settings import router as settings_router
api_router = APIRouter() api_router = APIRouter()
api_router.include_router(health_router) api_router.include_router(health_router)
api_router.include_router(auth_router)
api_router.include_router( api_router.include_router(
documents_router, documents_router,
prefix="/documents", prefix="/documents",
tags=["documents"], tags=["documents"],
dependencies=[Depends(require_user_or_admin)],
) )
api_router.include_router( api_router.include_router(
processing_logs_router, processing_logs_router,
@@ -28,7 +29,6 @@ api_router.include_router(
search_router, search_router,
prefix="/search", prefix="/search",
tags=["search"], tags=["search"],
dependencies=[Depends(require_user_or_admin)],
) )
api_router.include_router( api_router.include_router(
settings_router, settings_router,

View File

@@ -0,0 +1,349 @@
"""Authentication endpoints for credential login, session introspection, and logout."""
import logging
import secrets
from datetime import UTC, datetime
from urllib.parse import urlparse
from fastapi import APIRouter, Depends, HTTPException, Request, status
from sqlalchemy.orm import Session
from app.api.auth import (
AuthContext,
SESSION_COOKIE_NAME,
CSRF_COOKIE_NAME,
require_user_or_admin,
)
from app.core.config import get_settings
from app.db.base import get_session
from app.schemas.auth import (
AuthLoginRequest,
AuthLoginResponse,
AuthLogoutResponse,
AuthSessionResponse,
AuthUserResponse,
)
from app.services.auth_login_throttle import (
check_login_throttle,
clear_login_throttle,
record_failed_login_attempt,
)
try:
from fastapi import Cookie, Response
except (ImportError, AttributeError):
from fastapi.responses import Response
def Cookie(_default=None, **_kwargs): # type: ignore[no-untyped-def]
"""Compatibility fallback for environments that stub fastapi without request params."""
return None
from app.services.authentication import authenticate_user, issue_user_session, revoke_auth_session
router = APIRouter(prefix="/auth", tags=["auth"])
logger = logging.getLogger(__name__)
LOGIN_THROTTLED_DETAIL = "Too many login attempts. Try again later."
LOGIN_RATE_LIMITER_UNAVAILABLE_DETAIL = "Login rate limiter backend unavailable"
def _request_ip_address(request: Request) -> str | None:
    """Returns the transport-level client IP, or None when unavailable (e.g. test clients)."""
    client = request.client
    return None if client is None else client.host
def _request_user_agent(request: Request) -> str | None:
    """Returns the User-Agent header trimmed to 512 chars, or None when absent/blank."""
    agent = request.headers.get("user-agent", "").strip()
    if not agent:
        return None
    # Bound stored metadata so oversized headers cannot bloat session rows.
    return agent[:512]
def _retry_after_headers(retry_after_seconds: int) -> dict[str, str]:
"""Returns a bounded Retry-After header payload for throttled authentication responses."""
return {"Retry-After": str(max(1, int(retry_after_seconds)))}
def _is_https_request(request: Request) -> bool:
    """Determines whether cookies for this request should carry the Secure flag.

    Precedence: first entry of X-Forwarded-Proto (when non-empty), then the
    request URL scheme, then the configured public base URL scheme.
    """
    first_forwarded_proto = request.headers.get("x-forwarded-proto", "").strip().lower().split(",")[0]
    if first_forwarded_proto:
        return first_forwarded_proto == "https"
    url = getattr(request, "url", None)
    scheme = "" if url is None else str(getattr(url, "scheme", "")).lower()
    if scheme == "https":
        return True
    public_base = urlparse(get_settings().public_base_url.strip())
    return public_base.scheme.lower() == "https"
def _resolve_cookie_domain() -> str | None:
    """Returns the configured cookie Domain attribute, or None for host-only cookies.

    Values without a dot (single labels) are rejected because browsers refuse
    such Domain attributes.
    """
    domain = get_settings().auth_cookie_domain.strip().lower().lstrip(".")
    return domain if domain and "." in domain else None
def _resolve_cookie_domains() -> tuple[str | None, ...]:
    """Returns cookie domain variants to write, host-only (None) first.

    With a configured parent domain the cookie is emitted twice so both a
    host-only copy and a domain-wide copy exist for subdomain deployments.
    """
    domain = _resolve_cookie_domain()
    return (None,) if domain is None else (None, domain)
def _request_matches_cookie_domain(request: Request) -> bool:
    """Returns whether both the Origin host and the request host sit under the
    configured cookie domain (exact match or subdomain).

    Returns False when no cookie domain is configured or either host cannot be
    determined.
    """
    configured_domain = _resolve_cookie_domain()
    if configured_domain is None:
        return False
    origin_header = request.headers.get("origin", "").strip()
    # urlparse(...).hostname is None for values such as "null" or malformed
    # origins; guard instead of crashing with AttributeError.
    parsed_origin_host = urlparse(origin_header).hostname if origin_header else None
    origin_host = (parsed_origin_host or "").strip().lower()
    if not origin_host:
        return False
    request_url = getattr(request, "url", None)
    # getattr may yield None for hostname; avoid str(None) -> "none".
    raw_request_host = getattr(request_url, "hostname", "") if request_url is not None else ""
    request_host = str(raw_request_host or "").strip().lower()
    if not request_host:
        parsed_public_base_url = urlparse(get_settings().public_base_url.strip())
        request_host = (parsed_public_base_url.hostname or "").strip().lower()
    if not request_host:
        return False

    def _is_under_domain(candidate: str) -> bool:
        # Exact match or any subdomain of the configured parent domain.
        return candidate == configured_domain or candidate.endswith(f".{configured_domain}")

    return _is_under_domain(origin_host) and _is_under_domain(request_host)
def _resolve_cookie_samesite(request: Request, secure_cookie: bool) -> str:
    """Chooses the cookie SameSite attribute.

    Explicit "strict"/"lax" settings win; an explicit "none" is downgraded to
    "lax" for same-site traffic under the configured cookie domain; any other
    value auto-selects based on whether the cookie is Secure.
    """
    mode = get_settings().auth_cookie_samesite.strip().lower()
    if mode in ("strict", "lax"):
        return mode
    if mode == "none":
        return "lax" if _request_matches_cookie_domain(request) else "none"
    return "none" if secure_cookie else "lax"
def _session_cookie_ttl_seconds(expires_at: datetime) -> int:
"""Converts session expiration datetime into cookie max-age seconds."""
now = datetime.now(UTC)
ttl = int((expires_at - now).total_seconds())
return max(1, ttl)
def _set_session_cookie(
    response: Response,
    session_token: str,
    *,
    request: Request,
    expires_at: datetime,
    secure: bool,
) -> None:
    """Writes the HttpOnly session cookie on every configured domain variant.

    No-op for response objects that lack set_cookie (stubbed environments).
    """
    if response is None or not hasattr(response, "set_cookie"):
        return
    max_age = _session_cookie_ttl_seconds(expires_at)
    samesite = _resolve_cookie_samesite(request, secure)
    for domain in _resolve_cookie_domains():
        cookie_kwargs: dict[str, object] = {
            "value": session_token,
            "max_age": max_age,
            "httponly": True,
            "secure": secure,
            "samesite": samesite,
            "path": "/",
        }
        if domain is not None:
            cookie_kwargs["domain"] = domain
        response.set_cookie(SESSION_COOKIE_NAME, **cookie_kwargs)
def _set_csrf_cookie(
    response: Response,
    csrf_token: str,
    *,
    request: Request,
    expires_at: datetime,
    secure: bool,
) -> None:
    """Writes the JavaScript-readable CSRF cookie on every configured domain variant.

    httponly is deliberately False: the client reads this value and echoes it
    back in the CSRF request header (double-submit pattern). No-op for
    response objects that lack set_cookie.
    """
    if response is None or not hasattr(response, "set_cookie"):
        return
    samesite = _resolve_cookie_samesite(request, secure)
    # Hoisted out of the loop so every domain variant gets an identical
    # Max-Age (the per-iteration call could drift by a second), matching
    # how _set_session_cookie computes its TTL once.
    max_age = _session_cookie_ttl_seconds(expires_at)
    for domain in _resolve_cookie_domains():
        cookie_kwargs: dict[str, object] = {
            "value": csrf_token,
            "max_age": max_age,
            "httponly": False,
            "secure": secure,
            "samesite": samesite,
            "path": "/",
        }
        if domain is not None:
            cookie_kwargs["domain"] = domain
        response.set_cookie(CSRF_COOKIE_NAME, **cookie_kwargs)
def _clear_session_cookies(response: Response) -> None:
    """Deletes the session and CSRF cookies on every domain variant they were set under."""
    if response is None or not hasattr(response, "delete_cookie"):
        return
    for domain in _resolve_cookie_domains():
        delete_kwargs: dict[str, object] = {"path": "/"}
        if domain is not None:
            delete_kwargs["domain"] = domain
        response.delete_cookie(SESSION_COOKIE_NAME, **delete_kwargs)
        response.delete_cookie(CSRF_COOKIE_NAME, **delete_kwargs)
@router.post("/login", response_model=AuthLoginResponse)
def login(
    payload: AuthLoginRequest,
    request: Request,
    response: Response,
    session: Session = Depends(get_session),
) -> AuthLoginResponse:
    """Authenticates credentials under throttle protection and issues a session.

    Flow: pre-check the throttle, verify credentials (recording failures),
    clear throttle state, persist a new session, then attach session and CSRF
    cookies to the response.

    Raises:
        HTTPException: 429 while throttled, 401 on bad credentials, and 503
            when the rate-limiter backend is unreachable.
    """
    client_ip = _request_ip_address(request)
    try:
        throttle_status = check_login_throttle(username=payload.username, ip_address=client_ip)
    except RuntimeError as error:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=LOGIN_RATE_LIMITER_UNAVAILABLE_DETAIL,
        ) from error
    if throttle_status.is_throttled:
        raise HTTPException(
            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
            detail=LOGIN_THROTTLED_DETAIL,
            headers=_retry_after_headers(throttle_status.retry_after_seconds),
        )

    user = authenticate_user(session, username=payload.username, password=payload.password)
    if user is None:
        # Record the failure; this attempt may start a lockout window.
        try:
            lockout_seconds = record_failed_login_attempt(
                username=payload.username,
                ip_address=client_ip,
            )
        except RuntimeError as error:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail=LOGIN_RATE_LIMITER_UNAVAILABLE_DETAIL,
            ) from error
        if lockout_seconds > 0:
            raise HTTPException(
                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                detail=LOGIN_THROTTLED_DETAIL,
                headers=_retry_after_headers(lockout_seconds),
            )
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid username or password",
        )

    try:
        clear_login_throttle(username=payload.username, ip_address=client_ip)
    except RuntimeError:
        # Best-effort cleanup: stale throttle state must not block a valid login.
        logger.warning(
            "Failed to clear login throttle state after successful authentication: username=%s ip=%s",
            payload.username.strip().lower(),
            client_ip or "",
        )

    issued_session = issue_user_session(
        session,
        user=user,
        user_agent=_request_user_agent(request),
        ip_address=client_ip,
    )
    session.commit()

    csrf_token = secrets.token_urlsafe(32)
    secure_cookie = _is_https_request(request)
    _set_session_cookie(
        response,
        issued_session.token,
        request=request,
        expires_at=issued_session.expires_at,
        secure=secure_cookie,
    )
    _set_csrf_cookie(
        response,
        csrf_token,
        request=request,
        expires_at=issued_session.expires_at,
        secure=secure_cookie,
    )
    return AuthLoginResponse(
        user=AuthUserResponse.model_validate(user),
        expires_at=issued_session.expires_at,
        access_token=issued_session.token,
        csrf_token=csrf_token,
    )
@router.get("/me", response_model=AuthSessionResponse)
def me(
    context: AuthContext = Depends(require_user_or_admin),
    csrf_cookie: str | None = Cookie(None, alias=CSRF_COOKIE_NAME),
) -> AuthSessionResponse:
    """Returns the authenticated session's identity, expiration, and CSRF token.

    The CSRF token is echoed from the cookie (None when absent or blank) so
    clients can recover it without re-authenticating.
    """
    csrf_token = (csrf_cookie or "").strip() or None
    user = AuthUserResponse(
        id=context.user_id,
        username=context.username,
        role=context.role,
    )
    return AuthSessionResponse(
        expires_at=context.expires_at,
        user=user,
        csrf_token=csrf_token,
    )
@router.post("/logout", response_model=AuthLogoutResponse)
def logout(
    response: Response,
    context: AuthContext = Depends(require_user_or_admin),
    session: Session = Depends(get_session),
) -> AuthLogoutResponse:
    """Revokes the caller's session server-side and expires the auth cookies."""
    revoked = revoke_auth_session(session, session_id=context.session_id)
    if revoked:
        # Commit only when a session row actually changed.
        session.commit()
    _clear_session_cookies(response)
    return AuthLogoutResponse(revoked=revoked)

View File

@@ -1,12 +1,12 @@
"""Authenticated document CRUD, lifecycle, metadata, file access, and content export endpoints.""" """Authenticated document CRUD, lifecycle, metadata, file access, and content export endpoints."""
import io
import re import re
import tempfile
import unicodedata import unicodedata
import zipfile import zipfile
from datetime import datetime, time from datetime import datetime, time
from pathlib import Path from pathlib import Path
from typing import Annotated, Literal from typing import Annotated, BinaryIO, Iterator, Literal
from uuid import UUID from uuid import UUID
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
@@ -14,8 +14,10 @@ from fastapi.responses import FileResponse, Response, StreamingResponse
from sqlalchemy import or_, func, select from sqlalchemy import or_, func, select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.core.config import get_settings from app.api.auth import AuthContext, require_user_or_admin
from app.core.config import get_settings, is_inline_preview_mime_type_safe
from app.db.base import get_session from app.db.base import get_session
from app.models.auth import UserRole
from app.models.document import Document, DocumentStatus from app.models.document import Document, DocumentStatus
from app.schemas.documents import ( from app.schemas.documents import (
ContentExportRequest, ContentExportRequest,
@@ -30,6 +32,7 @@ from app.services.app_settings import read_predefined_paths_settings, read_prede
from app.services.extractor import sniff_mime from app.services.extractor import sniff_mime
from app.services.handwriting_style import delete_many_handwriting_style_documents from app.services.handwriting_style import delete_many_handwriting_style_documents
from app.services.processing_logs import log_processing_event, set_processing_log_autocommit from app.services.processing_logs import log_processing_event, set_processing_log_autocommit
from app.services.rate_limiter import increment_rate_limit
from app.services.storage import absolute_path, compute_sha256, store_bytes from app.services.storage import absolute_path, compute_sha256, store_bytes
from app.services.typesense_index import delete_many_documents_index, upsert_document_index from app.services.typesense_index import delete_many_documents_index, upsert_document_index
from app.worker.queue import get_processing_queue from app.worker.queue import get_processing_queue
@@ -39,6 +42,84 @@ router = APIRouter()
settings = get_settings() settings = get_settings()
def _scope_document_statement_for_auth_context(statement, auth_context: AuthContext):
    """Adds an owner filter to a document statement unless the caller is an admin."""
    if auth_context.role != UserRole.ADMIN:
        statement = statement.where(Document.owner_user_id == auth_context.user_id)
    return statement
def _is_predefined_entry_visible_to_auth_context(entry: dict[str, object], auth_context: AuthContext) -> bool:
    """Returns True for admins; other roles only see entries flagged global_shared."""
    return auth_context.role == UserRole.ADMIN or bool(entry.get("global_shared", False))
def _collect_visible_predefined_values(
    entries: list[dict[str, object]],
    *,
    auth_context: AuthContext,
) -> set[str]:
    """Returns the stripped, non-empty predefined values visible to the caller's role."""
    candidate_values = (
        str(entry.get("value", "")).strip()
        for entry in entries
        if _is_predefined_entry_visible_to_auth_context(entry, auth_context)
    )
    return {value for value in candidate_values if value}
def _ensure_document_access(document: Document, auth_context: AuthContext) -> None:
    """Raises a 404 when a non-admin caller does not own the document; admins always pass."""
    is_admin = auth_context.role == UserRole.ADMIN
    if not is_admin and document.owner_user_id != auth_context.user_id:
        raise HTTPException(status_code=404, detail="Document not found")
def _stream_binary_file_chunks(handle: BinaryIO, *, chunk_bytes: int) -> Iterator[bytes]:
"""Streams binary file-like content in bounded chunks and closes handle after completion."""
try:
while True:
chunk = handle.read(chunk_bytes)
if not chunk:
break
yield chunk
finally:
handle.close()
def _enforce_content_export_rate_limit(auth_context: AuthContext) -> None:
    """Enforces the per-user, 60-second-window limit on markdown export requests.

    Raises:
        HTTPException: 503 when the limiter backend is unreachable; 429 once
            the caller exceeds the configured per-minute limit. A configured
            limit of zero or less disables the 429 path.
    """
    try:
        current_count, limit = increment_rate_limit(
            scope="content-md-export",
            subject=str(auth_context.user_id),
            limit=settings.content_export_rate_limit_per_minute,
            window_seconds=60,
        )
    except RuntimeError as error:
        raise HTTPException(
            status_code=503,
            detail="Rate limiter backend unavailable",
        ) from error
    over_limit = limit > 0 and current_count > limit
    if over_limit:
        raise HTTPException(
            status_code=429,
            detail=f"Export rate limit exceeded ({limit} requests per minute)",
        )
def _parse_csv(value: str | None) -> list[str]: def _parse_csv(value: str | None) -> list[str]:
"""Parses comma-separated query values into a normalized non-empty list.""" """Parses comma-separated query values into a normalized non-empty list."""
@@ -296,6 +377,7 @@ def list_documents(
type_filter: str | None = Query(default=None), type_filter: str | None = Query(default=None),
processed_from: str | None = Query(default=None), processed_from: str | None = Query(default=None),
processed_to: str | None = Query(default=None), processed_to: str | None = Query(default=None),
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> DocumentsListResponse: ) -> DocumentsListResponse:
"""Returns paginated documents ordered by newest upload timestamp.""" """Returns paginated documents ordered by newest upload timestamp."""
@@ -305,6 +387,7 @@ def list_documents(
include_trashed=include_trashed, include_trashed=include_trashed,
path_prefix=path_prefix, path_prefix=path_prefix,
) )
base_statement = _scope_document_statement_for_auth_context(base_statement, auth_context)
base_statement = _apply_discovery_filters( base_statement = _apply_discovery_filters(
base_statement, base_statement,
path_filter=path_filter, path_filter=path_filter,
@@ -326,20 +409,23 @@ def list_documents(
@router.get("/tags") @router.get("/tags")
def list_tags( def list_tags(
include_trashed: bool = Query(default=False), include_trashed: bool = Query(default=False),
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> dict[str, list[str]]: ) -> dict[str, list[str]]:
"""Returns distinct tags currently assigned across all matching documents.""" """Returns distinct tags currently assigned across all matching documents."""
statement = select(Document.tags) statement = select(Document.tags)
statement = _scope_document_statement_for_auth_context(statement, auth_context)
if not include_trashed: if not include_trashed:
statement = statement.where(Document.status != DocumentStatus.TRASHED) statement = statement.where(Document.status != DocumentStatus.TRASHED)
rows = session.execute(statement).scalars().all() rows = session.execute(statement).scalars().all()
tags = {tag for row in rows for tag in row if tag} tags = {tag for row in rows for tag in row if tag}
tags.update( tags.update(
str(item.get("value", "")).strip() _collect_visible_predefined_values(
for item in read_predefined_tags_settings() read_predefined_tags_settings(),
if str(item.get("value", "")).strip() auth_context=auth_context,
)
) )
tags = sorted(tags) tags = sorted(tags)
return {"tags": tags} return {"tags": tags}
@@ -348,20 +434,23 @@ def list_tags(
@router.get("/paths") @router.get("/paths")
def list_paths( def list_paths(
include_trashed: bool = Query(default=False), include_trashed: bool = Query(default=False),
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> dict[str, list[str]]: ) -> dict[str, list[str]]:
"""Returns distinct logical paths currently assigned across all matching documents.""" """Returns distinct logical paths currently assigned across all matching documents."""
statement = select(Document.logical_path) statement = select(Document.logical_path)
statement = _scope_document_statement_for_auth_context(statement, auth_context)
if not include_trashed: if not include_trashed:
statement = statement.where(Document.status != DocumentStatus.TRASHED) statement = statement.where(Document.status != DocumentStatus.TRASHED)
rows = session.execute(statement).scalars().all() rows = session.execute(statement).scalars().all()
paths = {row for row in rows if row} paths = {row for row in rows if row}
paths.update( paths.update(
str(item.get("value", "")).strip() _collect_visible_predefined_values(
for item in read_predefined_paths_settings() read_predefined_paths_settings(),
if str(item.get("value", "")).strip() auth_context=auth_context,
)
) )
paths = sorted(paths) paths = sorted(paths)
return {"paths": paths} return {"paths": paths}
@@ -370,11 +459,13 @@ def list_paths(
@router.get("/types") @router.get("/types")
def list_types( def list_types(
include_trashed: bool = Query(default=False), include_trashed: bool = Query(default=False),
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> dict[str, list[str]]: ) -> dict[str, list[str]]:
"""Returns distinct document type values from extension, MIME, and image text type.""" """Returns distinct document type values from extension, MIME, and image text type."""
statement = select(Document.extension, Document.mime_type, Document.image_text_type) statement = select(Document.extension, Document.mime_type, Document.image_text_type)
statement = _scope_document_statement_for_auth_context(statement, auth_context)
if not include_trashed: if not include_trashed:
statement = statement.where(Document.status != DocumentStatus.TRASHED) statement = statement.where(Document.status != DocumentStatus.TRASHED)
rows = session.execute(statement).all() rows = session.execute(statement).all()
@@ -390,16 +481,20 @@ def list_types(
@router.post("/content-md/export") @router.post("/content-md/export")
def export_contents_markdown( def export_contents_markdown(
payload: ContentExportRequest, payload: ContentExportRequest,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> StreamingResponse: ) -> StreamingResponse:
"""Exports extracted contents for selected documents as individual markdown files in a ZIP archive.""" """Exports extracted contents for selected documents as individual markdown files in a ZIP archive."""
_enforce_content_export_rate_limit(auth_context)
has_document_ids = len(payload.document_ids) > 0 has_document_ids = len(payload.document_ids) > 0
has_path_prefix = bool(payload.path_prefix and payload.path_prefix.strip()) has_path_prefix = bool(payload.path_prefix and payload.path_prefix.strip())
if not has_document_ids and not has_path_prefix: if not has_document_ids and not has_path_prefix:
raise HTTPException(status_code=400, detail="Provide document_ids or path_prefix for export") raise HTTPException(status_code=400, detail="Provide document_ids or path_prefix for export")
statement = select(Document) statement = select(Document)
statement = _scope_document_statement_for_auth_context(statement, auth_context)
if has_document_ids: if has_document_ids:
statement = statement.where(Document.id.in_(payload.document_ids)) statement = statement.where(Document.id.in_(payload.document_ids))
if has_path_prefix: if has_path_prefix:
@@ -409,37 +504,82 @@ def export_contents_markdown(
elif not payload.include_trashed: elif not payload.include_trashed:
statement = statement.where(Document.status != DocumentStatus.TRASHED) statement = statement.where(Document.status != DocumentStatus.TRASHED)
documents = session.execute(statement.order_by(Document.logical_path.asc(), Document.created_at.asc())).scalars().all() max_documents = max(1, int(settings.content_export_max_documents))
ordered_statement = statement.order_by(Document.logical_path.asc(), Document.created_at.asc()).limit(max_documents + 1)
documents = session.execute(ordered_statement).scalars().all()
if len(documents) > max_documents:
raise HTTPException(
status_code=413,
detail=f"Export exceeds maximum document count ({len(documents)} > {max_documents})",
)
if not documents: if not documents:
raise HTTPException(status_code=404, detail="No matching documents found for export") raise HTTPException(status_code=404, detail="No matching documents found for export")
archive_buffer = io.BytesIO() max_total_bytes = max(1, int(settings.content_export_max_total_bytes))
max_spool_memory = max(64 * 1024, int(settings.content_export_spool_max_memory_bytes))
archive_file = tempfile.SpooledTemporaryFile(max_size=max_spool_memory, mode="w+b")
total_export_bytes = 0
used_entries: set[str] = set() used_entries: set[str] = set()
with zipfile.ZipFile(archive_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive: try:
with zipfile.ZipFile(archive_file, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
for document in documents: for document in documents:
markdown_bytes = _markdown_for_document(document).encode("utf-8")
total_export_bytes += len(markdown_bytes)
if total_export_bytes > max_total_bytes:
raise HTTPException(
status_code=413,
detail=(
"Export exceeds total markdown size limit "
f"({total_export_bytes} > {max_total_bytes} bytes)"
),
)
entry_name = _zip_entry_name(document, used_entries) entry_name = _zip_entry_name(document, used_entries)
archive.writestr(entry_name, _markdown_for_document(document)) archive.writestr(entry_name, markdown_bytes)
archive_file.seek(0)
except Exception:
archive_file.close()
raise
archive_buffer.seek(0) chunk_bytes = max(4 * 1024, int(settings.content_export_stream_chunk_bytes))
headers = {"Content-Disposition": 'attachment; filename="document-contents-md.zip"'} headers = {"Content-Disposition": 'attachment; filename="document-contents-md.zip"'}
return StreamingResponse(archive_buffer, media_type="application/zip", headers=headers) return StreamingResponse(
_stream_binary_file_chunks(archive_file, chunk_bytes=chunk_bytes),
media_type="application/zip",
headers=headers,
)
@router.get("/{document_id}", response_model=DocumentDetailResponse) @router.get("/{document_id}", response_model=DocumentDetailResponse)
def get_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentDetailResponse: def get_document(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> DocumentDetailResponse:
"""Returns one document by unique identifier.""" """Returns one document by unique identifier."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
return DocumentDetailResponse.model_validate(document) return DocumentDetailResponse.model_validate(document)
@router.get("/{document_id}/download") @router.get("/{document_id}/download")
def download_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse: def download_document(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> FileResponse:
"""Downloads original document bytes for the requested document identifier.""" """Downloads original document bytes for the requested document identifier."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
file_path = absolute_path(document.stored_relative_path) file_path = absolute_path(document.stored_relative_path)
@@ -447,22 +587,46 @@ def download_document(document_id: UUID, session: Session = Depends(get_session)
@router.get("/{document_id}/preview") @router.get("/{document_id}/preview")
def preview_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse: def preview_document(
"""Streams the original document inline when browser rendering is supported.""" document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> FileResponse:
"""Streams trusted-safe MIME types inline and forces attachment for active script-capable types."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
original_path = absolute_path(document.stored_relative_path) original_path = absolute_path(document.stored_relative_path)
return FileResponse(path=original_path, media_type=document.mime_type) common_headers = {"X-Content-Type-Options": "nosniff"}
if not is_inline_preview_mime_type_safe(document.mime_type):
return FileResponse(
path=original_path,
filename=document.original_filename,
media_type="application/octet-stream",
headers=common_headers,
)
return FileResponse(path=original_path, media_type=document.mime_type, headers=common_headers)
@router.get("/{document_id}/thumbnail") @router.get("/{document_id}/thumbnail")
def thumbnail_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse: def thumbnail_document(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> FileResponse:
"""Returns a generated thumbnail image for dashboard card previews.""" """Returns a generated thumbnail image for dashboard card previews."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
@@ -477,10 +641,18 @@ def thumbnail_document(document_id: UUID, session: Session = Depends(get_session
@router.get("/{document_id}/content-md") @router.get("/{document_id}/content-md")
def download_document_content_markdown(document_id: UUID, session: Session = Depends(get_session)) -> Response: def download_document_content_markdown(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> Response:
"""Downloads extracted content for one document as a markdown file.""" """Downloads extracted content for one document as a markdown file."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
@@ -497,6 +669,7 @@ async def upload_documents(
logical_path: Annotated[str, Form()] = "Inbox", logical_path: Annotated[str, Form()] = "Inbox",
tags: Annotated[str | None, Form()] = None, tags: Annotated[str | None, Form()] = None,
conflict_mode: Annotated[Literal["ask", "replace", "duplicate"], Form()] = "ask", conflict_mode: Annotated[Literal["ask", "replace", "duplicate"], Form()] = "ask",
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> UploadResponse: ) -> UploadResponse:
"""Uploads files, records metadata, and enqueues asynchronous extraction tasks.""" """Uploads files, records metadata, and enqueues asynchronous extraction tasks."""
@@ -554,7 +727,11 @@ async def upload_documents(
} }
) )
existing = session.execute(select(Document).where(Document.sha256 == sha256)).scalar_one_or_none() existing_statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.sha256 == sha256),
auth_context,
)
existing = session.execute(existing_statement).scalar_one_or_none()
if existing and conflict_mode == "ask": if existing and conflict_mode == "ask":
log_processing_event( log_processing_event(
session=session, session=session,
@@ -581,9 +758,11 @@ async def upload_documents(
return UploadResponse(uploaded=[], conflicts=conflicts) return UploadResponse(uploaded=[], conflicts=conflicts)
for prepared in prepared_uploads: for prepared in prepared_uploads:
existing = session.execute( existing_statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.sha256 == str(prepared["sha256"])) select(Document).where(Document.sha256 == str(prepared["sha256"])),
).scalar_one_or_none() auth_context,
)
existing = session.execute(existing_statement).scalar_one_or_none()
replaces_document_id = existing.id if existing and conflict_mode == "replace" else None replaces_document_id = existing.id if existing and conflict_mode == "replace" else None
stored_relative_path = store_bytes(str(prepared["filename"]), bytes(prepared["data"])) stored_relative_path = store_bytes(str(prepared["filename"]), bytes(prepared["data"]))
@@ -598,6 +777,7 @@ async def upload_documents(
size_bytes=len(bytes(prepared["data"])), size_bytes=len(bytes(prepared["data"])),
logical_path=logical_path, logical_path=logical_path,
tags=list(normalized_tags), tags=list(normalized_tags),
owner_user_id=auth_context.user_id,
replaces_document_id=replaces_document_id, replaces_document_id=replaces_document_id,
metadata_json={"upload": "web"}, metadata_json={"upload": "web"},
) )
@@ -629,11 +809,16 @@ async def upload_documents(
def update_document( def update_document(
document_id: UUID, document_id: UUID,
payload: DocumentUpdateRequest, payload: DocumentUpdateRequest,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> DocumentResponse: ) -> DocumentResponse:
"""Updates document metadata and refreshes semantic index representation.""" """Updates document metadata and refreshes semantic index representation."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
@@ -655,10 +840,18 @@ def update_document(
@router.post("/{document_id}/trash", response_model=DocumentResponse) @router.post("/{document_id}/trash", response_model=DocumentResponse)
def trash_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse: def trash_document(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> DocumentResponse:
"""Marks a document as trashed without deleting files from storage.""" """Marks a document as trashed without deleting files from storage."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
@@ -679,10 +872,18 @@ def trash_document(document_id: UUID, session: Session = Depends(get_session)) -
@router.post("/{document_id}/restore", response_model=DocumentResponse) @router.post("/{document_id}/restore", response_model=DocumentResponse)
def restore_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse: def restore_document(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> DocumentResponse:
"""Restores a trashed document to its previous lifecycle status.""" """Restores a trashed document to its previous lifecycle status."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
@@ -704,16 +905,27 @@ def restore_document(document_id: UUID, session: Session = Depends(get_session))
@router.delete("/{document_id}") @router.delete("/{document_id}")
def delete_document(document_id: UUID, session: Session = Depends(get_session)) -> dict[str, int]: def delete_document(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> dict[str, int]:
"""Permanently deletes a document and all descendant archive members including stored files.""" """Permanently deletes a document and all descendant archive members including stored files."""
root = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() root_statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
root = session.execute(root_statement).scalar_one_or_none()
if root is None: if root is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
if root.status != DocumentStatus.TRASHED: if root.status != DocumentStatus.TRASHED:
raise HTTPException(status_code=400, detail="Move document to trash before permanent deletion") raise HTTPException(status_code=400, detail="Move document to trash before permanent deletion")
document_tree = _collect_document_tree(session=session, root_document_id=document_id) document_tree = _collect_document_tree(session=session, root_document_id=document_id)
if auth_context.role != UserRole.ADMIN:
for _, document in document_tree:
_ensure_document_access(document, auth_context)
document_ids = [document.id for _, document in document_tree] document_ids = [document.id for _, document in document_tree]
try: try:
delete_many_documents_index([str(current_id) for current_id in document_ids]) delete_many_documents_index([str(current_id) for current_id in document_ids])
@@ -744,10 +956,18 @@ def delete_document(document_id: UUID, session: Session = Depends(get_session))
@router.post("/{document_id}/reprocess", response_model=DocumentResponse) @router.post("/{document_id}/reprocess", response_model=DocumentResponse)
def reprocess_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse: def reprocess_document(
document_id: UUID,
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session),
) -> DocumentResponse:
"""Re-enqueues a document for extraction and suggestion processing.""" """Re-enqueues a document for extraction and suggestion processing."""
document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none() statement = _scope_document_statement_for_auth_context(
select(Document).where(Document.id == document_id),
auth_context,
)
document = session.execute(statement).scalar_one_or_none()
if document is None: if document is None:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
if document.status == DocumentStatus.TRASHED: if document.status == DocumentStatus.TRASHED:

View File

@@ -4,7 +4,8 @@ from fastapi import APIRouter, Depends, Query
from sqlalchemy import Text, cast, func, select from sqlalchemy import Text, cast, func, select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.api.routes_documents import _apply_discovery_filters from app.api.auth import AuthContext, require_user_or_admin
from app.api.routes_documents import _apply_discovery_filters, _scope_document_statement_for_auth_context
from app.db.base import get_session from app.db.base import get_session
from app.models.document import Document, DocumentStatus from app.models.document import Document, DocumentStatus
from app.schemas.documents import DocumentResponse, SearchResponse from app.schemas.documents import DocumentResponse, SearchResponse
@@ -25,6 +26,7 @@ def search_documents(
type_filter: str | None = Query(default=None), type_filter: str | None = Query(default=None),
processed_from: str | None = Query(default=None), processed_from: str | None = Query(default=None),
processed_to: str | None = Query(default=None), processed_to: str | None = Query(default=None),
auth_context: AuthContext = Depends(require_user_or_admin),
session: Session = Depends(get_session), session: Session = Depends(get_session),
) -> SearchResponse: ) -> SearchResponse:
"""Searches documents using PostgreSQL full-text ranking plus metadata matching.""" """Searches documents using PostgreSQL full-text ranking plus metadata matching."""
@@ -50,6 +52,7 @@ def search_documents(
) )
statement = select(Document).where(search_filter) statement = select(Document).where(search_filter)
statement = _scope_document_statement_for_auth_context(statement, auth_context)
if only_trashed: if only_trashed:
statement = statement.where(Document.status == DocumentStatus.TRASHED) statement = statement.where(Document.status == DocumentStatus.TRASHED)
elif not include_trashed: elif not include_trashed:
@@ -67,6 +70,7 @@ def search_documents(
items = session.execute(statement).scalars().all() items = session.execute(statement).scalars().all()
count_statement = select(func.count(Document.id)).where(search_filter) count_statement = select(func.count(Document.id)).where(search_filter)
count_statement = _scope_document_statement_for_auth_context(count_statement, auth_context)
if only_trashed: if only_trashed:
count_statement = count_statement.where(Document.status == DocumentStatus.TRASHED) count_statement = count_statement.where(Document.status == DocumentStatus.TRASHED)
elif not include_trashed: elif not include_trashed:

View File

@@ -19,19 +19,39 @@ class Settings(BaseSettings):
app_env: str = "development" app_env: str = "development"
database_url: str = "postgresql+psycopg://dcm:dcm@db:5432/dcm" database_url: str = "postgresql+psycopg://dcm:dcm@db:5432/dcm"
redis_url: str = "redis://redis:6379/0" redis_url: str = "redis://redis:6379/0"
redis_security_mode: str = "auto"
redis_tls_mode: str = "auto"
auth_bootstrap_admin_username: str = "admin"
auth_bootstrap_admin_password: str = ""
auth_bootstrap_user_username: str = ""
auth_bootstrap_user_password: str = ""
auth_session_ttl_minutes: int = 720
auth_password_pbkdf2_iterations: int = 390000
auth_session_token_bytes: int = 32
auth_session_pepper: str = ""
auth_login_failure_limit: int = 5
auth_login_failure_window_seconds: int = 900
auth_login_lockout_base_seconds: int = 30
auth_login_lockout_max_seconds: int = 900
auth_cookie_domain: str = ""
auth_cookie_samesite: str = "auto"
storage_root: Path = Path("/data/storage") storage_root: Path = Path("/data/storage")
upload_chunk_size: int = 4 * 1024 * 1024 upload_chunk_size: int = 4 * 1024 * 1024
max_upload_files_per_request: int = 50 max_upload_files_per_request: int = 50
max_upload_file_size_bytes: int = 25 * 1024 * 1024 max_upload_file_size_bytes: int = 25 * 1024 * 1024
max_upload_request_size_bytes: int = 100 * 1024 * 1024 max_upload_request_size_bytes: int = 100 * 1024 * 1024
content_export_max_documents: int = 250
content_export_max_total_bytes: int = 50 * 1024 * 1024
content_export_rate_limit_per_minute: int = 6
content_export_stream_chunk_bytes: int = 256 * 1024
content_export_spool_max_memory_bytes: int = 2 * 1024 * 1024
max_zip_members: int = 250 max_zip_members: int = 250
max_zip_depth: int = 2 max_zip_depth: int = 2
max_zip_descendants_per_root: int = 1000
max_zip_member_uncompressed_bytes: int = 25 * 1024 * 1024 max_zip_member_uncompressed_bytes: int = 25 * 1024 * 1024
max_zip_total_uncompressed_bytes: int = 150 * 1024 * 1024 max_zip_total_uncompressed_bytes: int = 150 * 1024 * 1024
max_zip_compression_ratio: float = 120.0 max_zip_compression_ratio: float = 120.0
max_text_length: int = 500_000 max_text_length: int = 500_000
admin_api_token: str = ""
user_api_token: str = ""
provider_base_url_allowlist: list[str] = Field(default_factory=lambda: ["api.openai.com"]) provider_base_url_allowlist: list[str] = Field(default_factory=lambda: ["api.openai.com"])
provider_base_url_allow_http: bool = False provider_base_url_allow_http: bool = False
provider_base_url_allow_private_network: bool = False provider_base_url_allow_private_network: bool = False
@@ -39,17 +59,20 @@ class Settings(BaseSettings):
processing_log_max_unbound_entries: int = 400 processing_log_max_unbound_entries: int = 400
processing_log_max_payload_chars: int = 4096 processing_log_max_payload_chars: int = 4096
processing_log_max_text_chars: int = 12000 processing_log_max_text_chars: int = 12000
processing_log_store_model_io_text: bool = False
processing_log_store_payload_text: bool = False
default_openai_base_url: str = "https://api.openai.com/v1" default_openai_base_url: str = "https://api.openai.com/v1"
default_openai_model: str = "gpt-4.1-mini" default_openai_model: str = "gpt-4.1-mini"
default_openai_timeout_seconds: int = 45 default_openai_timeout_seconds: int = 45
default_openai_handwriting_enabled: bool = True default_openai_handwriting_enabled: bool = True
default_openai_api_key: str = "" default_openai_api_key: str = ""
app_settings_encryption_key: str = ""
default_summary_model: str = "gpt-4.1-mini" default_summary_model: str = "gpt-4.1-mini"
default_routing_model: str = "gpt-4.1-mini" default_routing_model: str = "gpt-4.1-mini"
typesense_protocol: str = "http" typesense_protocol: str = "http"
typesense_host: str = "typesense" typesense_host: str = "typesense"
typesense_port: int = 8108 typesense_port: int = 8108
typesense_api_key: str = "dcm-typesense-key" typesense_api_key: str = ""
typesense_collection_name: str = "documents" typesense_collection_name: str = "documents"
typesense_timeout_seconds: int = 120 typesense_timeout_seconds: int = 120
typesense_num_retries: int = 0 typesense_num_retries: int = 0
@@ -58,6 +81,111 @@ class Settings(BaseSettings):
LOCAL_HOSTNAME_SUFFIXES = (".local", ".internal", ".home.arpa") LOCAL_HOSTNAME_SUFFIXES = (".local", ".internal", ".home.arpa")
# MIME types that can run active content (scripts) when a browser renders them
# inline; preview endpoints must never serve these with an inline disposition.
SCRIPT_CAPABLE_INLINE_MIME_TYPES = frozenset(
    {
        "application/ecmascript",
        "application/javascript",
        "application/x-javascript",
        "application/xhtml+xml",
        "image/svg+xml",
        "text/ecmascript",
        "text/html",
        "text/javascript",
    }
)
# Bare XML types are blocked for inline preview as well: XML documents can embed
# scriptable content (e.g. XHTML/SVG islands) depending on browser handling.
SCRIPT_CAPABLE_XML_MIME_TYPES = frozenset({"application/xml", "text/xml"})
# Supported values for the redis_security_mode / redis_tls_mode settings;
# unrecognized values are normalized to "auto" by the helpers below.
REDIS_SECURITY_MODES = frozenset({"auto", "strict", "compat"})
REDIS_TLS_MODES = frozenset({"auto", "required", "allow_insecure"})
def _is_production_environment(app_env: str) -> bool:
"""Returns whether the runtime environment should enforce production-only security gates."""
normalized = app_env.strip().lower()
return normalized in {"production", "prod"}
def _normalize_redis_security_mode(raw_mode: str) -> str:
    """Normalizes Redis security mode values into one supported mode.

    Unknown or malformed values fall back to "auto" so misconfiguration never
    disables validation outright.
    """
    candidate = raw_mode.strip().lower()
    return candidate if candidate in REDIS_SECURITY_MODES else "auto"
def _normalize_redis_tls_mode(raw_mode: str) -> str:
    """Normalizes Redis TLS mode values into one supported mode.

    Unknown or malformed values fall back to "auto" so misconfiguration never
    silently relaxes the TLS requirement logic.
    """
    candidate = raw_mode.strip().lower()
    return candidate if candidate in REDIS_TLS_MODES else "auto"
def validate_redis_url_security(
    redis_url: str,
    *,
    app_env: str | None = None,
    security_mode: str | None = None,
    tls_mode: str | None = None,
) -> str:
    """Validates Redis URL security posture with production fail-closed defaults.

    Keyword overrides take precedence over the corresponding settings values.
    In strict mode (explicit, or "auto" in a production environment) the URL
    must carry a password; when TLS is required the scheme must be rediss://.

    Returns the trimmed URL on success; raises ValueError on any violation.
    """
    settings = get_settings()
    effective_env = settings.app_env if app_env is None else app_env
    effective_security = _normalize_redis_security_mode(
        settings.redis_security_mode if security_mode is None else security_mode
    )
    effective_tls = _normalize_redis_tls_mode(
        settings.redis_tls_mode if tls_mode is None else tls_mode
    )

    candidate = redis_url.strip()
    if not candidate:
        raise ValueError("Redis URL must not be empty")
    parsed = urlparse(candidate)
    scheme = parsed.scheme.lower()
    if scheme not in {"redis", "rediss"}:
        raise ValueError("Redis URL must use redis:// or rediss://")
    if not parsed.hostname:
        raise ValueError("Redis URL must include a hostname")

    # "auto" fails closed: production environments imply strict security, and
    # strict security implies TLS unless a TLS mode was chosen explicitly.
    strict_security = effective_security == "strict" or (
        effective_security == "auto" and _is_production_environment(effective_env)
    )
    require_tls = effective_tls == "required" or (
        effective_tls == "auto" and strict_security
    )

    if strict_security and not (parsed.password and parsed.password.strip()):
        raise ValueError("Redis URL must include authentication when security mode is strict")
    if require_tls and scheme != "rediss":
        raise ValueError("Redis URL must use rediss:// when TLS is required")
    return candidate
def is_inline_preview_mime_type_safe(mime_type: str) -> bool:
    """Returns whether a MIME type is safe to serve inline from untrusted document uploads.

    The type is lowercased and stripped of parameters (everything after ";").
    Empty types, script-capable types, XML types, and any "+xml" structured
    syntax suffix are rejected; everything else is considered inline-safe.
    """
    if not mime_type:
        return False
    normalized = mime_type.split(";", 1)[0].strip().lower()
    if not normalized:
        return False
    script_capable = (
        normalized in SCRIPT_CAPABLE_INLINE_MIME_TYPES
        or normalized in SCRIPT_CAPABLE_XML_MIME_TYPES
        or normalized.endswith("+xml")
    )
    return not script_capable
def _normalize_allowlist(allowlist: object) -> tuple[str, ...]: def _normalize_allowlist(allowlist: object) -> tuple[str, ...]:

View File

@@ -10,6 +10,7 @@ from app.api.router import api_router
from app.core.config import get_settings from app.core.config import get_settings
from app.db.base import init_db from app.db.base import init_db
from app.services.app_settings import ensure_app_settings from app.services.app_settings import ensure_app_settings
from app.services.authentication import ensure_bootstrap_users
from app.services.handwriting_style import ensure_handwriting_style_collection from app.services.handwriting_style import ensure_handwriting_style_collection
from app.services.storage import ensure_storage from app.services.storage import ensure_storage
from app.services.typesense_index import ensure_typesense_collection from app.services.typesense_index import ensure_typesense_collection
@@ -34,9 +35,10 @@ def create_app() -> FastAPI:
"""Builds and configures the FastAPI application instance.""" """Builds and configures the FastAPI application instance."""
app = FastAPI(title="DCM DMS API", version="0.1.0") app = FastAPI(title="DCM DMS API", version="0.1.0")
allowed_origins = [origin.strip() for origin in settings.cors_origins if isinstance(origin, str) and origin.strip()]
app.add_middleware( app.add_middleware(
CORSMiddleware, CORSMiddleware,
allow_origins=settings.cors_origins, allow_origins=allowed_origins,
allow_credentials=True, allow_credentials=True,
allow_methods=["*"], allow_methods=["*"],
allow_headers=["*"], allow_headers=["*"],
@@ -80,8 +82,9 @@ def create_app() -> FastAPI:
"""Initializes storage directories and database schema on service startup.""" """Initializes storage directories and database schema on service startup."""
ensure_storage() ensure_storage()
ensure_app_settings()
init_db() init_db()
ensure_bootstrap_users()
ensure_app_settings()
try: try:
ensure_typesense_collection() ensure_typesense_collection()
except Exception: except Exception:

View File

@@ -1,6 +1,7 @@
"""Model exports for ORM metadata discovery.""" """Model exports for ORM metadata discovery."""
from app.models.auth import AppUser, AuthSession, UserRole
from app.models.document import Document, DocumentStatus from app.models.document import Document, DocumentStatus
from app.models.processing_log import ProcessingLogEntry from app.models.processing_log import ProcessingLogEntry
__all__ = ["Document", "DocumentStatus", "ProcessingLogEntry"] __all__ = ["AppUser", "AuthSession", "Document", "DocumentStatus", "ProcessingLogEntry", "UserRole"]

View File

@@ -0,0 +1,66 @@
"""Data models for authenticated users and issued API sessions."""
import uuid
from datetime import UTC, datetime
from enum import Enum
from sqlalchemy import Boolean, DateTime, Enum as SqlEnum, ForeignKey, String
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
class UserRole(str, Enum):
    """Declares authorization roles used for API route access control."""

    # The str mixin makes role values serialize as plain strings and compare
    # equal to their literal counterparts ("admin" / "user").
    ADMIN = "admin"
    USER = "user"
class AppUser(Base):
    """Stores one authenticatable user account with role-bound authorization."""

    __tablename__ = "app_users"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Login identifier; uniqueness is enforced at the database level.
    username: Mapped[str] = mapped_column(String(128), nullable=False, unique=True, index=True)
    # Derived credential material only — presumably PBKDF2 (see the
    # auth_password_pbkdf2_iterations setting); confirm in the authentication service.
    password_hash: Mapped[str] = mapped_column(String(512), nullable=False)
    role: Mapped[UserRole] = mapped_column(SqlEnum(UserRole), nullable=False, default=UserRole.USER)
    # Soft-disable switch: inactive accounts keep their row and session history.
    is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True)
    # Timezone-aware UTC timestamps; updated_at refreshes on every ORM update.
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, default=lambda: datetime.now(UTC))
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(UTC),
        onupdate=lambda: datetime.now(UTC),
    )
    # Deleting a user cascades to every session issued to that user.
    sessions: Mapped[list["AuthSession"]] = relationship(
        "AuthSession",
        back_populates="user",
        cascade="all, delete-orphan",
    )
class AuthSession(Base):
    """Stores one issued bearer session token for a specific authenticated user."""

    __tablename__ = "auth_sessions"

    id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # DB-level cascade removes sessions when the owning user row is deleted.
    user_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), ForeignKey("app_users.id", ondelete="CASCADE"), nullable=False, index=True)
    # Unique digest of the bearer token — presumably only the hash is stored,
    # never the raw token; verify against the session-issuing code.
    token_hash: Mapped[str] = mapped_column(String(128), nullable=False, unique=True, index=True)
    # Indexed, presumably to support efficient expiry sweeps.
    expires_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, index=True)
    # NULL while the session is still valid; set when explicitly revoked.
    revoked_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
    # Optional client metadata captured at login for auditing.
    user_agent: Mapped[str | None] = mapped_column(String(512), nullable=True)
    ip_address: Mapped[str | None] = mapped_column(String(64), nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, default=lambda: datetime.now(UTC))
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(UTC),
        onupdate=lambda: datetime.now(UTC),
    )
    user: Mapped[AppUser] = relationship("AppUser", back_populates="sessions")

View File

@@ -38,6 +38,12 @@ class Document(Base):
suggested_path: Mapped[str | None] = mapped_column(String(1024), nullable=True) suggested_path: Mapped[str | None] = mapped_column(String(1024), nullable=True)
tags: Mapped[list[str]] = mapped_column(ARRAY(String), nullable=False, default=list) tags: Mapped[list[str]] = mapped_column(ARRAY(String), nullable=False, default=list)
suggested_tags: Mapped[list[str]] = mapped_column(ARRAY(String), nullable=False, default=list) suggested_tags: Mapped[list[str]] = mapped_column(ARRAY(String), nullable=False, default=list)
owner_user_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True),
ForeignKey("app_users.id", ondelete="SET NULL"),
nullable=True,
index=True,
)
metadata_json: Mapped[dict] = mapped_column(JSONB, nullable=False, default=dict) metadata_json: Mapped[dict] = mapped_column(JSONB, nullable=False, default=dict)
extracted_text: Mapped[str] = mapped_column(Text, nullable=False, default="") extracted_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
image_text_type: Mapped[str | None] = mapped_column(String(64), nullable=True) image_text_type: Mapped[str | None] = mapped_column(String(64), nullable=True)
@@ -63,3 +69,4 @@ class Document(Base):
foreign_keys=[parent_document_id], foreign_keys=[parent_document_id],
post_update=True, post_update=True,
) )
owner_user: Mapped["AppUser | None"] = relationship("AppUser", foreign_keys=[owner_user_id], post_update=True)

View File

@@ -0,0 +1,50 @@
"""Pydantic schemas for authentication and session API payloads."""
from datetime import datetime
from uuid import UUID
from pydantic import BaseModel, Field
from app.models.auth import UserRole
class AuthLoginRequest(BaseModel):
    """Represents credential input used to create one authenticated API session."""

    # max_length mirrors the app_users.username column width (String(128)).
    username: str = Field(min_length=1, max_length=128)
    password: str = Field(min_length=1, max_length=256)
class AuthUserResponse(BaseModel):
    """Represents one authenticated user identity and authorization role."""

    id: UUID
    username: str
    role: UserRole

    class Config:
        """Enables ORM object parsing for SQLAlchemy model instances."""

        # NOTE(review): pydantic v2 prefers model_config = ConfigDict(from_attributes=True);
        # the inner Config class still works but is deprecated — confirm project pydantic version.
        from_attributes = True
class AuthSessionResponse(BaseModel):
    """Represents active session metadata for one authenticated user."""

    user: AuthUserResponse
    expires_at: datetime
    # Optional — presumably populated only for cookie sessions that use CSRF
    # double-submit; verify against the auth route handlers.
    csrf_token: str | None = None
class AuthLoginResponse(AuthSessionResponse):
    """Represents one newly issued bearer token and associated user context.

    Inherits user, expires_at, and csrf_token from AuthSessionResponse; the
    csrf_token field was previously redeclared here with an identical
    type/default, which was redundant and risked the two defaults drifting
    apart — the inherited declaration is the single source of truth now.
    """

    # Optional so cookie-only deployments can omit the token from the body.
    access_token: str | None = None
    token_type: str = "bearer"
class AuthLogoutResponse(BaseModel):
    """Represents logout outcome after current session revocation attempt."""

    # Presumably True when an active session was found and revoked, False when
    # the request carried no revocable session — verify against the logout route.
    revoked: bool

View File

@@ -1,10 +1,24 @@
"""Persistent single-user application settings service backed by host-mounted storage.""" """Persistent single-user application settings service backed by host-mounted storage."""
import base64
import binascii
import hashlib
import hmac
import json import json
import os
import re import re
import secrets
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
# cryptography is treated as optional at import time: when it is missing or
# broken we install sentinel fallbacks (Fernet = None plus a local InvalidToken)
# so module import succeeds and callers can degrade to the legacy HMAC scheme.
try:
    from cryptography.fernet import Fernet, InvalidToken
except Exception:  # pragma: no cover - dependency failures are surfaced at runtime usage.
    Fernet = None  # type: ignore[assignment]

    class InvalidToken(Exception):
        """Fallback InvalidToken type used when cryptography dependency import fails."""
@@ -57,6 +71,221 @@ DEFAULT_ROUTING_PROMPT = (
"Confidence must be between 0 and 1." "Confidence must be between 0 and 1."
) )
# Ciphertext format markers: "enc-v2" is the current format, "enc-v1" the
# legacy HMAC-stream format retained for migration reads of old settings files.
PROVIDER_API_KEY_CIPHERTEXT_PREFIX = "enc-v2"
PROVIDER_API_KEY_LEGACY_CIPHERTEXT_PREFIX = "enc-v1"
# Dot-file under storage_root that persists the generated symmetric key (0o600).
PROVIDER_API_KEY_KEYFILE_NAME = ".settings-api-key"
# Domain-separation contexts keep the legacy HMAC keystream and auth tag
# derivations independent even though both use the same master key.
PROVIDER_API_KEY_LEGACY_STREAM_CONTEXT = b"dcm-provider-api-key-stream"
PROVIDER_API_KEY_LEGACY_AUTH_CONTEXT = b"dcm-provider-api-key-auth"
PROVIDER_API_KEY_LEGACY_NONCE_BYTES = 16
PROVIDER_API_KEY_LEGACY_TAG_BYTES = 32
def _settings_api_key_path() -> Path:
    """Resolves where the locally generated settings-encryption key lives on disk."""
    storage_root = settings.storage_root
    return storage_root / PROVIDER_API_KEY_KEYFILE_NAME
def _write_private_text_file(path: Path, content: str) -> None:
"""Writes text files with restrictive owner-only permissions for local secret material."""
path.parent.mkdir(parents=True, exist_ok=True)
file_descriptor = os.open(str(path), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(file_descriptor, "w", encoding="utf-8") as handle:
handle.write(content)
os.chmod(path, 0o600)
def _urlsafe_b64encode_no_padding(data: bytes) -> str:
"""Encodes bytes to URL-safe base64 without padding for compact JSON persistence."""
return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=")
def _urlsafe_b64decode_no_padding(data: str) -> bytes:
"""Decodes URL-safe base64 values that may omit trailing padding characters."""
padded = data + "=" * (-len(data) % 4)
return base64.urlsafe_b64decode(padded.encode("ascii"))
def _derive_provider_api_key_key() -> bytes:
    """Resolves the master key used to encrypt provider API keys for settings storage.

    Preference order: (1) the configured app_settings_encryption_key, (2) the
    previously persisted key file under storage_root, (3) a freshly generated
    32-byte key that is persisted for future runs. Always returns 32 bytes.
    """
    configured_key = settings.app_settings_encryption_key.strip()
    if configured_key:
        try:
            decoded = _urlsafe_b64decode_no_padding(configured_key)
            if len(decoded) >= 32:
                return decoded[:32]
        except (binascii.Error, ValueError):
            pass
        # Non-base64 (or too-short) configured values are still honored by
        # hashing them down to exactly 32 bytes of key material.
        return hashlib.sha256(configured_key.encode("utf-8")).digest()
    key_path = _settings_api_key_path()
    if key_path.exists():
        try:
            persisted = key_path.read_text(encoding="utf-8").strip()
            decoded = _urlsafe_b64decode_no_padding(persisted)
            if len(decoded) >= 32:
                return decoded[:32]
        except (OSError, UnicodeDecodeError, binascii.Error, ValueError):
            # Unreadable or corrupt key files fall through to regeneration,
            # which overwrites the bad file below.
            pass
    generated = secrets.token_bytes(32)
    _write_private_text_file(key_path, _urlsafe_b64encode_no_padding(generated))
    return generated
def _legacy_xor_bytes(left: bytes, right: bytes) -> bytes:
"""Applies byte-wise XOR for equal-length byte sequences used by legacy ciphertext migration."""
return bytes(first ^ second for first, second in zip(left, right))
def _legacy_derive_stream_cipher_bytes(master_key: bytes, nonce: bytes, length: int) -> bytes:
    """Derives the legacy deterministic keystream from counter-mode HMAC-SHA256 blocks."""
    blocks: list[bytes] = []
    produced = 0
    block_index = 0
    while produced < length:
        message = PROVIDER_API_KEY_LEGACY_STREAM_CONTEXT + nonce + block_index.to_bytes(4, "big")
        digest = hmac.new(master_key, message, hashlib.sha256).digest()
        blocks.append(digest)
        produced += len(digest)
        block_index += 1
    return b"".join(blocks)[:length]
def _provider_key_fernet(master_key: bytes) -> Fernet:
    """Builds a Fernet instance from the first 32 bytes of symmetric key material.

    Raises:
        AppSettingsValidationError: When the optional cryptography dependency is absent.
    """
    if Fernet is None:
        raise AppSettingsValidationError("cryptography dependency is not available")
    return Fernet(base64.urlsafe_b64encode(master_key[:32]))
def _encrypt_provider_api_key_fallback(value: str) -> str:
    """Encrypts provider keys via the legacy HMAC-stream scheme when cryptography is missing."""
    master_key = _derive_provider_api_key_key()
    nonce = secrets.token_bytes(PROVIDER_API_KEY_LEGACY_NONCE_BYTES)
    plaintext = value.encode("utf-8")
    ciphertext = _legacy_xor_bytes(
        plaintext,
        _legacy_derive_stream_cipher_bytes(master_key, nonce, len(plaintext)),
    )
    # Authenticate nonce + ciphertext so tampering is detectable on read.
    tag = hmac.new(
        master_key,
        PROVIDER_API_KEY_LEGACY_AUTH_CONTEXT + nonce + ciphertext,
        hashlib.sha256,
    ).digest()
    encoded = _urlsafe_b64encode_no_padding(nonce + ciphertext + tag)
    return f"{PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:{encoded}"
def _encrypt_provider_api_key(value: str) -> str:
    """Encrypts one provider API key for at-rest JSON persistence; blank input yields ''."""
    stripped = value.strip()
    if not stripped:
        return ""
    if Fernet is None:
        # Fall back to the legacy HMAC-stream scheme without cryptography installed.
        return _encrypt_provider_api_key_fallback(stripped)
    fernet = _provider_key_fernet(_derive_provider_api_key_key())
    token = fernet.encrypt(stripped.encode("utf-8")).decode("ascii")
    return f"{PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:{token}"
def _decrypt_provider_api_key_legacy_payload(encoded_payload: str) -> str:
    """Authenticates and decrypts one legacy stream-cipher payload for migration reads.

    Raises:
        AppSettingsValidationError: For empty, malformed, truncated, tampered,
            or non-UTF-8 payloads.
    """
    if not encoded_payload:
        raise AppSettingsValidationError("Provider API key ciphertext is missing payload bytes")
    try:
        payload = _urlsafe_b64decode_no_padding(encoded_payload)
    except (binascii.Error, ValueError) as error:
        raise AppSettingsValidationError("Provider API key ciphertext is not valid base64") from error
    if len(payload) < PROVIDER_API_KEY_LEGACY_NONCE_BYTES + PROVIDER_API_KEY_LEGACY_TAG_BYTES:
        raise AppSettingsValidationError("Provider API key ciphertext payload is truncated")
    nonce = payload[:PROVIDER_API_KEY_LEGACY_NONCE_BYTES]
    received_tag = payload[-PROVIDER_API_KEY_LEGACY_TAG_BYTES:]
    ciphertext = payload[PROVIDER_API_KEY_LEGACY_NONCE_BYTES:-PROVIDER_API_KEY_LEGACY_TAG_BYTES]
    master_key = _derive_provider_api_key_key()
    expected_tag = hmac.new(
        master_key,
        PROVIDER_API_KEY_LEGACY_AUTH_CONTEXT + nonce + ciphertext,
        hashlib.sha256,
    ).digest()
    # Constant-time comparison guards against tag-forgery timing side channels.
    if not hmac.compare_digest(received_tag, expected_tag):
        raise AppSettingsValidationError("Provider API key ciphertext integrity check failed")
    keystream = _legacy_derive_stream_cipher_bytes(master_key, nonce, len(ciphertext))
    try:
        return _legacy_xor_bytes(ciphertext, keystream).decode("utf-8").strip()
    except UnicodeDecodeError as error:
        raise AppSettingsValidationError("Provider API key ciphertext is not valid UTF-8") from error
def _decrypt_provider_api_key_legacy(value: str) -> str:
    """Decrypts legacy `enc-v1` payloads so persisted keys migrate without breakage."""
    encoded = value.split(":", 1)[1]
    return _decrypt_provider_api_key_legacy_payload(encoded)
def _decrypt_provider_api_key(value: str) -> str:
    """Decrypts provider API key ciphertext while rejecting tampered payloads.

    Values without a recognized ciphertext prefix are returned unchanged so legacy
    plaintext settings keep working.
    """
    stripped = value.strip()
    if not stripped:
        return ""
    current_prefix = f"{PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:"
    legacy_prefix = f"{PROVIDER_API_KEY_LEGACY_CIPHERTEXT_PREFIX}:"
    if stripped.startswith(legacy_prefix):
        return _decrypt_provider_api_key_legacy(stripped)
    if not stripped.startswith(current_prefix):
        return stripped
    token = stripped.split(":", 1)[1].strip()
    if not token:
        raise AppSettingsValidationError("Provider API key ciphertext is missing payload bytes")
    if Fernet is None:
        # Without cryptography installed, only the legacy payload format is readable.
        return _decrypt_provider_api_key_legacy_payload(token)
    try:
        plaintext = _provider_key_fernet(_derive_provider_api_key_key()).decrypt(token.encode("ascii"))
    except (InvalidToken, ValueError, UnicodeEncodeError) as error:
        raise AppSettingsValidationError("Provider API key ciphertext integrity check failed") from error
    try:
        return plaintext.decode("utf-8").strip()
    except UnicodeDecodeError as error:
        raise AppSettingsValidationError("Provider API key ciphertext is not valid UTF-8") from error
def _read_provider_api_key(provider_payload: dict[str, Any]) -> str:
"""Reads provider API key values from encrypted or legacy plaintext settings payloads."""
encrypted_value = provider_payload.get("api_key_encrypted")
if isinstance(encrypted_value, str) and encrypted_value.strip():
try:
return _decrypt_provider_api_key(encrypted_value)
except AppSettingsValidationError:
return ""
plaintext_value = provider_payload.get("api_key")
if plaintext_value is None:
return ""
return str(plaintext_value).strip()
def _default_settings() -> dict[str, Any]: def _default_settings() -> dict[str, Any]:
"""Builds default settings including providers and model task bindings.""" """Builds default settings including providers and model task bindings."""
@@ -243,8 +472,17 @@ def _normalize_provider(
if provider_type != "openai_compatible": if provider_type != "openai_compatible":
provider_type = "openai_compatible" provider_type = "openai_compatible"
api_key_value = payload.get("api_key", fallback_values.get("api_key", defaults["api_key"])) payload_api_key = _read_provider_api_key(payload)
api_key = str(api_key_value).strip() if api_key_value is not None else "" fallback_api_key = _read_provider_api_key(fallback_values)
default_api_key = _read_provider_api_key(defaults)
if "api_key" in payload and payload.get("api_key") is not None:
api_key = str(payload.get("api_key")).strip()
elif payload_api_key:
api_key = payload_api_key
elif fallback_api_key:
api_key = fallback_api_key
else:
api_key = default_api_key
raw_base_url = str(payload.get("base_url", fallback_values.get("base_url", defaults["base_url"]))).strip() raw_base_url = str(payload.get("base_url", fallback_values.get("base_url", defaults["base_url"]))).strip()
if not raw_base_url: if not raw_base_url:
@@ -266,6 +504,7 @@ def _normalize_provider(
) )
), ),
"api_key": api_key, "api_key": api_key,
"api_key_encrypted": _encrypt_provider_api_key(api_key),
} }
@@ -653,6 +892,26 @@ def _sanitize_settings(payload: dict[str, Any]) -> dict[str, Any]:
} }
def _serialize_settings_for_storage(payload: dict[str, Any]) -> dict[str, Any]:
"""Converts sanitized runtime payload into storage-safe form without plaintext provider keys."""
storage_payload = dict(payload)
providers_storage: list[dict[str, Any]] = []
for provider in payload.get("providers", []):
if not isinstance(provider, dict):
continue
provider_storage = dict(provider)
plaintext_api_key = str(provider_storage.pop("api_key", "")).strip()
encrypted_api_key = str(provider_storage.get("api_key_encrypted", "")).strip()
if plaintext_api_key:
encrypted_api_key = _encrypt_provider_api_key(plaintext_api_key)
provider_storage["api_key_encrypted"] = encrypted_api_key
providers_storage.append(provider_storage)
storage_payload["providers"] = providers_storage
return storage_payload
def ensure_app_settings() -> None: def ensure_app_settings() -> None:
"""Creates a settings file with defaults when no persisted settings are present.""" """Creates a settings file with defaults when no persisted settings are present."""
@@ -662,7 +921,7 @@ def ensure_app_settings() -> None:
return return
defaults = _sanitize_settings(_default_settings()) defaults = _sanitize_settings(_default_settings())
path.write_text(json.dumps(defaults, indent=2), encoding="utf-8") _write_private_text_file(path, json.dumps(_serialize_settings_for_storage(defaults), indent=2))
def _read_raw_settings() -> dict[str, Any]: def _read_raw_settings() -> dict[str, Any]:
@@ -682,7 +941,8 @@ def _write_settings(payload: dict[str, Any]) -> None:
path = _settings_path() path = _settings_path()
path.parent.mkdir(parents=True, exist_ok=True) path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload, indent=2), encoding="utf-8") storage_payload = _serialize_settings_for_storage(payload)
_write_private_text_file(path, json.dumps(storage_payload, indent=2))
def read_app_settings() -> dict[str, Any]: def read_app_settings() -> dict[str, Any]:
@@ -879,16 +1139,21 @@ def update_app_settings(
def read_handwriting_provider_settings() -> dict[str, Any]: def read_handwriting_provider_settings() -> dict[str, Any]:
"""Returns OCR settings in legacy shape for the handwriting transcription service.""" """Returns OCR settings in legacy shape with DNS-revalidated provider base URL safety checks."""
runtime = read_task_runtime_settings(TASK_OCR_HANDWRITING) runtime = read_task_runtime_settings(TASK_OCR_HANDWRITING)
provider = runtime["provider"] provider = runtime["provider"]
task = runtime["task"] task = runtime["task"]
raw_base_url = str(provider.get("base_url", settings.default_openai_base_url))
try:
normalized_base_url = normalize_and_validate_provider_base_url(raw_base_url, resolve_dns=True)
except ValueError as error:
raise AppSettingsValidationError(str(error)) from error
return { return {
"provider": provider["provider_type"], "provider": provider["provider_type"],
"enabled": bool(task.get("enabled", True)), "enabled": bool(task.get("enabled", True)),
"openai_base_url": str(provider.get("base_url", settings.default_openai_base_url)), "openai_base_url": normalized_base_url,
"openai_model": str(task.get("model", settings.default_openai_model)), "openai_model": str(task.get("model", settings.default_openai_model)),
"openai_timeout_seconds": int(provider.get("timeout_seconds", settings.default_openai_timeout_seconds)), "openai_timeout_seconds": int(provider.get("timeout_seconds", settings.default_openai_timeout_seconds)),
"openai_api_key": str(provider.get("api_key", "")), "openai_api_key": str(provider.get("api_key", "")),

View File

@@ -0,0 +1,187 @@
"""Redis-backed brute-force protections for authentication login requests."""
from __future__ import annotations
from dataclasses import dataclass
import logging
from redis.exceptions import RedisError
from app.core.config import Settings, get_settings
from app.services.authentication import normalize_username
from app.worker.queue import get_redis
logger = logging.getLogger(__name__)
# Subject kinds distinguishing username-based and IP-based throttle keys.
USERNAME_SUBJECT_KIND = "username"
IP_SUBJECT_KIND = "ip"
# Placeholder subjects used when the username or source IP cannot be determined.
UNKNOWN_USERNAME_SUBJECT = "unknown-username"
UNKNOWN_IP_SUBJECT = "unknown-ip"
@dataclass(frozen=True)
class LoginThrottlePolicy:
    """Captures login throttle policy values resolved from runtime settings."""
    # Number of failed attempts tolerated before lockouts begin.
    failure_limit: int
    # Window in seconds over which failed attempts are counted.
    failure_window_seconds: int
    # Initial lockout duration in seconds for the first over-limit failure.
    lockout_base_seconds: int
    # Upper bound in seconds for the exponentially growing lockout duration.
    lockout_max_seconds: int
@dataclass(frozen=True)
class LoginThrottleStatus:
    """Represents whether login attempts are currently throttled and retry metadata."""
    # True when an active lockout applies to the username or source IP.
    is_throttled: bool
    # Seconds the caller should wait before retrying; 0 when not throttled.
    retry_after_seconds: int = 0
def _bounded_int(value: int, *, minimum: int, maximum: int) -> int:
"""Clamps one integer value to an inclusive minimum and maximum range."""
return max(minimum, min(maximum, int(value)))
def _resolve_policy(settings: Settings) -> LoginThrottlePolicy:
    """Resolves login throttle policy from settings with defensive value bounds."""
    limit = _bounded_int(settings.auth_login_failure_limit, minimum=1, maximum=1000)
    window_seconds = _bounded_int(settings.auth_login_failure_window_seconds, minimum=30, maximum=86400)
    base_seconds = _bounded_int(settings.auth_login_lockout_base_seconds, minimum=1, maximum=3600)
    # The maximum lockout can never undercut the base lockout duration.
    max_seconds = max(
        base_seconds,
        _bounded_int(settings.auth_login_lockout_max_seconds, minimum=1, maximum=86400),
    )
    return LoginThrottlePolicy(
        failure_limit=limit,
        failure_window_seconds=window_seconds,
        lockout_base_seconds=base_seconds,
        lockout_max_seconds=max_seconds,
    )
def _normalize_login_identity(username: str, ip_address: str | None) -> tuple[str, str]:
    """Normalizes username and source IP into stable throttle-key identity values."""
    subject_username = normalize_username(username) or UNKNOWN_USERNAME_SUBJECT
    trimmed_ip = (ip_address or "").strip()[:64]
    return subject_username, trimmed_ip or UNKNOWN_IP_SUBJECT
def _identity_subjects(username: str, ip_address: str | None) -> tuple[tuple[str, str], tuple[str, str]]:
    """Builds the username and IP throttle subject tuples for one login attempt."""
    subject_username, subject_ip = _normalize_login_identity(username, ip_address)
    username_subject = (USERNAME_SUBJECT_KIND, subject_username)
    ip_subject = (IP_SUBJECT_KIND, subject_ip)
    return username_subject, ip_subject
def _failure_key(*, subject_kind: str, subject_value: str) -> str:
"""Builds the Redis key used to track failed login counts for one subject."""
return f"dcm:auth-login:fail:{subject_kind}:{subject_value}"
def _lock_key(*, subject_kind: str, subject_value: str) -> str:
"""Builds the Redis key used to store active lockout state for one subject."""
return f"dcm:auth-login:lock:{subject_kind}:{subject_value}"
def _next_lockout_seconds(*, failure_count: int, policy: LoginThrottlePolicy) -> int:
"""Computes exponential lockout duration when failed attempts exceed configured limit."""
if failure_count <= policy.failure_limit:
return 0
additional_failures = failure_count - policy.failure_limit - 1
lockout_seconds = policy.lockout_base_seconds
while additional_failures > 0 and lockout_seconds < policy.lockout_max_seconds:
lockout_seconds = min(policy.lockout_max_seconds, lockout_seconds * 2)
additional_failures -= 1
return lockout_seconds
def check_login_throttle(*, username: str, ip_address: str | None) -> LoginThrottleStatus:
    """Returns the active throttle status for the username and source-IP identity pair.

    Raises:
        RuntimeError: When the Redis backend is unreachable.
    """
    redis_client = get_redis()
    retry_after_seconds = 0
    try:
        for subject_kind, subject_value in _identity_subjects(username, ip_address):
            lock_key = _lock_key(subject_kind=subject_kind, subject_value=subject_value)
            ttl_seconds = int(redis_client.ttl(lock_key))
            if ttl_seconds == -1:
                # A lock without expiry still counts as throttled; report minimal retry.
                ttl_seconds = 1
            retry_after_seconds = max(retry_after_seconds, max(0, ttl_seconds))
    except RedisError as error:
        raise RuntimeError("Login throttle backend unavailable") from error
    return LoginThrottleStatus(
        is_throttled=retry_after_seconds > 0,
        retry_after_seconds=retry_after_seconds,
    )
def record_failed_login_attempt(*, username: str, ip_address: str | None) -> int:
    """Records one failed login attempt and returns the active lockout seconds, if any.

    Failure counters and lockout markers are maintained per username and per source
    IP; the longest resulting lockout is returned and logged.

    Raises:
        RuntimeError: When the Redis backend is unreachable.
    """
    policy = _resolve_policy(get_settings())
    subject_username, subject_ip = _normalize_login_identity(username, ip_address)
    redis_client = get_redis()
    max_failures = 0
    max_lockout_seconds = 0
    subjects = (
        (USERNAME_SUBJECT_KIND, subject_username),
        (IP_SUBJECT_KIND, subject_ip),
    )
    try:
        for subject_kind, subject_value in subjects:
            failure_key = _failure_key(subject_kind=subject_kind, subject_value=subject_value)
            # Increment and refresh expiry atomically so the counter always decays.
            pipeline = redis_client.pipeline(transaction=True)
            pipeline.incr(failure_key, 1)
            pipeline.expire(failure_key, policy.failure_window_seconds + 5)
            failure_count = int(pipeline.execute()[0])
            max_failures = max(max_failures, failure_count)
            lockout_seconds = _next_lockout_seconds(failure_count=failure_count, policy=policy)
            if lockout_seconds > 0:
                lock_key = _lock_key(subject_kind=subject_kind, subject_value=subject_value)
                redis_client.set(lock_key, "1", ex=lockout_seconds)
                max_lockout_seconds = max(max_lockout_seconds, lockout_seconds)
    except RedisError as error:
        raise RuntimeError("Login throttle backend unavailable") from error
    logger.warning(
        "Authentication login failure: username=%s ip=%s failed_attempts=%s lockout_seconds=%s",
        subject_username,
        subject_ip,
        max_failures,
        max_lockout_seconds,
    )
    return max_lockout_seconds
def clear_login_throttle(*, username: str, ip_address: str | None) -> None:
    """Clears username and source-IP throttle state after a successful authentication.

    Raises:
        RuntimeError: When the Redis backend is unreachable.
    """
    subject_username, subject_ip = _normalize_login_identity(username, ip_address)
    redis_client = get_redis()
    stale_keys: list[str] = []
    for subject_kind, subject_value in (
        (USERNAME_SUBJECT_KIND, subject_username),
        (IP_SUBJECT_KIND, subject_ip),
    ):
        stale_keys.append(_failure_key(subject_kind=subject_kind, subject_value=subject_value))
        stale_keys.append(_lock_key(subject_kind=subject_kind, subject_value=subject_value))
    try:
        redis_client.delete(*stale_keys)
    except RedisError as error:
        raise RuntimeError("Login throttle backend unavailable") from error

View File

@@ -0,0 +1,289 @@
"""Authentication services for user credential validation and session issuance."""
import base64
import binascii
from dataclasses import dataclass
from datetime import UTC, datetime, timedelta
import hashlib
import hmac
import secrets
import uuid
from sqlalchemy import delete, select
from sqlalchemy.orm import Session
from app.core.config import Settings, get_settings
from app.db.base import SessionLocal
from app.models.auth import AppUser, AuthSession, UserRole
# Identifier embedded in stored password hashes to select the KDF scheme.
PASSWORD_HASH_SCHEME = "pbkdf2_sha256"
# Last-resort pepper source when neither session pepper nor settings key is configured.
DEFAULT_AUTH_FALLBACK_SECRET = "dcm-session-secret"
@dataclass(frozen=True)
class IssuedSession:
    """Represents one newly issued bearer session token and expiration timestamp."""
    # Plaintext bearer token handed to the client; only its HMAC hash is persisted.
    token: str
    # Absolute expiry timestamp of the issued session.
    expires_at: datetime
def normalize_username(username: str) -> str:
    """Normalizes usernames to a stable lowercase identity key."""
    stripped = username.strip()
    return stripped.lower()
def _urlsafe_b64encode_no_padding(data: bytes) -> str:
"""Encodes bytes to compact URL-safe base64 without padding."""
return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=")
def _urlsafe_b64decode_no_padding(data: str) -> bytes:
"""Decodes URL-safe base64 values that may omit trailing padding characters."""
padded = data + "=" * (-len(data) % 4)
return base64.urlsafe_b64decode(padded.encode("ascii"))
def _password_iterations(settings: Settings) -> int:
"""Returns PBKDF2 iteration count clamped to a secure operational range."""
return max(200_000, min(1_200_000, int(settings.auth_password_pbkdf2_iterations)))
def hash_password(password: str, settings: Settings | None = None) -> str:
    """Derives and formats a PBKDF2-SHA256 password hash for persisted user credentials.

    Raises:
        ValueError: When the password is empty after stripping surrounding whitespace.
    """
    active_settings = settings if settings is not None else get_settings()
    candidate = password.strip()
    if not candidate:
        raise ValueError("Password must not be empty")
    iterations = _password_iterations(active_settings)
    salt = secrets.token_bytes(16)
    digest = hashlib.pbkdf2_hmac(
        "sha256",
        candidate.encode("utf-8"),
        salt,
        iterations,
        dklen=32,
    )
    encoded_salt = _urlsafe_b64encode_no_padding(salt)
    encoded_digest = _urlsafe_b64encode_no_padding(digest)
    return f"{PASSWORD_HASH_SCHEME}${iterations}${encoded_salt}${encoded_digest}"
def verify_password(password: str, stored_hash: str, settings: Settings | None = None) -> bool:
    """Verifies one plaintext password against persisted PBKDF2-SHA256 hash material.

    Returns False for malformed hashes, wrong passwords, or hashes whose iteration
    count is below the currently configured (clamped) minimum.
    """
    active_settings = settings if settings is not None else get_settings()
    candidate = password.strip()
    if not candidate:
        return False
    fields = stored_hash.strip().split("$")
    if len(fields) != 4:
        return False
    scheme, iterations_field, salt_field, digest_field = fields
    if scheme != PASSWORD_HASH_SCHEME:
        return False
    try:
        iterations = int(iterations_field)
    except ValueError:
        return False
    if not 200_000 <= iterations <= 2_000_000:
        return False
    try:
        salt = _urlsafe_b64decode_no_padding(salt_field)
        expected_digest = _urlsafe_b64decode_no_padding(digest_field)
    except (binascii.Error, ValueError):
        return False
    candidate_digest = hashlib.pbkdf2_hmac(
        "sha256",
        candidate.encode("utf-8"),
        salt,
        iterations,
        dklen=len(expected_digest),
    )
    # Constant-time comparison avoids leaking digest prefix matches via timing.
    if not hmac.compare_digest(expected_digest, candidate_digest):
        return False
    return iterations >= _password_iterations(active_settings)
def _auth_session_secret(settings: Settings) -> bytes:
"""Resolves a stable secret used to hash issued bearer session tokens."""
candidate = settings.auth_session_pepper.strip() or settings.app_settings_encryption_key.strip()
if not candidate:
candidate = DEFAULT_AUTH_FALLBACK_SECRET
return hashlib.sha256(candidate.encode("utf-8")).digest()
def _hash_session_token(token: str, settings: Settings | None = None) -> str:
    """Derives a deterministic HMAC-SHA256 token hash keyed by secret pepper material."""
    active_settings = settings if settings is not None else get_settings()
    pepper = _auth_session_secret(active_settings)
    return hmac.new(pepper, token.encode("utf-8"), hashlib.sha256).hexdigest()
def _new_session_token(settings: Settings) -> str:
"""Creates a random URL-safe bearer token for one API session."""
token_bytes = max(24, min(128, int(settings.auth_session_token_bytes)))
return secrets.token_urlsafe(token_bytes)
def _resolve_optional_user_credentials(username: str, password: str) -> tuple[str, str] | None:
    """Returns optional user credentials only when both username and password are configured.

    Raises:
        ValueError: When exactly one of username/password is provided.
    """
    resolved_username = normalize_username(username)
    resolved_password = password.strip()
    if resolved_username and resolved_password:
        return resolved_username, resolved_password
    if resolved_username or resolved_password:
        raise ValueError("Optional bootstrap user requires both username and password")
    return None
def _upsert_bootstrap_user(session: Session, *, username: str, password: str, role: UserRole) -> AppUser:
    """Creates or updates one bootstrap account with deterministic role assignment."""
    password_hash = hash_password(password)
    user = session.execute(select(AppUser).where(AppUser.username == username)).scalar_one_or_none()
    if user is not None:
        # Refresh credentials and reactivate the account on every bootstrap pass.
        user.password_hash = password_hash
        user.role = role
        user.is_active = True
        return user
    created = AppUser(
        username=username,
        password_hash=password_hash,
        role=role,
        is_active=True,
    )
    session.add(created)
    return created
def ensure_bootstrap_users() -> None:
    """Creates or refreshes bootstrap user accounts from runtime environment credentials.

    Raises:
        RuntimeError: When admin credentials are missing or the optional user
            shares the admin username.
        ValueError: When only one of the optional user credential values is set.
    """
    settings = get_settings()
    admin_username = normalize_username(settings.auth_bootstrap_admin_username)
    if not admin_username:
        raise RuntimeError("AUTH_BOOTSTRAP_ADMIN_USERNAME must not be empty")
    admin_password = settings.auth_bootstrap_admin_password.strip()
    if not admin_password:
        raise RuntimeError("AUTH_BOOTSTRAP_ADMIN_PASSWORD must not be empty")
    optional_credentials = _resolve_optional_user_credentials(
        username=settings.auth_bootstrap_user_username,
        password=settings.auth_bootstrap_user_password,
    )
    with SessionLocal() as session:
        _upsert_bootstrap_user(
            session,
            username=admin_username,
            password=admin_password,
            role=UserRole.ADMIN,
        )
        if optional_credentials is not None:
            user_username, user_password = optional_credentials
            if user_username == admin_username:
                raise RuntimeError("AUTH_BOOTSTRAP_USER_USERNAME must differ from admin username")
            _upsert_bootstrap_user(
                session,
                username=user_username,
                password=user_password,
                role=UserRole.USER,
            )
        session.commit()
def authenticate_user(session: Session, *, username: str, password: str) -> AppUser | None:
    """Authenticates one username/password pair and returns the active account on success."""
    identity = normalize_username(username)
    if not identity:
        return None
    account = session.execute(select(AppUser).where(AppUser.username == identity)).scalar_one_or_none()
    if account is None or not account.is_active:
        return None
    return account if verify_password(password, account.password_hash) else None
def issue_user_session(
    session: Session,
    *,
    user: AppUser,
    user_agent: str | None = None,
    ip_address: str | None = None,
) -> IssuedSession:
    """Issues one new bearer token session for a validated user account.

    The user's already-expired sessions are pruned before the new row is added;
    the plaintext token is returned to the caller while only its hash is stored.
    """
    settings = get_settings()
    now = datetime.now(UTC)
    ttl_minutes = min(7 * 24 * 60, max(5, int(settings.auth_session_ttl_minutes)))
    expires_at = now + timedelta(minutes=ttl_minutes)
    token = _new_session_token(settings)
    # Prune this user's expired sessions to keep the table bounded.
    session.execute(
        delete(AuthSession).where(
            AuthSession.user_id == user.id,
            AuthSession.expires_at <= now,
        )
    )
    session.add(
        AuthSession(
            user_id=user.id,
            token_hash=_hash_session_token(token, settings),
            expires_at=expires_at,
            user_agent=(user_agent or "").strip()[:512] or None,
            ip_address=(ip_address or "").strip()[:64] or None,
        )
    )
    return IssuedSession(token=token, expires_at=expires_at)
def resolve_auth_session(session: Session, *, token: str) -> AuthSession | None:
    """Resolves one non-revoked, non-expired session from a bearer token value."""
    candidate = token.strip()
    if not candidate:
        return None
    now = datetime.now(UTC)
    match = session.execute(
        select(AuthSession).where(
            AuthSession.token_hash == _hash_session_token(candidate),
            AuthSession.revoked_at.is_(None),
            AuthSession.expires_at > now,
        )
    ).scalar_one_or_none()
    # A session whose owning user is missing or deactivated is treated as invalid.
    if match is None or match.user is None or not match.user.is_active:
        return None
    return match
def revoke_auth_session(session: Session, *, session_id: uuid.UUID) -> bool:
    """Revokes one active session by identifier; returns whether a change was applied."""
    target = session.execute(select(AuthSession).where(AuthSession.id == session_id)).scalar_one_or_none()
    if target is None or target.revoked_at is not None:
        return False
    target.revoked_at = datetime.now(UTC)
    return True

View File

@@ -299,17 +299,24 @@ def extract_text_content(filename: str, data: bytes, mime_type: str) -> Extracti
) )
def extract_archive_members(data: bytes, depth: int = 0) -> list[ArchiveMember]: def extract_archive_members(data: bytes, depth: int = 0, max_members: int | None = None) -> list[ArchiveMember]:
"""Extracts processable ZIP members within configured decompression safety budgets.""" """Extracts processable ZIP members with depth-aware and decompression safety guardrails."""
members: list[ArchiveMember] = [] members: list[ArchiveMember] = []
if depth > settings.max_zip_depth: normalized_depth = max(0, depth)
if normalized_depth >= settings.max_zip_depth:
return members
member_limit = settings.max_zip_members
if max_members is not None:
member_limit = max(0, min(settings.max_zip_members, int(max_members)))
if member_limit <= 0:
return members return members
total_uncompressed_bytes = 0 total_uncompressed_bytes = 0
try: try:
with zipfile.ZipFile(io.BytesIO(data)) as archive: with zipfile.ZipFile(io.BytesIO(data)) as archive:
infos = [info for info in archive.infolist() if not info.is_dir()][: settings.max_zip_members] infos = [info for info in archive.infolist() if not info.is_dir()][:member_limit]
for info in infos: for info in infos:
if info.file_size <= 0: if info.file_size <= 0:
continue continue

View File

@@ -10,6 +10,7 @@ from typing import Any
from openai import APIConnectionError, APIError, APITimeoutError, OpenAI from openai import APIConnectionError, APIError, APITimeoutError, OpenAI
from PIL import Image, ImageOps from PIL import Image, ImageOps
from app.core.config import normalize_and_validate_provider_base_url
from app.services.app_settings import DEFAULT_OCR_PROMPT, read_handwriting_provider_settings from app.services.app_settings import DEFAULT_OCR_PROMPT, read_handwriting_provider_settings
MAX_IMAGE_SIDE = 2000 MAX_IMAGE_SIDE = 2000
@@ -151,12 +152,17 @@ def _normalize_image_bytes(image_data: bytes) -> tuple[bytes, str]:
def _create_client(provider_settings: dict[str, Any]) -> OpenAI: def _create_client(provider_settings: dict[str, Any]) -> OpenAI:
"""Creates an OpenAI client configured for compatible endpoints and timeouts.""" """Creates an OpenAI client configured with DNS-revalidated endpoint and request timeout controls."""
api_key = str(provider_settings.get("openai_api_key", "")).strip() or "no-key-required" api_key = str(provider_settings.get("openai_api_key", "")).strip() or "no-key-required"
raw_base_url = str(provider_settings.get("openai_base_url", "")).strip()
try:
normalized_base_url = normalize_and_validate_provider_base_url(raw_base_url, resolve_dns=True)
except ValueError as error:
raise HandwritingTranscriptionError(f"invalid_provider_base_url:{error}") from error
return OpenAI( return OpenAI(
api_key=api_key, api_key=api_key,
base_url=str(provider_settings["openai_base_url"]), base_url=normalized_base_url,
timeout=int(provider_settings["openai_timeout_seconds"]), timeout=int(provider_settings["openai_timeout_seconds"]),
) )

View File

@@ -6,10 +6,13 @@ from uuid import UUID
from sqlalchemy import delete, func, select from sqlalchemy import delete, func, select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.models.document import Document from app.models.document import Document
from app.models.processing_log import ProcessingLogEntry from app.models.processing_log import ProcessingLogEntry
settings = get_settings()
MAX_STAGE_LENGTH = 64 MAX_STAGE_LENGTH = 64
MAX_EVENT_LENGTH = 256 MAX_EVENT_LENGTH = 256
MAX_LEVEL_LENGTH = 16 MAX_LEVEL_LENGTH = 16
@@ -37,9 +40,49 @@ def _trim(value: str | None, max_length: int) -> str | None:
def _safe_payload(payload_json: dict[str, Any] | None) -> dict[str, Any]: def _safe_payload(payload_json: dict[str, Any] | None) -> dict[str, Any]:
"""Ensures payload values are persisted as dictionaries.""" """Normalizes payload persistence mode using metadata-only defaults for sensitive content."""
return payload_json if isinstance(payload_json, dict) else {} if not isinstance(payload_json, dict):
return {}
if settings.processing_log_store_payload_text:
return payload_json
return _metadata_only_payload(payload_json)
def _metadata_only_payload(payload_json: dict[str, Any]) -> dict[str, Any]:
"""Converts payload content into metadata descriptors without persisting raw text values."""
metadata: dict[str, Any] = {}
for index, (raw_key, raw_value) in enumerate(payload_json.items()):
if index >= 80:
break
key = str(raw_key)
metadata[key] = _metadata_only_payload_value(raw_value)
return metadata
def _metadata_only_payload_value(value: Any) -> Any:
"""Converts one payload value into non-sensitive metadata representation."""
if isinstance(value, dict):
return _metadata_only_payload(value)
if isinstance(value, (list, tuple)):
items = list(value)
return {
"item_count": len(items),
"items_preview": [_metadata_only_payload_value(item) for item in items[:20]],
}
if isinstance(value, str):
normalized = value.strip()
return {
"text_chars": len(normalized),
"text_omitted": bool(normalized),
}
if isinstance(value, bytes):
return {"binary_bytes": len(value)}
if isinstance(value, (int, float, bool)) or value is None:
return value
return {"value_type": type(value).__name__}
def set_processing_log_autocommit(session: Session, enabled: bool) -> None: def set_processing_log_autocommit(session: Session, enabled: bool) -> None:
@@ -82,8 +125,8 @@ def log_processing_event(
document_filename=_trim(resolved_document_filename, MAX_DOCUMENT_FILENAME_LENGTH), document_filename=_trim(resolved_document_filename, MAX_DOCUMENT_FILENAME_LENGTH),
provider_id=_trim(provider_id, MAX_PROVIDER_LENGTH), provider_id=_trim(provider_id, MAX_PROVIDER_LENGTH),
model_name=_trim(model_name, MAX_MODEL_LENGTH), model_name=_trim(model_name, MAX_MODEL_LENGTH),
prompt_text=_trim(prompt_text, MAX_PROMPT_LENGTH), prompt_text=_trim(prompt_text, MAX_PROMPT_LENGTH) if settings.processing_log_store_model_io_text else None,
response_text=_trim(response_text, MAX_RESPONSE_LENGTH), response_text=_trim(response_text, MAX_RESPONSE_LENGTH) if settings.processing_log_store_model_io_text else None,
payload_json=_safe_payload(payload_json), payload_json=_safe_payload(payload_json),
) )
session.add(entry) session.add(entry)

View File

@@ -0,0 +1,42 @@
"""Redis-backed fixed-window rate limiter helpers for sensitive API operations."""
import time
from redis.exceptions import RedisError
from app.worker.queue import get_redis
def _rate_limit_key(*, scope: str, subject: str, window_id: int) -> str:
"""Builds a stable Redis key for one scope, subject, and fixed time window."""
return f"dcm:rate-limit:{scope}:{subject}:{window_id}"
def increment_rate_limit(
    *,
    scope: str,
    subject: str,
    limit: int,
    window_seconds: int = 60,
) -> tuple[int, int]:
    """Increments one rate bucket and returns current count with configured limit.

    A non-positive ``limit`` disables limiting and yields ``(0, 0)`` without
    touching Redis. The key expiry slightly exceeds the window length so
    counters from finished windows disappear on their own.

    Raises:
        RuntimeError: When the Redis backend is unreachable.
    """
    effective_limit = max(0, int(limit))
    if effective_limit == 0:
        return (0, 0)
    window_length = max(1, int(window_seconds))
    window_id = int(time.time() // window_length)
    bucket_key = _rate_limit_key(scope=scope, subject=subject, window_id=window_id)
    client = get_redis()
    try:
        pipeline = client.pipeline(transaction=True)
        pipeline.incr(bucket_key, 1)
        pipeline.expire(bucket_key, window_length + 5)
        results = pipeline.execute()
    except RedisError as error:
        raise RuntimeError("Rate limiter backend unavailable") from error
    return (int(results[0]), effective_limit)

View File

@@ -3,16 +3,17 @@
from redis import Redis from redis import Redis
from rq import Queue from rq import Queue
from app.core.config import get_settings from app.core.config import get_settings, validate_redis_url_security
settings = get_settings() settings = get_settings()
def get_redis() -> Redis: def get_redis() -> Redis:
"""Creates a Redis connection from configured URL.""" """Creates a Redis connection after enforcing URL security policy checks."""
return Redis.from_url(settings.redis_url) secure_redis_url = validate_redis_url_security(settings.redis_url)
return Redis.from_url(secure_redis_url)
def get_processing_queue() -> Queue: def get_processing_queue() -> Queue:

View File

@@ -0,0 +1,26 @@
"""Worker entrypoint that enforces Redis URL security checks before queue consumption."""
from redis import Redis
from rq import Worker
from app.core.config import get_settings, validate_redis_url_security
def _build_worker_connection() -> Redis:
    """Builds validated Redis connection used by RQ worker runtime."""
    app_settings = get_settings()
    validated_url = validate_redis_url_security(app_settings.redis_url)
    return Redis.from_url(validated_url)
def run_worker(queue_names: list[str] | None = None) -> None:
    """Runs the RQ worker loop for the configured DCM processing queue(s).

    Args:
        queue_names: Queue names to consume; defaults to the standard
            ``"dcm"`` queue when omitted, preserving prior behavior.
    """
    connection = _build_worker_connection()
    # Default kept inside the body so callers passing None get the same queue
    # set as the original hard-coded implementation.
    worker = Worker(queue_names or ["dcm"], connection=connection)
    worker.work()


if __name__ == "__main__":
    run_worker()

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.core.config import get_settings
from app.db.base import SessionLocal from app.db.base import SessionLocal
from app.models.document import Document, DocumentStatus from app.models.document import Document, DocumentStatus
from app.services.app_settings import ( from app.services.app_settings import (
@@ -37,6 +38,13 @@ from app.services.storage import absolute_path, compute_sha256, store_bytes, wri
from app.worker.queue import get_processing_queue from app.worker.queue import get_processing_queue
settings = get_settings()
ARCHIVE_ROOT_ID_METADATA_KEY = "archive_root_document_id"
ARCHIVE_DEPTH_METADATA_KEY = "archive_depth"
ARCHIVE_DESCENDANT_COUNT_METADATA_KEY = "archive_descendant_count"
def _cleanup_processing_logs_with_settings(session: Session) -> None: def _cleanup_processing_logs_with_settings(session: Session) -> None:
"""Applies configured processing log retention while trimming old log entries.""" """Applies configured processing log retention while trimming old log entries."""
@@ -48,13 +56,80 @@ def _cleanup_processing_logs_with_settings(session: Session) -> None:
) )
def _metadata_non_negative_int(value: object, fallback: int = 0) -> int:
"""Parses metadata values as non-negative integers with safe fallback behavior."""
try:
parsed = int(value)
except (TypeError, ValueError):
return fallback
return max(0, parsed)
def _metadata_uuid(value: object) -> uuid.UUID | None:
"""Parses metadata values as UUIDs while tolerating malformed legacy values."""
if not isinstance(value, str) or not value.strip():
return None
try:
return uuid.UUID(value.strip())
except ValueError:
return None
def _resolve_archive_lineage(session: Session, document: Document) -> tuple[uuid.UUID, int]:
    """Resolves archive root document id and depth for metadata propagation compatibility.

    Prefers lineage already recorded in the document's metadata; otherwise walks
    parent links upward (with a cycle guard) to recover root and depth for
    legacy rows created before lineage metadata existed.

    Returns:
        Tuple of (archive root document id, depth of ``document`` below that root).
    """
    metadata_json = dict(document.metadata_json)
    metadata_root = _metadata_uuid(metadata_json.get(ARCHIVE_ROOT_ID_METADATA_KEY))
    metadata_depth = _metadata_non_negative_int(metadata_json.get(ARCHIVE_DEPTH_METADATA_KEY), fallback=0)
    if metadata_root is not None:
        # Fast path: lineage was already propagated into this document's metadata.
        return metadata_root, metadata_depth
    if not document.is_archive_member:
        # A non-member document is its own archive root at depth zero.
        return document.id, 0
    # Legacy fallback: follow parent_document_id links to the topmost ancestor.
    depth = 0
    root_document_id = document.id
    parent_document_id = document.parent_document_id
    visited: set[uuid.UUID] = {document.id}
    while parent_document_id is not None and parent_document_id not in visited:
        visited.add(parent_document_id)  # cycle guard for malformed parent chains
        parent_document = session.execute(select(Document).where(Document.id == parent_document_id)).scalar_one_or_none()
        if parent_document is None:
            # Dangling parent reference: treat the last reachable node as root.
            break
        depth += 1
        root_document_id = parent_document.id
        parent_document_id = parent_document.parent_document_id
    return root_document_id, depth
def _merge_archive_metadata(document: Document, **updates: object) -> None:
    """Applies archive metadata updates while preserving unrelated document metadata keys."""
    # Build a fresh merged dict and reassign so the ORM sees the attribute change.
    merged_metadata = {**document.metadata_json, **updates}
    document.metadata_json = merged_metadata
def _load_archive_root_for_update(session: Session, root_document_id: uuid.UUID) -> Document | None:
    """Loads archive root row with write lock to serialize descendant-count budget updates."""
    locked_root_query = (
        select(Document)
        .where(Document.id == root_document_id)
        .with_for_update()
    )
    return session.execute(locked_root_query).scalar_one_or_none()
def _create_archive_member_document( def _create_archive_member_document(
parent: Document, parent: Document,
member_name: str, member_name: str,
member_data: bytes, member_data: bytes,
mime_type: str, mime_type: str,
archive_root_document_id: uuid.UUID,
archive_depth: int,
) -> Document: ) -> Document:
"""Creates a child document entity for a file extracted from an uploaded archive.""" """Creates child document entities with lineage metadata for recursive archive processing."""
extension = Path(member_name).suffix.lower() extension = Path(member_name).suffix.lower()
stored_relative_path = store_bytes(member_name, member_data) stored_relative_path = store_bytes(member_name, member_data)
@@ -68,7 +143,13 @@ def _create_archive_member_document(
size_bytes=len(member_data), size_bytes=len(member_data),
logical_path=parent.logical_path, logical_path=parent.logical_path,
tags=list(parent.tags), tags=list(parent.tags),
metadata_json={"origin": "archive", "parent": str(parent.id)}, owner_user_id=parent.owner_user_id,
metadata_json={
"origin": "archive",
"parent": str(parent.id),
ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
},
is_archive_member=True, is_archive_member=True,
archived_member_path=member_name, archived_member_path=member_name,
parent_document_id=parent.id, parent_document_id=parent.id,
@@ -110,16 +191,46 @@ def process_document_task(document_id: str) -> None:
if document.extension == ".zip": if document.extension == ".zip":
child_ids: list[str] = [] child_ids: list[str] = []
archive_root_document_id, archive_depth = _resolve_archive_lineage(session=session, document=document)
_merge_archive_metadata(
document,
**{
ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
},
)
root_document = _load_archive_root_for_update(session=session, root_document_id=archive_root_document_id)
if root_document is None:
root_document = document
root_metadata_json = dict(root_document.metadata_json)
existing_descendant_count = _metadata_non_negative_int(
root_metadata_json.get(ARCHIVE_DESCENDANT_COUNT_METADATA_KEY),
fallback=0,
)
max_descendants_per_root = max(0, int(settings.max_zip_descendants_per_root))
remaining_descendant_budget = max(0, max_descendants_per_root - existing_descendant_count)
extraction_member_cap = remaining_descendant_budget
log_processing_event( log_processing_event(
session=session, session=session,
stage="archive", stage="archive",
event="Archive extraction started", event="Archive extraction started",
level="info", level="info",
document=document, document=document,
payload_json={"size_bytes": len(data)}, payload_json={
"size_bytes": len(data),
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth,
"remaining_descendant_budget": remaining_descendant_budget,
},
) )
try: try:
members = extract_archive_members(data) members = extract_archive_members(
data,
depth=archive_depth,
max_members=extraction_member_cap,
)
for member in members: for member in members:
mime_type = sniff_mime(member.data) mime_type = sniff_mime(member.data)
child = _create_archive_member_document( child = _create_archive_member_document(
@@ -127,6 +238,8 @@ def process_document_task(document_id: str) -> None:
member_name=member.name, member_name=member.name,
member_data=member.data, member_data=member.data,
mime_type=mime_type, mime_type=mime_type,
archive_root_document_id=archive_root_document_id,
archive_depth=archive_depth + 1,
) )
session.add(child) session.add(child)
session.flush() session.flush()
@@ -142,8 +255,27 @@ def process_document_task(document_id: str) -> None:
"member_name": member.name, "member_name": member.name,
"member_size_bytes": len(member.data), "member_size_bytes": len(member.data),
"mime_type": mime_type, "mime_type": mime_type,
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth + 1,
}, },
) )
updated_root_metadata = dict(root_document.metadata_json)
updated_root_metadata[ARCHIVE_ROOT_ID_METADATA_KEY] = str(archive_root_document_id)
updated_root_metadata[ARCHIVE_DEPTH_METADATA_KEY] = 0
updated_root_metadata[ARCHIVE_DESCENDANT_COUNT_METADATA_KEY] = existing_descendant_count + len(child_ids)
root_document.metadata_json = updated_root_metadata
limit_flags: dict[str, object] = {}
if archive_depth >= settings.max_zip_depth:
limit_flags["max_depth_reached"] = True
if remaining_descendant_budget <= 0:
limit_flags["max_descendants_reached"] = True
elif len(child_ids) >= remaining_descendant_budget:
limit_flags["max_descendants_reached"] = True
if limit_flags:
_merge_archive_metadata(document, **limit_flags)
document.status = DocumentStatus.PROCESSED document.status = DocumentStatus.PROCESSED
document.extracted_text = f"archive with {len(members)} files" document.extracted_text = f"archive with {len(members)} files"
log_processing_event( log_processing_event(
@@ -152,7 +284,13 @@ def process_document_task(document_id: str) -> None:
event="Archive extraction completed", event="Archive extraction completed",
level="info", level="info",
document=document, document=document,
payload_json={"member_count": len(members)}, payload_json={
"member_count": len(members),
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth,
"descendant_count": existing_descendant_count + len(child_ids),
"remaining_descendant_budget": max(0, remaining_descendant_budget - len(child_ids)),
},
) )
except Exception as exc: except Exception as exc:
document.status = DocumentStatus.ERROR document.status = DocumentStatus.ERROR
@@ -231,7 +369,10 @@ def process_document_task(document_id: str) -> None:
event="Archive child job enqueued", event="Archive child job enqueued",
level="info", level="info",
document_id=uuid.UUID(child_id), document_id=uuid.UUID(child_id),
payload_json={"parent_document_id": str(document.id)}, payload_json={
"parent_document_id": str(document.id),
"archive_root_document_id": str(archive_root_document_id),
},
) )
session.commit() session.commit()
return return

View File

@@ -16,3 +16,4 @@ orjson==3.11.3
openai==1.107.2 openai==1.107.2
typesense==1.1.1 typesense==1.1.1
tiktoken==0.11.0 tiktoken==0.11.0
cryptography==46.0.1

View File

@@ -144,6 +144,87 @@ class AppSettingsProviderResilienceTests(unittest.TestCase):
app_settings.update_app_settings(providers=[provider_update]) app_settings.update_app_settings(providers=[provider_update])
write_settings_mock.assert_not_called() write_settings_mock.assert_not_called()
    def test_sanitize_settings_migrates_legacy_plaintext_api_key_to_encrypted_field(self) -> None:
        """Legacy plaintext API keys are still readable and emitted with encrypted storage representation."""
        # Legacy payload shape: the provider carries only a plaintext "api_key".
        payload = {
            "providers": [
                {
                    "id": "secure-provider",
                    "label": "Secure Provider",
                    "provider_type": "openai_compatible",
                    "base_url": "https://api.openai.com/v1",
                    "timeout_seconds": 45,
                    "api_key": "legacy-plaintext-secret",
                }
            ],
            "tasks": {
                app_settings.TASK_OCR_HANDWRITING: {"provider_id": "secure-provider"},
                app_settings.TASK_SUMMARY_GENERATION: {"provider_id": "secure-provider"},
                app_settings.TASK_ROUTING_CLASSIFICATION: {"provider_id": "secure-provider"},
            },
        }
        # Pin key derivation so ciphertext generation is deterministic and offline.
        with patch.object(app_settings, "_derive_provider_api_key_key", return_value=b"k" * 32):
            sanitized = app_settings._sanitize_settings(payload)
        provider = sanitized["providers"][0]
        # The plaintext key stays readable in-memory for runtime use...
        self.assertEqual(provider["api_key"], "legacy-plaintext-secret")
        # ...while the prefixed encrypted representation is added for persistence.
        self.assertTrue(
            str(provider.get("api_key_encrypted", "")).startswith(
                f"{app_settings.PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:"
            )
        )
    def test_serialize_settings_for_storage_excludes_plaintext_api_key(self) -> None:
        """Storage payload serialization persists encrypted provider API keys only."""
        payload = _sample_current_payload()
        # Simulate an in-memory payload that still carries a plaintext key.
        payload["providers"][0]["api_key"] = "storage-secret"
        payload["providers"][0]["api_key_encrypted"] = ""
        # Pin key derivation so encryption is deterministic for the test.
        with patch.object(app_settings, "_derive_provider_api_key_key", return_value=b"s" * 32):
            storage_payload = app_settings._serialize_settings_for_storage(payload)
        provider_storage = storage_payload["providers"][0]
        # Plaintext must never reach storage; only the prefixed ciphertext may.
        self.assertNotIn("api_key", provider_storage)
        self.assertTrue(
            str(provider_storage.get("api_key_encrypted", "")).startswith(
                f"{app_settings.PROVIDER_API_KEY_CIPHERTEXT_PREFIX}:"
            )
        )
    def test_read_handwriting_provider_settings_revalidates_dns(self) -> None:
        """OCR runtime provider settings enforce DNS revalidation before creating outbound clients."""
        # Minimal runtime payload describing one enabled OCR provider/task pair.
        runtime_payload = {
            "provider": {
                "id": "openai-default",
                "provider_type": "openai_compatible",
                "base_url": "https://api.openai.com/v1",
                "timeout_seconds": 45,
                "api_key": "runtime-secret",
            },
            "task": {
                "enabled": True,
                "model": "gpt-4.1-mini",
                "prompt": "prompt",
            },
        }
        with (
            patch.object(app_settings, "read_task_runtime_settings", return_value=runtime_payload),
            patch.object(
                app_settings,
                "normalize_and_validate_provider_base_url",
                return_value="https://api.openai.com/v1",
            ) as normalize_mock,
        ):
            runtime_settings = app_settings.read_handwriting_provider_settings()
        # resolve_dns=True is the SSRF guard: the URL must be re-resolved at use time.
        normalize_mock.assert_called_once_with("https://api.openai.com/v1", resolve_dns=True)
        self.assertEqual(runtime_settings["openai_base_url"], "https://api.openai.com/v1")
        self.assertEqual(runtime_settings["openai_api_key"], "runtime-secret")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

File diff suppressed because it is too large Load Diff

View File

@@ -29,6 +29,7 @@ def _install_main_import_stubs() -> dict[str, ModuleType | None]:
"app.core.config", "app.core.config",
"app.db.base", "app.db.base",
"app.services.app_settings", "app.services.app_settings",
"app.services.authentication",
"app.services.handwriting_style", "app.services.handwriting_style",
"app.services.storage", "app.services.storage",
"app.services.typesense_index", "app.services.typesense_index",
@@ -115,6 +116,7 @@ def _install_main_import_stubs() -> dict[str, ModuleType | None]:
"""Returns minimal settings consumed by app.main during test import.""" """Returns minimal settings consumed by app.main during test import."""
return SimpleNamespace( return SimpleNamespace(
app_env="development",
cors_origins=["http://localhost:5173"], cors_origins=["http://localhost:5173"],
max_upload_request_size_bytes=1024, max_upload_request_size_bytes=1024,
) )
@@ -138,6 +140,14 @@ def _install_main_import_stubs() -> dict[str, ModuleType | None]:
app_settings_stub.ensure_app_settings = ensure_app_settings app_settings_stub.ensure_app_settings = ensure_app_settings
sys.modules["app.services.app_settings"] = app_settings_stub sys.modules["app.services.app_settings"] = app_settings_stub
authentication_stub = ModuleType("app.services.authentication")
def ensure_bootstrap_users() -> None:
"""No-op bootstrap user initializer for middleware scope tests."""
authentication_stub.ensure_bootstrap_users = ensure_bootstrap_users
sys.modules["app.services.authentication"] = authentication_stub
handwriting_style_stub = ModuleType("app.services.handwriting_style") handwriting_style_stub = ModuleType("app.services.handwriting_style")
def ensure_handwriting_style_collection() -> None: def ensure_handwriting_style_collection() -> None:

View File

@@ -6,7 +6,8 @@ This directory contains technical documentation for DMS.
- `../README.md` - project overview, setup, and quick operations - `../README.md` - project overview, setup, and quick operations
- `architecture-overview.md` - backend, frontend, and infrastructure architecture - `architecture-overview.md` - backend, frontend, and infrastructure architecture
- `api-contract.md` - API endpoint contract grouped by route module, including token auth roles, upload limits, and settings or processing-log security constraints - `api-contract.md` - API endpoint contract grouped by route module, including session auth, login throttle responses, role and ownership scope, upload limits, and settings or processing-log security constraints
- `data-model-reference.md` - database entity definitions and lifecycle states - `data-model-reference.md` - database entity definitions and lifecycle states
- `operations-and-configuration.md` - runtime operations, hardened compose defaults, security environment variables, and persisted settings configuration and read-sanitization behavior - `operations-and-configuration.md` - runtime operations, hardened compose defaults, DEV and LIVE security values, persisted settings configuration behavior, and frontend Vite host allowlist controls
- `frontend-design-foundation.md` - frontend visual system, tokens, UI implementation rules, authenticated media delivery under API token auth, processing-log timeline behavior, and settings helper-copy guidance - `frontend-design-foundation.md` - frontend visual system, tokens, UI implementation rules, authenticated media delivery under session auth, processing-log timeline behavior, and settings helper-copy guidance
- `../.env.example` - repository-level environment template with local defaults and production override guidance

View File

@@ -4,6 +4,7 @@ Base URL prefix: `/api/v1`
Primary implementation modules: Primary implementation modules:
- `backend/app/api/router.py` - `backend/app/api/router.py`
- `backend/app/api/routes_auth.py`
- `backend/app/api/routes_health.py` - `backend/app/api/routes_health.py`
- `backend/app/api/routes_documents.py` - `backend/app/api/routes_documents.py`
- `backend/app/api/routes_search.py` - `backend/app/api/routes_search.py`
@@ -12,15 +13,39 @@ Primary implementation modules:
## Authentication And Authorization ## Authentication And Authorization
- Protected endpoints require `Authorization: Bearer <token>`. - Authentication is cookie-based session auth with a server-issued hashed session token.
- `ADMIN_API_TOKEN` is required for all privileged access and acts as fail-closed root credential. - Clients authenticate with `POST /auth/login` using username and password.
- `USER_API_TOKEN` is optional and, when configured, grants access to document endpoints only. - Backend issues a server-stored session token and sets `HttpOnly` `dcm_session` and readable `dcm_csrf` cookies.
- Authorization matrix: - Login brute-force protection enforces Redis-backed throttle checks keyed by username and source IP.
- State-changing requests from browser clients must send `x-csrf-token: <dcm_csrf>` in request headers (double-submit pattern).
- For non-browser API clients, the optional `Authorization: Bearer <token>` path remains supported when the token is sent explicitly.
- `GET /auth/me` returns current identity, role, and current CSRF token.
- `POST /auth/logout` revokes current session token.
Role matrix:
- `documents/*`: `admin` or `user` - `documents/*`: `admin` or `user`
- `search/*`: `admin` or `user` - `search/*`: `admin` or `user`
- `settings/*`: `admin` only - `settings/*`: `admin` only
- `processing/logs/*`: `admin` only - `processing/logs/*`: `admin` only
Ownership rules:
- `user` role is restricted to its own documents.
- `admin` role can access all documents.
## Auth
- `POST /auth/login`
- Body model: `AuthLoginRequest`
- Response model: `AuthLoginResponse`
- Additional responses:
- `401` for invalid credentials
- `429` for throttled login attempts, with stable message and `Retry-After` header
- `503` when the login rate-limiter backend is unavailable
- `GET /auth/me`
- Response model: `AuthSessionResponse`
- `POST /auth/logout`
- Response model: `AuthLogoutResponse`
## Health ## Health
- `GET /health` - `GET /health`
@@ -29,8 +54,6 @@ Primary implementation modules:
## Documents ## Documents
- Access: admin or user token required
### Collection and metadata helpers ### Collection and metadata helpers
- `GET /documents` - `GET /documents`
@@ -39,15 +62,26 @@ Primary implementation modules:
- `GET /documents/tags` - `GET /documents/tags`
- Query: `include_trashed` - Query: `include_trashed`
- Response: `{ "tags": string[] }` - Response: `{ "tags": string[] }`
- Behavior:
- all document-assigned tags visible to caller scope are included
- predefined tags are role-filtered: `admin` receives full catalog, `user` receives only entries with `global_shared=true`
- `GET /documents/paths` - `GET /documents/paths`
- Query: `include_trashed` - Query: `include_trashed`
- Response: `{ "paths": string[] }` - Response: `{ "paths": string[] }`
- Behavior:
- all document-assigned logical paths visible to caller scope are included
- predefined paths are role-filtered: `admin` receives full catalog, `user` receives only entries with `global_shared=true`
- `GET /documents/types` - `GET /documents/types`
- Query: `include_trashed` - Query: `include_trashed`
- Response: `{ "types": string[] }` - Response: `{ "types": string[] }`
- `POST /documents/content-md/export` - `POST /documents/content-md/export`
- Body model: `ContentExportRequest` - Body model: `ContentExportRequest`
- Response: ZIP stream containing one markdown file per matched document - Response: ZIP stream containing one markdown file per matched document
- Limits:
- hard cap on matched document count (`CONTENT_EXPORT_MAX_DOCUMENTS`)
- hard cap on cumulative markdown bytes (`CONTENT_EXPORT_MAX_TOTAL_BYTES`)
- per-user rate limit (`CONTENT_EXPORT_RATE_LIMIT_PER_MINUTE`)
- Behavior: archive is streamed from spool file instead of unbounded in-memory buffer
### Per-document operations ### Per-document operations
@@ -56,7 +90,8 @@ Primary implementation modules:
- `GET /documents/{document_id}/download` - `GET /documents/{document_id}/download`
- Response: original file bytes - Response: original file bytes
- `GET /documents/{document_id}/preview` - `GET /documents/{document_id}/preview`
- Response: inline preview stream where browser-supported - Response: inline preview stream only for safe MIME types
- Behavior: script-capable MIME types are forced to attachment responses with `X-Content-Type-Options: nosniff`
- `GET /documents/{document_id}/thumbnail` - `GET /documents/{document_id}/thumbnail`
- Response: generated thumbnail image when available - Response: generated thumbnail image when available
- `GET /documents/{document_id}/content-md` - `GET /documents/{document_id}/content-md`
@@ -86,7 +121,7 @@ Primary implementation modules:
- `conflict_mode` (`ask`, `replace`, `duplicate`) - `conflict_mode` (`ask`, `replace`, `duplicate`)
- Response model: `UploadResponse` - Response model: `UploadResponse`
- Behavior: - Behavior:
- `ask`: returns `conflicts` if duplicate checksum is detected - `ask`: returns `conflicts` if duplicate checksum is detected for caller-visible documents
- `replace`: creates new document linked to replaced document id - `replace`: creates new document linked to replaced document id
- `duplicate`: creates additional document record - `duplicate`: creates additional document record
- upload `POST` request rejected with `411` when `Content-Length` is missing - upload `POST` request rejected with `411` when `Content-Length` is missing
@@ -95,16 +130,14 @@ Primary implementation modules:
## Search ## Search
- Access: admin or user token required
- `GET /search` - `GET /search`
- Query: `query` (min length 2), `offset`, `limit`, `include_trashed`, `only_trashed`, `path_filter`, `tag_filter`, `type_filter`, `processed_from`, `processed_to` - Query: `query` (min length 2), `offset`, `limit`, `include_trashed`, `only_trashed`, `path_filter`, `tag_filter`, `type_filter`, `processed_from`, `processed_to`
- Response model: `SearchResponse` - Response model: `SearchResponse`
- Behavior: PostgreSQL full-text and metadata ranking - Behavior: PostgreSQL full-text and metadata ranking with role-based ownership scope
## Processing Logs ## Processing Logs
- Access: admin token required - Access: admin only
- `GET /processing/logs` - `GET /processing/logs`
- Query: `offset`, `limit`, `document_id` - Query: `offset`, `limit`, `document_id`
@@ -119,17 +152,23 @@ Primary implementation modules:
- `POST /processing/logs/clear` - `POST /processing/logs/clear`
- Response: clear counters - Response: clear counters
Persistence mode:
- default is metadata-only logging (`PROCESSING_LOG_STORE_MODEL_IO_TEXT=false`, `PROCESSING_LOG_STORE_PAYLOAD_TEXT=false`)
- full prompt/response or payload content storage requires explicit operator opt-in
## Settings ## Settings
- Access: admin token required - Access: admin only
- `GET /settings` - `GET /settings`
- Response model: `AppSettingsResponse` - Response model: `AppSettingsResponse`
- persisted providers with invalid base URLs are ignored during read sanitization; response falls back to remaining valid providers or secure defaults - persisted providers with invalid base URLs are ignored during read sanitization; response falls back to remaining valid providers or secure defaults
- provider API keys are exposed only as `api_key_set` and `api_key_masked`
- `PATCH /settings` - `PATCH /settings`
- Body model: `AppSettingsUpdateRequest` - Body model: `AppSettingsUpdateRequest`
- Response model: `AppSettingsResponse` - Response model: `AppSettingsResponse`
- rejects invalid provider base URLs with `400` when scheme, allowlist, or network safety checks fail - rejects invalid provider base URLs with `400` when scheme, allowlist, or network safety checks fail
- provider API keys are persisted encrypted at rest (`api_key_encrypted`) and plaintext keys are not written to storage
- `POST /settings/reset` - `POST /settings/reset`
- Response model: `AppSettingsResponse` - Response model: `AppSettingsResponse`
- `PATCH /settings/handwriting` - `PATCH /settings/handwriting`
@@ -140,6 +179,13 @@ Primary implementation modules:
## Schema Families ## Schema Families
Auth schemas in `backend/app/schemas/auth.py`:
- `AuthLoginRequest`
- `AuthUserResponse`
- `AuthSessionResponse`
- `AuthLoginResponse`
- `AuthLogoutResponse`
Document schemas in `backend/app/schemas/documents.py`: Document schemas in `backend/app/schemas/documents.py`:
- `DocumentResponse` - `DocumentResponse`
- `DocumentDetailResponse` - `DocumentDetailResponse`
@@ -155,4 +201,4 @@ Processing log schemas in `backend/app/schemas/processing_logs.py`:
- `ProcessingLogListResponse` - `ProcessingLogListResponse`
Settings schemas in `backend/app/schemas/settings.py`: Settings schemas in `backend/app/schemas/settings.py`:
- Provider, task, upload-default, display, processing-log retention, predefined paths or tags, handwriting-style, and legacy handwriting models grouped under `AppSettingsResponse` and `AppSettingsUpdateRequest`. - provider, task, upload-default, display, processing-log retention, predefined paths or tags, handwriting-style, and legacy handwriting models grouped under `AppSettingsResponse` and `AppSettingsUpdateRequest`.

View File

@@ -6,9 +6,9 @@ DMS runs as a multi-service application defined in `docker-compose.yml`:
- `frontend` serves the React UI on port `5173` - `frontend` serves the React UI on port `5173`
- `api` serves FastAPI on port `8000` - `api` serves FastAPI on port `8000`
- `worker` executes asynchronous extraction and indexing jobs - `worker` executes asynchronous extraction and indexing jobs
- `db` provides PostgreSQL persistence on port `5432` - `db` provides PostgreSQL persistence on the internal compose network
- `redis` backs queueing on port `6379` - `redis` backs queueing on the internal compose network
- `typesense` stores search index and vector-adjacent metadata on port `8108` - `typesense` stores search index and vector-adjacent metadata on the internal compose network
## Backend Architecture ## Backend Architecture
@@ -16,16 +16,16 @@ Backend source root: `backend/app/`
Main boundaries: Main boundaries:
- `api/` route handlers and HTTP contract - `api/` route handlers and HTTP contract
- `services/` domain logic (storage, extraction, routing, settings, processing logs, Typesense) - `services/` domain logic (authentication, storage, extraction, routing, settings, processing logs, Typesense)
- `db/` SQLAlchemy base, engine, and session lifecycle - `db/` SQLAlchemy base, engine, and session lifecycle
- `models/` persistence entities (`Document`, `ProcessingLogEntry`) - `models/` persistence entities (`AppUser`, `AuthSession`, `Document`, `ProcessingLogEntry`)
- `schemas/` Pydantic response and request schemas - `schemas/` Pydantic response and request schemas
- `worker/` RQ queue integration and background processing tasks - `worker/` RQ queue integration and background processing tasks
Application bootstrap in `backend/app/main.py`: Application bootstrap in `backend/app/main.py`:
- mounts routers under `/api/v1` - mounts routers under `/api/v1`
- configures CORS from settings - configures CORS from settings
- initializes storage, settings, database schema, and Typesense collection on startup - initializes storage, database schema, bootstrap users, settings, and Typesense collection on startup
## Processing Lifecycle ## Processing Lifecycle
@@ -48,11 +48,12 @@ Core structure:
- `design-foundation.css` and `styles.css` define design tokens and global/component styling - `design-foundation.css` and `styles.css` define design tokens and global/component styling
Main user flows: Main user flows:
- Login and role-gated navigation (`admin` and `user`)
- Upload and conflict resolution - Upload and conflict resolution
- Search and filtered document browsing - Search and filtered document browsing
- Metadata editing and lifecycle actions (trash, restore, delete, reprocess) - Metadata editing and lifecycle actions (trash, restore, delete, reprocess)
- Settings management for providers, tasks, and UI defaults - Settings management for providers, tasks, and UI defaults (admin only)
- Processing log review - Processing log review (admin only)
## Persistence and State ## Persistence and State
@@ -64,3 +65,13 @@ Persistent data:
Transient runtime state: Transient runtime state:
- Redis queues processing tasks and worker execution state - Redis queues processing tasks and worker execution state
- frontend local component state drives active filters, selection, and modal flows - frontend local component state drives active filters, selection, and modal flows
Security-sensitive runtime behavior:
- API access is session-based with per-user server-issued bearer tokens and role checks.
- Document and search reads for `user` role are owner-scoped via `owner_user_id`; `admin` can access global scope.
- Redis connection URLs are validated by backend queue helpers with environment-aware auth and TLS policy enforcement.
- Worker startup runs through `python -m app.worker.run_worker`, which validates Redis URL policy before queue consumption.
- Inline preview is limited to safe MIME types and script-capable content is served as attachment-only.
- Archive fan-out processing propagates root and depth lineage metadata and enforces depth and per-root descendant caps.
- Markdown export applies per-user rate limits, hard document-count and total-byte caps, and spool-file streaming.
- Processing logs default to metadata-only persistence, with explicit operator toggles required to store model IO text.

View File

@@ -2,6 +2,38 @@
Primary SQLAlchemy models are defined in `backend/app/models/`. Primary SQLAlchemy models are defined in `backend/app/models/`.
## app_users
Model: `AppUser` in `backend/app/models/auth.py`
Purpose:
- Stores authenticatable user identities for session-based API access.
Core fields:
- Identity and credentials: `id`, `username`, `password_hash`
- Authorization and lifecycle: `role`, `is_active`
- Audit timestamps: `created_at`, `updated_at`
Enum `UserRole`:
- `admin`
- `user`
## auth_sessions
Model: `AuthSession` in `backend/app/models/auth.py`
Purpose:
- Stores issued bearer sessions linked to user identities.
Core fields:
- Identity and linkage: `id`, `user_id`, `token_hash`
- Session lifecycle: `expires_at`, `revoked_at`
- Request context: `user_agent`, `ip_address`
- Audit timestamps: `created_at`, `updated_at`
Foreign keys:
- `user_id` references `app_users.id` with `ON DELETE CASCADE`.
## documents ## documents
Model: `Document` in `backend/app/models/document.py` Model: `Document` in `backend/app/models/document.py`
@@ -12,7 +44,7 @@ Purpose:
Core fields: Core fields:
- Identity and source: `id`, `original_filename`, `source_relative_path`, `stored_relative_path` - Identity and source: `id`, `original_filename`, `source_relative_path`, `stored_relative_path`
- File attributes: `mime_type`, `extension`, `sha256`, `size_bytes` - File attributes: `mime_type`, `extension`, `sha256`, `size_bytes`
- Organization: `logical_path`, `suggested_path`, `tags`, `suggested_tags` - Ownership and organization: `owner_user_id`, `logical_path`, `suggested_path`, `tags`, `suggested_tags`
- Processing outputs: `extracted_text`, `image_text_type`, `handwriting_style_id`, `preview_available` - Processing outputs: `extracted_text`, `image_text_type`, `handwriting_style_id`, `preview_available`
- Lifecycle and relations: `status`, `is_archive_member`, `archived_member_path`, `parent_document_id`, `replaces_document_id` - Lifecycle and relations: `status`, `is_archive_member`, `archived_member_path`, `parent_document_id`, `replaces_document_id`
- Metadata and timestamps: `metadata_json`, `created_at`, `processed_at`, `updated_at` - Metadata and timestamps: `metadata_json`, `created_at`, `processed_at`, `updated_at`
@@ -24,8 +56,12 @@ Enum `DocumentStatus`:
- `error` - `error`
- `trashed` - `trashed`
Foreign keys:
- `owner_user_id` references `app_users.id` with `ON DELETE SET NULL`.
Relationships: Relationships:
- Self-referential `parent_document` relationship for archive extraction trees. - Self-referential `parent_document` relationship for archive extraction trees.
- `owner_user` relationship to `AppUser`.
## processing_logs ## processing_logs
@@ -47,7 +83,10 @@ Foreign keys:
## Model Lifecycle Notes ## Model Lifecycle Notes
- Upload inserts a `Document` row in `queued` state and enqueues background processing. - API startup initializes schema and creates or refreshes bootstrap users from auth environment variables.
- Worker updates extraction results and final status (`processed`, `unsupported`, or `error`). - `POST /auth/login` validates `AppUser` credentials, creates `AuthSession` with hashed token, and returns bearer token once.
- Upload inserts `Document` row in `queued` state, assigns `owner_user_id`, and enqueues background processing.
- Worker updates extraction results and final status (`processed`, `unsupported`, or `error`), preserving ownership on archive descendants.
- User-role queries are owner-scoped; admin-role queries can access all documents.
- Trash and restore operations toggle `status` while preserving source files until permanent delete. - Trash and restore operations toggle `status` while preserving source files until permanent delete.
- Permanent delete removes the document tree (including archive descendants) and associated stored files. - Permanent delete removes the document tree (including archive descendants) and associated stored files.

View File

@@ -52,9 +52,12 @@ Do not hardcode new palette or spacing values in component styles when a token a
## Authenticated Media Delivery ## Authenticated Media Delivery
- Document previews and thumbnails must load through authenticated fetch flows in `frontend/src/lib/api.ts`, then render via temporary object URLs. - Document previews and thumbnails must load through authenticated fetch flows in `frontend/src/lib/api.ts`, then render via temporary object URLs.
- Runtime auth is cookie-backed; an active session persists across browser reloads and new tabs while the `dcm_session` cookie remains valid.
- Static build-time token distribution is not supported.
- Direct `window.open` calls for protected media endpoints are not allowed because browser navigation requests do not include the API token header. - Direct `window.open` calls for protected media endpoints are not allowed because browser navigation requests do not include the API token header.
- Download actions for original files and markdown exports must use authenticated blob fetches plus controlled browser download triggers. - Download actions for original files and markdown exports must use authenticated blob fetches plus controlled browser download triggers.
- Revoke all temporary object URLs after replacement, unmount, or completion to prevent browser memory leaks. - Revoke all temporary object URLs after replacement, unmount, or completion to prevent browser memory leaks.
- `DocumentViewer` iframe previews must be restricted to safe MIME types and rendered with `sandbox`, restrictive `allow`, and `referrerPolicy="no-referrer"` attributes. Active or script-capable formats must not be embedded inline.
## Extension Checklist ## Extension Checklist

View File

@@ -2,37 +2,36 @@
## Runtime Services ## Runtime Services
`docker-compose.yml` defines the runtime stack: `docker-compose.yml` defines:
- `db` (Postgres 16, localhost-bound port `5432`) - `db` (Postgres 16)
- `redis` (Redis 7, localhost-bound port `6379`) - `redis` (Redis 7)
- `typesense` (Typesense 29, localhost-bound port `8108`) - `typesense` (Typesense 30.2)
- `api` (FastAPI backend, localhost-bound port `8000`) - `api` (FastAPI backend)
- `worker` (RQ background worker) - `worker` (RQ worker via `python -m app.worker.run_worker`)
- `frontend` (Vite UI, localhost-bound port `5173`) - `frontend` (Vite React UI)
## Named Volumes Persistent host bind mounts (default root `./data`, overridable with `DCM_DATA_DIR`):
- `${DCM_DATA_DIR:-./data}/db-data`
Persistent volumes: - `${DCM_DATA_DIR:-./data}/redis-data`
- `db-data` - `${DCM_DATA_DIR:-./data}/storage`
- `redis-data` - `${DCM_DATA_DIR:-./data}/typesense-data`
- `dcm-storage`
- `typesense-data`
Reset all persisted runtime data: Reset all persisted runtime data:
```bash ```bash
docker compose down -v docker compose down
rm -rf ${DCM_DATA_DIR:-./data}
``` ```
## Operational Commands ## Core Commands
Start or rebuild stack: Start or rebuild:
```bash ```bash
docker compose up --build -d docker compose up --build -d
``` ```
Stop stack: Stop:
```bash ```bash
docker compose down docker compose down
@@ -44,124 +43,127 @@ Tail logs:
docker compose logs -f docker compose logs -f
``` ```
Before running compose, provide explicit API tokens in your shell or project `.env` file: ## Host Bind Mounts
Compose is configured with host bind mounts for persistent data. Ensure host directories exist and are writable by the backend runtime user.
Backend and worker run as non-root user `uid=10001` inside containers. Compose bootstraps the storage bind mount through the one-shot `storage-init` service before either process starts. For manual inspection or repair of host-mounted storage paths:
```bash ```bash
export ADMIN_API_TOKEN="<random-admin-token>" mkdir -p ${DCM_DATA_DIR:-./data}/storage
export USER_API_TOKEN="<random-user-token>" sudo chown -R 10001:10001 ${DCM_DATA_DIR:-./data}/storage
sudo chmod -R u+rwX,g+rwX ${DCM_DATA_DIR:-./data}/storage
``` ```
Compose now fails fast if either token variable is missing. If permissions are incorrect, API startup fails with errors similar to:
- `PermissionError: [Errno 13] Permission denied: '/data/storage'`
- `FileNotFoundError` for `/data/storage/originals`
## Backend Configuration ## Frontend Build Baseline
Settings source: The frontend Dockerfile uses `node:20-slim` with a standard `npm ci --no-audit` install step and no npm-specific build tuning flags.
- Runtime settings class: `backend/app/core/config.py`
- API settings persistence: `backend/app/services/app_settings.py`
Key environment variables used by `api` and `worker` in compose: ## Authentication Model
- `APP_ENV`
- `DATABASE_URL`
- `REDIS_URL`
- `STORAGE_ROOT`
- `ADMIN_API_TOKEN`
- `USER_API_TOKEN`
- `PUBLIC_BASE_URL`
- `CORS_ORIGINS` (API service)
- `PROVIDER_BASE_URL_ALLOWLIST`
- `PROVIDER_BASE_URL_ALLOW_HTTP`
- `PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK`
- `TYPESENSE_PROTOCOL`
- `TYPESENSE_HOST`
- `TYPESENSE_PORT`
- `TYPESENSE_API_KEY`
- `TYPESENSE_COLLECTION_NAME`
Selected defaults from `Settings` (`backend/app/core/config.py`): - Legacy shared build-time frontend token behavior was removed.
- `upload_chunk_size = 4194304` - API now uses server-issued sessions that are stored in HttpOnly cookies (`dcm_session`) with a separate CSRF cookie (`dcm_csrf`).
- `max_upload_files_per_request = 50` - Bootstrap users are provisioned from environment:
- `max_upload_file_size_bytes = 26214400` - `AUTH_BOOTSTRAP_ADMIN_USERNAME`
- `max_upload_request_size_bytes = 104857600` - `AUTH_BOOTSTRAP_ADMIN_PASSWORD`
- `max_zip_members = 250` - optional `AUTH_BOOTSTRAP_USER_USERNAME`
- `max_zip_depth = 2` - optional `AUTH_BOOTSTRAP_USER_PASSWORD`
- `max_zip_member_uncompressed_bytes = 26214400` - Login brute-force protection is enabled by default and keyed by username and source IP:
- `max_zip_total_uncompressed_bytes = 157286400` - `AUTH_LOGIN_FAILURE_LIMIT`
- `max_zip_compression_ratio = 120.0` - `AUTH_LOGIN_FAILURE_WINDOW_SECONDS`
- `max_text_length = 500000` - `AUTH_LOGIN_LOCKOUT_BASE_SECONDS`
- `processing_log_max_document_sessions = 20` - `AUTH_LOGIN_LOCKOUT_MAX_SECONDS`
- `processing_log_max_unbound_entries = 400` - Frontend signs in through `/api/v1/auth/login` and relies on browser session persistence for valid cookie-backed sessions.
- `default_openai_model = "gpt-4.1-mini"`
- `default_openai_timeout_seconds = 45`
- `default_summary_model = "gpt-4.1-mini"`
- `default_routing_model = "gpt-4.1-mini"`
- `typesense_timeout_seconds = 120`
- `typesense_num_retries = 0`
## Frontend Configuration ## DEV And LIVE Configuration Matrix
Frontend runtime API target: Use `.env.example` as baseline. The table below documents user-managed settings and recommended values.
- `VITE_API_BASE` in `docker-compose.yml` frontend service
- `VITE_API_TOKEN` in `docker-compose.yml` frontend service (defaults to `USER_API_TOKEN` in compose, override to `ADMIN_API_TOKEN` when admin-only routes are needed)
Frontend API authentication behavior: | Variable | Local DEV (HTTP, docker-only) | LIVE (HTTPS behind reverse proxy) |
- `frontend/src/lib/api.ts` adds `Authorization: Bearer <VITE_API_TOKEN>` for all API requests only when `VITE_API_TOKEN` is non-empty | --- | --- | --- |
- requests are still sent without authorization when `VITE_API_TOKEN` is unset, which keeps unauthenticated endpoints such as `/api/v1/health` backward-compatible | `APP_ENV` | `development` | `production` |
| `HOST_BIND_IP` | `127.0.0.1` or local LAN bind if needed | `127.0.0.1` (publish behind proxy only) |
| `PUBLIC_BASE_URL` | `http://localhost:8000` or same-origin frontend host when proxying API through frontend | `https://app.example.com` when frontend proxies `/api`, or dedicated API origin if you intentionally keep split-origin routing |
| `VITE_API_BASE` | empty to use same-origin `/api/v1` through frontend proxy, or explicit local URL when bypassing proxy | empty or `/api/v1` for same-origin production routing; only use `https://api.example.com/api/v1` when you intentionally keep split-origin frontend/API traffic |
| `VITE_ALLOWED_HOSTS` | optional comma-separated hostnames, for example `localhost,docs.lan` | optional comma-separated public frontend hostnames, for example `app.example.com` |
| `CORS_ORIGINS` | `["http://localhost:5173","http://localhost:3000"]` | exact frontend origins only, for example `["https://app.example.com"]` |
| `REDIS_URL` | `redis://:<password>@redis:6379/0` in isolated local network | `rediss://:<password>@redis.internal:6379/0` |
| `REDIS_SECURITY_MODE` | `compat` or `auto` | `strict` |
| `REDIS_TLS_MODE` | `allow_insecure` or `auto` | `required` |
| `AUTH_LOGIN_FAILURE_LIMIT` | default `5` | tune to identity-protection policy and support requirements |
| `AUTH_LOGIN_FAILURE_WINDOW_SECONDS` | default `900` | tune to identity-protection policy and support requirements |
| `AUTH_LOGIN_LOCKOUT_BASE_SECONDS` | default `30` | tune to identity-protection policy and support requirements |
| `AUTH_LOGIN_LOCKOUT_MAX_SECONDS` | default `900` | tune to identity-protection policy and support requirements |
| `AUTH_COOKIE_DOMAIN` | empty (recommended; API always issues a host-only auth cookie) | optional parent domain only when you explicitly need a mirrored domain cookie, for example `docs.lan` |
| `AUTH_COOKIE_SAMESITE` | `auto` | `none` only for truly cross-site frontend/API deployments; keep `auto` for same-site subdomains such as `docs.lan` and `api.docs.lan` |
| `PROVIDER_BASE_URL_ALLOW_HTTP` | `true` only when intentionally testing local HTTP provider endpoints | `false` |
| `PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK` | `true` only for trusted local development targets | `false` |
| `PROVIDER_BASE_URL_ALLOWLIST` | allow needed test hosts | explicit production allowlist, for example `["api.openai.com"]` |
| `PROCESSING_LOG_STORE_MODEL_IO_TEXT` | `false` by default; temporary `true` only for controlled debugging | `false` |
| `PROCESSING_LOG_STORE_PAYLOAD_TEXT` | `false` by default; temporary `true` only for controlled debugging | `false` |
| `CONTENT_EXPORT_MAX_DOCUMENTS` | default `250` or lower based on host memory | tuned to production capacity |
| `CONTENT_EXPORT_MAX_TOTAL_BYTES` | default `52428800` (50 MiB) or lower | tuned to production capacity |
| `CONTENT_EXPORT_RATE_LIMIT_PER_MINUTE` | default `6` | tuned to API throughput and abuse model |
Frontend container runtime behavior: `PUBLIC_BASE_URL` must point to the backend API public URL, not the frontend URL.
- the container runs as non-root `node`
- `/app` is owned by `node` in `frontend/Dockerfile` so Vite can create runtime temp config files under `/app`
Frontend local commands: ## HTTPS Proxy Deployment Notes
```bash This application supports both:
cd frontend && npm run dev - local HTTP-only operation (no TLS termination in containers)
cd frontend && npm run build - HTTPS deployment behind a reverse proxy that handles TLS
cd frontend && npm run preview
```
## Settings Persistence Recommended LIVE pattern:
1. Proxy terminates TLS and forwards to `api` and `frontend` internal HTTP endpoints.
Application-level settings managed from the UI are persisted by backend settings service: 2. Keep container published ports bound to localhost or internal network.
- file path: `<STORAGE_ROOT>/settings.json` 3. Set `PUBLIC_BASE_URL` and `VITE_API_BASE` to final HTTPS URLs.
- endpoints: `/api/v1/settings`, `/api/v1/settings/reset`, `/api/v1/settings/handwriting` 4. Set `CORS_ORIGINS` to exact HTTPS frontend origins.
5. Credentialed CORS is enabled and constrained for cookie-based sessions with strict origin allowlists.
Settings include:
- upload defaults
- display options
- processing-log retention options (`keep_document_sessions`, `keep_unbound_entries`)
- provider configuration
- OCR, summary, and routing task settings
- predefined paths and tags
- handwriting-style clustering settings
Read sanitization is resilient to corrupt persisted provider rows. If a persisted provider entry fails URL validation, the entry is skipped and defaults are used when no valid provider remains. This prevents unrelated read endpoints from failing due to stale invalid provider data.
Retention settings are used by worker cleanup and by `POST /api/v1/processing/logs/trim` when trim query values are not provided.
## Security Controls ## Security Controls
- Privileged APIs are token-gated with bearer auth: - CORS uses explicit origin allowlist only; broad origin regex matching is removed.
- `documents` endpoints: user token or admin token - Worker Redis startup validates URL auth and TLS policy before consuming jobs.
- `settings` and `processing/logs` endpoints: admin token only - Provider API keys are encrypted at rest with standard AEAD (`cryptography` Fernet).
- Authentication fails closed when `ADMIN_API_TOKEN` is not configured. - legacy `enc-v1` payloads are read for backward compatibility
- Provider base URLs are validated on settings updates and before outbound model calls: - new writes use `enc-v2`
- allowlist enforcement (`PROVIDER_BASE_URL_ALLOWLIST`) - Processing logs default to metadata-only persistence.
- scheme restrictions (`https` by default) - Login endpoint applies escalating temporary lockout on repeated failed credentials using Redis-backed subject keys for username and source IP.
- local/private-network blocking and per-request DNS revalidation checks for outbound runtime calls - Markdown export enforces:
- Upload and archive safety guards are enforced: - max document count
- `POST /api/v1/documents/upload` requires `Content-Length` and enforces file-count, per-file size, and total request size limits - max total markdown bytes
- `OPTIONS /api/v1/documents/upload` CORS preflight is excluded from `Content-Length` enforcement - per-user Redis-backed rate limit
- ZIP member count, per-member uncompressed size, total decompressed size, and compression-ratio guards - spool-file streaming to avoid unbounded memory archives
- Processing logs redact sensitive payload and text fields, and trim endpoints enforce retention caps from runtime config. - User-role document access is owner-scoped for non-admin accounts.
- Compose hardening defaults:
- host ports bind to `127.0.0.1` unless `HOST_BIND_IP` override is set ## Frontend Runtime
- `api`, `worker`, and `frontend` drop all Linux capabilities and set `no-new-privileges`
- backend and frontend containers run as non-root users by default - Frontend no longer consumes `VITE_API_TOKEN`.
- Frontend image target is environment-driven:
- `APP_ENV=development` builds the `development` target and runs Vite dev server
- `APP_ENV=production` builds the `production` target and serves static assets through unprivileged Nginx
- Frontend Docker targets are selected from `APP_ENV`, so use `development` or `production` values.
- Production frontend Nginx uses non-root runtime plus `/tmp` temp-path configuration so it can run with container capability dropping enabled.
- Vite dev server host allowlist uses the union of:
- hostnames extracted from `CORS_ORIGINS`
- optional explicit hostnames from `VITE_ALLOWED_HOSTS`
- `VITE_ALLOWED_HOSTS` only affects development mode where Vite is running.
- API auth cookies support optional domain and SameSite configuration through `AUTH_COOKIE_DOMAIN` and `AUTH_COOKIE_SAMESITE`.
- HTTPS cookie security detection falls back to `PUBLIC_BASE_URL` scheme when proxy headers are missing.
- CSRF validation accepts header matches against any `dcm_csrf` cookie value in the request, covering stale plus fresh duplicate-cookie transitions.
- Session authentication is cookie-based; browser reloads and new tabs can reuse an active session until it expires or is revoked.
- Protected media and file download flows still use authenticated fetch plus blob/object URL handling.
## Validation Checklist ## Validation Checklist
After operational or configuration changes, verify: After configuration changes:
- `GET /api/v1/health` is healthy - `GET /api/v1/health` returns healthy response
- frontend can list, upload, and search documents - login succeeds for bootstrap admin user
- processing worker logs show successful task execution - admin can upload, search, open preview, download, and export markdown
- settings save or reset works and persists after restart - user account can only access its own documents
- admin-only settings and processing logs are not accessible by user role
- `docker compose logs -f api worker` shows no startup validation failures

View File

@@ -1,100 +1,147 @@
services: services:
# One-shot init container: prepares the host bind-mounted storage tree
# before `api` and `worker` start (both depend_on this completing).
# Runs as root so it can chown the mount for the non-root runtime user.
storage-init:
build:
context: ./backend
# Root is required to fix ownership on the host bind mount.
user: "0:0"
command:
- "sh"
- "-c"
- >
mkdir -p /data/storage/originals /data/storage/derived/previews /data/storage/tmp &&
chown -R 10001:10001 /data/storage &&
chmod -R u+rwX,g+rwX /data/storage
volumes:
# Same host path the api/worker services mount at /data/storage.
- ${DCM_DATA_DIR:-./data}/storage:/data/storage
# One-shot job: never restart after it exits.
restart: "no"
db: db:
image: postgres:16-alpine image: postgres:16-alpine
environment: environment:
POSTGRES_USER: dcm POSTGRES_USER: ${POSTGRES_USER:?POSTGRES_USER must be set}
POSTGRES_PASSWORD: dcm POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}
POSTGRES_DB: dcm POSTGRES_DB: ${POSTGRES_DB:?POSTGRES_DB must be set}
ports:
- "${HOST_BIND_IP:-127.0.0.1}:5432:5432"
volumes: volumes:
- db-data:/var/lib/postgresql/data - ${DCM_DATA_DIR:-./data}/db-data:/var/lib/postgresql/data
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready -U dcm -d dcm"] test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:?POSTGRES_USER must be set} -d ${POSTGRES_DB:?POSTGRES_DB must be set}"]
interval: 10s interval: 10s
timeout: 5s timeout: 5s
retries: 10 retries: 10
restart: unless-stopped
networks:
- internal
redis: redis:
image: redis:7-alpine image: redis:7-alpine
ports: command:
- "${HOST_BIND_IP:-127.0.0.1}:6379:6379" - "redis-server"
- "--appendonly"
- "yes"
- "--requirepass"
- "${REDIS_PASSWORD:?REDIS_PASSWORD must be set}"
volumes: volumes:
- redis-data:/data - ${DCM_DATA_DIR:-./data}/redis-data:/data
networks:
- internal
typesense: typesense:
image: typesense/typesense:29.0 image: typesense/typesense:30.2.rc6
command: command:
- "--data-dir=/data" - "--data-dir=/data"
- "--api-key=dcm-typesense-key" - "--api-key=${TYPESENSE_API_KEY:?TYPESENSE_API_KEY must be set}"
- "--enable-cors" - "--enable-cors"
ports:
- "${HOST_BIND_IP:-127.0.0.1}:8108:8108"
volumes: volumes:
- typesense-data:/data - ${DCM_DATA_DIR:-./data}/typesense-data:/data
restart: unless-stopped
networks:
- internal
api: api:
build: build:
context: ./backend context: ./backend
environment: environment:
APP_ENV: development APP_ENV: ${APP_ENV:-development}
DATABASE_URL: postgresql+psycopg://dcm:dcm@db:5432/dcm DATABASE_URL: ${DATABASE_URL:?DATABASE_URL must be set}
REDIS_URL: redis://redis:6379/0 REDIS_URL: ${REDIS_URL:?REDIS_URL must be set}
REDIS_SECURITY_MODE: ${REDIS_SECURITY_MODE:-auto}
REDIS_TLS_MODE: ${REDIS_TLS_MODE:-auto}
STORAGE_ROOT: /data/storage STORAGE_ROOT: /data/storage
ADMIN_API_TOKEN: ${ADMIN_API_TOKEN:?ADMIN_API_TOKEN must be set} AUTH_BOOTSTRAP_ADMIN_USERNAME: ${AUTH_BOOTSTRAP_ADMIN_USERNAME:?AUTH_BOOTSTRAP_ADMIN_USERNAME must be set}
USER_API_TOKEN: ${USER_API_TOKEN:?USER_API_TOKEN must be set} AUTH_BOOTSTRAP_ADMIN_PASSWORD: ${AUTH_BOOTSTRAP_ADMIN_PASSWORD:?AUTH_BOOTSTRAP_ADMIN_PASSWORD must be set}
PROVIDER_BASE_URL_ALLOWLIST: '${PROVIDER_BASE_URL_ALLOWLIST:-["api.openai.com"]}' AUTH_BOOTSTRAP_USER_USERNAME: ${AUTH_BOOTSTRAP_USER_USERNAME:-}
PROVIDER_BASE_URL_ALLOW_HTTP: ${PROVIDER_BASE_URL_ALLOW_HTTP:-false} AUTH_BOOTSTRAP_USER_PASSWORD: ${AUTH_BOOTSTRAP_USER_PASSWORD:-}
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK: ${PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK:-false} AUTH_LOGIN_FAILURE_LIMIT: ${AUTH_LOGIN_FAILURE_LIMIT:-5}
AUTH_LOGIN_FAILURE_WINDOW_SECONDS: ${AUTH_LOGIN_FAILURE_WINDOW_SECONDS:-900}
AUTH_LOGIN_LOCKOUT_BASE_SECONDS: ${AUTH_LOGIN_LOCKOUT_BASE_SECONDS:-30}
AUTH_LOGIN_LOCKOUT_MAX_SECONDS: ${AUTH_LOGIN_LOCKOUT_MAX_SECONDS:-900}
APP_SETTINGS_ENCRYPTION_KEY: ${APP_SETTINGS_ENCRYPTION_KEY:?APP_SETTINGS_ENCRYPTION_KEY must be set}
PROVIDER_BASE_URL_ALLOWLIST: '${PROVIDER_BASE_URL_ALLOWLIST:-[]}'
PROVIDER_BASE_URL_ALLOW_HTTP: ${PROVIDER_BASE_URL_ALLOW_HTTP:-true}
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK: ${PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK:-true}
PROCESSING_LOG_STORE_MODEL_IO_TEXT: ${PROCESSING_LOG_STORE_MODEL_IO_TEXT:-false}
PROCESSING_LOG_STORE_PAYLOAD_TEXT: ${PROCESSING_LOG_STORE_PAYLOAD_TEXT:-false}
CONTENT_EXPORT_MAX_DOCUMENTS: ${CONTENT_EXPORT_MAX_DOCUMENTS:-250}
CONTENT_EXPORT_MAX_TOTAL_BYTES: ${CONTENT_EXPORT_MAX_TOTAL_BYTES:-52428800}
CONTENT_EXPORT_RATE_LIMIT_PER_MINUTE: ${CONTENT_EXPORT_RATE_LIMIT_PER_MINUTE:-6}
OCR_LANGUAGES: eng,deu OCR_LANGUAGES: eng,deu
PUBLIC_BASE_URL: ${PUBLIC_BASE_URL:-http://localhost:8000} PUBLIC_BASE_URL: ${PUBLIC_BASE_URL:-http://localhost:8000}
CORS_ORIGINS: '${CORS_ORIGINS:-["http://localhost:5173","http://localhost:3000"]}' CORS_ORIGINS: '${CORS_ORIGINS:-["http://localhost:5173","http://localhost:3000"]}'
TYPESENSE_PROTOCOL: http TYPESENSE_PROTOCOL: http
TYPESENSE_HOST: typesense TYPESENSE_HOST: typesense
TYPESENSE_PORT: 8108 TYPESENSE_PORT: 8108
TYPESENSE_API_KEY: dcm-typesense-key TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:?TYPESENSE_API_KEY must be set}
TYPESENSE_COLLECTION_NAME: documents TYPESENSE_COLLECTION_NAME: documents
ports: # ports:
- "${HOST_BIND_IP:-127.0.0.1}:8000:8000" # - "${HOST_BIND_IP:-127.0.0.1}:8000:8000"
security_opt: security_opt:
- no-new-privileges:true - no-new-privileges:true
cap_drop: cap_drop:
- ALL - ALL
volumes: volumes:
- ./backend/app:/app/app - ./backend/app:/app/app
- dcm-storage:/data - ${DCM_DATA_DIR:-./data}/storage:/data/storage
depends_on: depends_on:
db: db:
condition: service_healthy condition: service_healthy
redis: redis:
condition: service_started condition: service_started
storage-init:
condition: service_completed_successfully
typesense: typesense:
condition: service_started condition: service_started
networks:
npm_proxy:
ipv4_address: 192.168.98.41
internal:
restart: unless-stopped
worker: worker:
build: build:
context: ./backend context: ./backend
command: ["rq", "worker", "dcm", "--url", "redis://redis:6379/0"] command: ["python", "-m", "app.worker.run_worker"]
environment: environment:
APP_ENV: development APP_ENV: ${APP_ENV:-development}
DATABASE_URL: postgresql+psycopg://dcm:dcm@db:5432/dcm DATABASE_URL: ${DATABASE_URL:?DATABASE_URL must be set}
REDIS_URL: redis://redis:6379/0 REDIS_URL: ${REDIS_URL:?REDIS_URL must be set}
REDIS_SECURITY_MODE: ${REDIS_SECURITY_MODE:-auto}
REDIS_TLS_MODE: ${REDIS_TLS_MODE:-auto}
STORAGE_ROOT: /data/storage STORAGE_ROOT: /data/storage
ADMIN_API_TOKEN: ${ADMIN_API_TOKEN:?ADMIN_API_TOKEN must be set} APP_SETTINGS_ENCRYPTION_KEY: ${APP_SETTINGS_ENCRYPTION_KEY:?APP_SETTINGS_ENCRYPTION_KEY must be set}
USER_API_TOKEN: ${USER_API_TOKEN:?USER_API_TOKEN must be set} PROVIDER_BASE_URL_ALLOWLIST: '${PROVIDER_BASE_URL_ALLOWLIST:-[]}'
PROVIDER_BASE_URL_ALLOWLIST: '${PROVIDER_BASE_URL_ALLOWLIST:-["api.openai.com"]}' PROVIDER_BASE_URL_ALLOW_HTTP: ${PROVIDER_BASE_URL_ALLOW_HTTP:-true}
PROVIDER_BASE_URL_ALLOW_HTTP: ${PROVIDER_BASE_URL_ALLOW_HTTP:-false} PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK: ${PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK:-true}
PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK: ${PROVIDER_BASE_URL_ALLOW_PRIVATE_NETWORK:-false} PROCESSING_LOG_STORE_MODEL_IO_TEXT: ${PROCESSING_LOG_STORE_MODEL_IO_TEXT:-false}
PROCESSING_LOG_STORE_PAYLOAD_TEXT: ${PROCESSING_LOG_STORE_PAYLOAD_TEXT:-false}
OCR_LANGUAGES: eng,deu OCR_LANGUAGES: eng,deu
PUBLIC_BASE_URL: http://localhost:8000 PUBLIC_BASE_URL: ${PUBLIC_BASE_URL:-http://localhost:8000}
TYPESENSE_PROTOCOL: http TYPESENSE_PROTOCOL: http
TYPESENSE_HOST: typesense TYPESENSE_HOST: typesense
TYPESENSE_PORT: 8108 TYPESENSE_PORT: 8108
TYPESENSE_API_KEY: dcm-typesense-key TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:?TYPESENSE_API_KEY must be set}
TYPESENSE_COLLECTION_NAME: documents TYPESENSE_COLLECTION_NAME: documents
volumes: volumes:
- ./backend/app:/app/app - ./backend/app:/app/app
- dcm-storage:/data - ${DCM_DATA_DIR:-./data}/storage:/data/storage
security_opt: security_opt:
- no-new-privileges:true - no-new-privileges:true
cap_drop: cap_drop:
@@ -104,17 +151,27 @@ services:
condition: service_healthy condition: service_healthy
redis: redis:
condition: service_started condition: service_started
storage-init:
condition: service_completed_successfully
typesense: typesense:
condition: service_started condition: service_started
restart: unless-stopped
networks:
- internal
frontend: frontend:
build: build:
context: ./frontend context: ./frontend
target: ${APP_ENV:-development}
args:
VITE_API_BASE: ${VITE_API_BASE:-}
environment: environment:
VITE_API_BASE: ${VITE_API_BASE:-http://localhost:8000/api/v1} VITE_API_BASE: ${VITE_API_BASE:-}
VITE_API_TOKEN: ${VITE_API_TOKEN:-${USER_API_TOKEN:-}} VITE_API_PROXY_TARGET: ${VITE_API_PROXY_TARGET:-http://api:8000}
ports: CORS_ORIGINS: '${CORS_ORIGINS:-["http://localhost:5173","http://localhost:3000"]}'
- "${HOST_BIND_IP:-127.0.0.1}:5173:5173" VITE_ALLOWED_HOSTS: ${VITE_ALLOWED_HOSTS:-}
# ports:
# - "${HOST_BIND_IP:-127.0.0.1}:5173:5173"
volumes: volumes:
- ./frontend/src:/app/src - ./frontend/src:/app/src
- ./frontend/index.html:/app/index.html - ./frontend/index.html:/app/index.html
@@ -126,9 +183,14 @@ services:
- no-new-privileges:true - no-new-privileges:true
cap_drop: cap_drop:
- ALL - ALL
networks:
npm_proxy:
ipv4_address: 192.168.98.40
internal:
restart: unless-stopped
volumes: networks:
db-data: internal:
redis-data: driver: bridge
dcm-storage: npm_proxy:
typesense-data: external: true

View File

@@ -1,11 +1,11 @@
FROM node:22-alpine FROM node:20-slim AS base
WORKDIR /app WORKDIR /app
COPY package.json /app/package.json COPY package.json /app/package.json
COPY package-lock.json /app/package-lock.json COPY package-lock.json /app/package-lock.json
RUN npm ci RUN npm ci --no-audit \
RUN chown -R node:node /app && chown -R node:node /app
COPY --chown=node:node tsconfig.json /app/tsconfig.json COPY --chown=node:node tsconfig.json /app/tsconfig.json
COPY --chown=node:node tsconfig.node.json /app/tsconfig.node.json COPY --chown=node:node tsconfig.node.json /app/tsconfig.node.json
@@ -13,8 +13,32 @@ COPY --chown=node:node vite.config.ts /app/vite.config.ts
COPY --chown=node:node index.html /app/index.html COPY --chown=node:node index.html /app/index.html
COPY --chown=node:node src /app/src COPY --chown=node:node src /app/src
FROM base AS development
EXPOSE 5173 EXPOSE 5173
USER node USER node
CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0", "--port", "5173"] CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0", "--port", "5173"]
# Build stage: produce the static Vite bundle from the shared "base" stage.
FROM base AS build
# VITE_API_BASE is inlined into the bundle at build time; an empty value lets
# the frontend fall back to same-origin "/api/v1" resolution at runtime.
ARG VITE_API_BASE=
ENV VITE_API_BASE=${VITE_API_BASE}
RUN npm run build
# Production stage: serve the compiled assets with Nginx running unprivileged.
FROM nginx:1.27-alpine AS production
COPY nginx-main.conf /etc/nginx/nginx.conf
COPY nginx.conf /etc/nginx/conf.d/default.conf
COPY --from=build /app/dist /usr/share/nginx/html
# Pre-create temp directories under /tmp and hand ownership to UID/GID 101
# (the nginx user in the official image) so the server runs without root.
RUN mkdir -p /tmp/client_temp /tmp/proxy_temp /tmp/fastcgi_temp /tmp/uwsgi_temp /tmp/scgi_temp \
&& chown -R 101:101 /tmp /var/log/nginx /usr/share/nginx/html
# Same port as the Vite dev server so compose port mappings stay unchanged.
EXPOSE 5173
USER 101:101
ENTRYPOINT ["nginx"]
CMD ["-g", "daemon off;"]

22
frontend/nginx-main.conf Normal file
View File

@@ -0,0 +1,22 @@
# Minimal main config for an unprivileged Nginx: the PID file and every temp
# path are relocated under /tmp so a non-root worker can write them.
worker_processes auto;
pid /tmp/nginx.pid;
events {
    worker_connections 1024;
}
http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;
    sendfile on;
    keepalive_timeout 65;
    # Writable temp locations for request bodies and proxy buffering.
    client_body_temp_path /tmp/client_temp;
    proxy_temp_path /tmp/proxy_temp;
    fastcgi_temp_path /tmp/fastcgi_temp;
    uwsgi_temp_path /tmp/uwsgi_temp;
    scgi_temp_path /tmp/scgi_temp;
    # Pull in the per-site server block(s), e.g. conf.d/default.conf.
    include /etc/nginx/conf.d/*.conf;
}

22
frontend/nginx.conf Normal file
View File

@@ -0,0 +1,22 @@
server {
    # Listen on the same port the Vite dev server used, so the compose
    # service definition works for both development and production images.
    listen 5173;
    listen [::]:5173;
    server_name _;
    # Permit large document uploads to pass through to the API.
    client_max_body_size 100m;
    root /usr/share/nginx/html;
    index index.html;
    # Forward API traffic to the backend container, preserving client
    # identity and scheme headers for the upstream service.
    location /api/ {
        proxy_pass http://api:8000;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_set_header X-Real-IP $remote_addr;
    }
    # SPA fallback: unknown paths serve index.html for client-side routing.
    location / {
        try_files $uri $uri/ /index.html;
    }
}

View File

@@ -3,9 +3,11 @@
*/ */
import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import { useCallback, useEffect, useMemo, useRef, useState } from 'react';
import type { JSX } from 'react'; import type { JSX } from 'react';
import { LogOut, User } from 'lucide-react';
import ActionModal from './components/ActionModal'; import ActionModal from './components/ActionModal';
import DocumentGrid from './components/DocumentGrid'; import DocumentGrid from './components/DocumentGrid';
import LoginScreen from './components/LoginScreen';
import DocumentViewer from './components/DocumentViewer'; import DocumentViewer from './components/DocumentViewer';
import PathInput from './components/PathInput'; import PathInput from './components/PathInput';
import ProcessingLogPanel from './components/ProcessingLogPanel'; import ProcessingLogPanel from './components/ProcessingLogPanel';
@@ -17,22 +19,26 @@ import {
downloadBlobFile, downloadBlobFile,
deleteDocument, deleteDocument,
exportContentsMarkdown, exportContentsMarkdown,
getCurrentAuthSession,
getAppSettings, getAppSettings,
listDocuments, listDocuments,
listPaths, listPaths,
listProcessingLogs, listProcessingLogs,
listTags, listTags,
listTypes, listTypes,
loginWithPassword,
logoutCurrentSession,
resetAppSettings, resetAppSettings,
searchDocuments, searchDocuments,
trashDocument, trashDocument,
updateAppSettings, updateAppSettings,
uploadDocuments, uploadDocuments,
} from './lib/api'; } from './lib/api';
import type { AppSettings, AppSettingsUpdate, DmsDocument, ProcessingLogEntry } from './types'; import type { AppSettings, AppSettingsUpdate, AuthUser, DmsDocument, ProcessingLogEntry } from './types';
type AppScreen = 'documents' | 'settings'; type AppScreen = 'documents' | 'settings';
type DocumentView = 'active' | 'trash'; type DocumentView = 'active' | 'trash';
type AuthPhase = 'checking' | 'unauthenticated' | 'authenticated';
interface DialogOption { interface DialogOption {
key: string; key: string;
@@ -51,6 +57,10 @@ interface DialogState {
*/ */
export default function App(): JSX.Element { export default function App(): JSX.Element {
const DEFAULT_PAGE_SIZE = 12; const DEFAULT_PAGE_SIZE = 12;
const [authPhase, setAuthPhase] = useState<AuthPhase>('checking');
const [authUser, setAuthUser] = useState<AuthUser | null>(null);
const [authError, setAuthError] = useState<string | null>(null);
const [isAuthenticating, setIsAuthenticating] = useState<boolean>(false);
const [screen, setScreen] = useState<AppScreen>('documents'); const [screen, setScreen] = useState<AppScreen>('documents');
const [documentView, setDocumentView] = useState<DocumentView>('active'); const [documentView, setDocumentView] = useState<DocumentView>('active');
const [documents, setDocuments] = useState<DmsDocument[]>([]); const [documents, setDocuments] = useState<DmsDocument[]>([]);
@@ -82,6 +92,7 @@ export default function App(): JSX.Element {
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
const [dialogState, setDialogState] = useState<DialogState | null>(null); const [dialogState, setDialogState] = useState<DialogState | null>(null);
const dialogResolverRef = useRef<((value: string) => void) | null>(null); const dialogResolverRef = useRef<((value: string) => void) | null>(null);
const isAdmin = authUser?.role === 'admin';
const pageSize = useMemo(() => { const pageSize = useMemo(() => {
const configured = appSettings?.display?.cards_per_page; const configured = appSettings?.display?.cards_per_page;
@@ -118,6 +129,71 @@ export default function App(): JSX.Element {
} }
}, []); }, []);
/**
 * Clears workspace state when authentication context changes or session is revoked.
 * Resets navigation, document lists, selection, filters, catalogs, settings,
 * and processing-log state to initial values so no data from a previous
 * user's session can leak into the next one.
 */
const resetApplicationState = useCallback((): void => {
  setScreen('documents');
  setDocumentView('active');
  setDocuments([]);
  setTotalDocuments(0);
  setCurrentPage(1);
  setSearchText('');
  setActiveSearchQuery('');
  setSelectedDocumentId(null);
  setSelectedDocumentIds([]);
  setExportPathInput('');
  setTagFilter('');
  setTypeFilter('');
  setPathFilter('');
  setProcessedFrom('');
  setProcessedTo('');
  setKnownTags([]);
  setKnownPaths([]);
  setKnownTypes([]);
  setAppSettings(null);
  setSettingsSaveAction(null);
  setProcessingLogs([]);
  setProcessingLogError(null);
  setError(null);
}, []);
/**
 * Exchanges submitted credentials for a server-issued session and activates the app shell.
 * On failure the error message is surfaced to the login screen and all
 * workspace state is cleared so stale data never survives a failed login.
 */
const handleLogin = useCallback(async (username: string, password: string): Promise<void> => {
  setIsAuthenticating(true);
  setAuthError(null);
  try {
    const payload = await loginWithPassword(username, password);
    setAuthUser(payload.user);
    setAuthPhase('authenticated');
    setError(null);
  } catch (caughtError) {
    // Normalize unknown throwables into a displayable message.
    const message = caughtError instanceof Error ? caughtError.message : 'Login failed';
    setAuthError(message);
    setAuthUser(null);
    setAuthPhase('unauthenticated');
    resetApplicationState();
  } finally {
    setIsAuthenticating(false);
  }
}, [resetApplicationState]);
/**
 * Revokes current session server-side when possible and always clears local auth state.
 */
const handleLogout = useCallback(async (): Promise<void> => {
  setError(null);
  try {
    await logoutCurrentSession();
  } catch {
    // Best-effort revocation: even if the API call fails (e.g. network down),
    // the user is still signed out locally below.
  }
  setAuthUser(null);
  setAuthError(null);
  setAuthPhase('unauthenticated');
  resetApplicationState();
}, [resetApplicationState]);
const loadCatalogs = useCallback(async (): Promise<void> => { const loadCatalogs = useCallback(async (): Promise<void> => {
const [tags, paths, types] = await Promise.all([listTags(true), listPaths(true), listTypes(true)]); const [tags, paths, types] = await Promise.all([listTags(true), listPaths(true), listTypes(true)]);
setKnownTags(tags); setKnownTags(tags);
@@ -185,6 +261,10 @@ export default function App(): JSX.Element {
]); ]);
const loadSettings = useCallback(async (): Promise<void> => { const loadSettings = useCallback(async (): Promise<void> => {
if (!isAdmin) {
setAppSettings(null);
return;
}
setError(null); setError(null);
try { try {
const payload = await getAppSettings(); const payload = await getAppSettings();
@@ -192,9 +272,14 @@ export default function App(): JSX.Element {
} catch (caughtError) { } catch (caughtError) {
setError(caughtError instanceof Error ? caughtError.message : 'Failed to load settings'); setError(caughtError instanceof Error ? caughtError.message : 'Failed to load settings');
} }
}, []); }, [isAdmin]);
const loadProcessingTimeline = useCallback(async (options?: { silent?: boolean }): Promise<void> => { const loadProcessingTimeline = useCallback(async (options?: { silent?: boolean }): Promise<void> => {
if (!isAdmin) {
setProcessingLogs([]);
setProcessingLogError(null);
return;
}
const silent = options?.silent ?? false; const silent = options?.silent ?? false;
if (!silent) { if (!silent) {
setIsLoadingLogs(true); setIsLoadingLogs(true);
@@ -210,18 +295,44 @@ export default function App(): JSX.Element {
setIsLoadingLogs(false); setIsLoadingLogs(false);
} }
} }
}, []); }, [isAdmin]);
useEffect(() => { useEffect(() => {
const resolveSession = async (): Promise<void> => {
try {
const sessionPayload = await getCurrentAuthSession();
setAuthUser(sessionPayload.user);
setAuthError(null);
setAuthPhase('authenticated');
} catch {
setAuthUser(null);
setAuthPhase('unauthenticated');
resetApplicationState();
}
};
void resolveSession();
}, [resetApplicationState]);
useEffect(() => {
if (authPhase !== 'authenticated') {
return;
}
const bootstrap = async (): Promise<void> => { const bootstrap = async (): Promise<void> => {
try { try {
if (isAdmin) {
await Promise.all([loadDocuments(), loadCatalogs(), loadSettings(), loadProcessingTimeline()]); await Promise.all([loadDocuments(), loadCatalogs(), loadSettings(), loadProcessingTimeline()]);
return;
}
await Promise.all([loadDocuments(), loadCatalogs()]);
setAppSettings(null);
setProcessingLogs([]);
setProcessingLogError(null);
} catch (caughtError) { } catch (caughtError) {
setError(caughtError instanceof Error ? caughtError.message : 'Failed to initialize application'); setError(caughtError instanceof Error ? caughtError.message : 'Failed to initialize application');
} }
}; };
void bootstrap(); void bootstrap();
}, [loadCatalogs, loadDocuments, loadProcessingTimeline, loadSettings]); }, [authPhase, isAdmin, loadCatalogs, loadDocuments, loadProcessingTimeline, loadSettings]);
useEffect(() => { useEffect(() => {
setSelectedDocumentIds([]); setSelectedDocumentIds([]);
@@ -229,13 +340,25 @@ export default function App(): JSX.Element {
}, [documentView, pageSize]); }, [documentView, pageSize]);
useEffect(() => { useEffect(() => {
if (!isAdmin && screen === 'settings') {
setScreen('documents');
}
}, [isAdmin, screen]);
useEffect(() => {
if (authPhase !== 'authenticated') {
return;
}
if (screen !== 'documents') { if (screen !== 'documents') {
return; return;
} }
void loadDocuments(); void loadDocuments();
}, [loadDocuments, screen]); }, [authPhase, loadDocuments, screen]);
useEffect(() => { useEffect(() => {
if (authPhase !== 'authenticated') {
return;
}
if (screen !== 'documents') { if (screen !== 'documents') {
return; return;
} }
@@ -243,9 +366,12 @@ export default function App(): JSX.Element {
void loadDocuments({ silent: true }); void loadDocuments({ silent: true });
}, 3000); }, 3000);
return () => window.clearInterval(pollInterval); return () => window.clearInterval(pollInterval);
}, [loadDocuments, screen]); }, [authPhase, loadDocuments, screen]);
useEffect(() => { useEffect(() => {
if (authPhase !== 'authenticated' || !isAdmin) {
return;
}
if (screen !== 'documents') { if (screen !== 'documents') {
return; return;
} }
@@ -254,7 +380,7 @@ export default function App(): JSX.Element {
void loadProcessingTimeline({ silent: true }); void loadProcessingTimeline({ silent: true });
}, 1500); }, 1500);
return () => window.clearInterval(pollInterval); return () => window.clearInterval(pollInterval);
}, [loadProcessingTimeline, screen]); }, [authPhase, isAdmin, loadProcessingTimeline, screen]);
const selectedDocument = useMemo( const selectedDocument = useMemo(
() => documents.find((document) => document.id === selectedDocumentId) ?? null, () => documents.find((document) => document.id === selectedDocumentId) ?? null,
@@ -299,13 +425,17 @@ export default function App(): JSX.Element {
}); });
} }
if (isAdmin) {
await Promise.all([loadDocuments(), loadCatalogs(), loadProcessingTimeline()]); await Promise.all([loadDocuments(), loadCatalogs(), loadProcessingTimeline()]);
} else {
await Promise.all([loadDocuments(), loadCatalogs()]);
}
} catch (caughtError) { } catch (caughtError) {
setError(caughtError instanceof Error ? caughtError.message : 'Upload failed'); setError(caughtError instanceof Error ? caughtError.message : 'Upload failed');
} finally { } finally {
setIsUploading(false); setIsUploading(false);
} }
}, [appSettings, loadCatalogs, loadDocuments, loadProcessingTimeline, presentDialog]); }, [appSettings, isAdmin, loadCatalogs, loadDocuments, loadProcessingTimeline, presentDialog]);
const handleSearch = useCallback(async (): Promise<void> => { const handleSearch = useCallback(async (): Promise<void> => {
setSelectedDocumentIds([]); setSelectedDocumentIds([]);
@@ -579,14 +709,35 @@ export default function App(): JSX.Element {
setCurrentPage(1); setCurrentPage(1);
}, []); }, []);
if (authPhase === 'checking') {
return (
<main className="auth-shell">
<section className="auth-card">
<h1>LedgerDock</h1>
<p>Checking current session...</p>
</section>
</main>
);
}
if (authPhase !== 'authenticated') {
return <LoginScreen error={authError} isSubmitting={isAuthenticating} onSubmit={handleLogin} />;
}
return ( return (
<main className="app-shell"> <main className="app-shell">
<header className="topbar"> <header className="topbar">
<div> <div className="topbar-inner">
<div className="topbar-brand">
<h1>LedgerDock</h1> <h1>LedgerDock</h1>
<p>Document command deck for OCR, routing intelligence, and controlled metadata ops.</p> <p>Document command deck for OCR, routing intelligence, and controlled metadata ops.</p>
<p className="topbar-auth-status">
<User className="topbar-user-icon" aria-hidden="true" />
You are currently signed in as <span className="topbar-current-username">{authUser?.username}</span>
</p>
</div> </div>
<div className="topbar-controls"> <div className="topbar-controls">
<div className="topbar-primary-row">
<div className="topbar-nav-group"> <div className="topbar-nav-group">
<button <button
type="button" type="button"
@@ -608,6 +759,7 @@ export default function App(): JSX.Element {
> >
Trash Trash
</button> </button>
{isAdmin && (
<button <button
type="button" type="button"
className={screen === 'settings' ? 'active-view-button' : 'secondary-action'} className={screen === 'settings' ? 'active-view-button' : 'secondary-action'}
@@ -615,6 +767,16 @@ export default function App(): JSX.Element {
> >
Settings Settings
</button> </button>
)}
</div>
<button
type="button"
className="secondary-action topbar-icon-action"
onClick={() => void handleLogout()}
aria-label="Sign out"
>
<LogOut className="topbar-signout-icon" aria-hidden="true" />
</button>
</div> </div>
{screen === 'documents' && ( {screen === 'documents' && (
@@ -623,7 +785,7 @@ export default function App(): JSX.Element {
</div> </div>
)} )}
{screen === 'settings' && ( {screen === 'settings' && isAdmin && (
<div className="topbar-settings-group"> <div className="topbar-settings-group">
<button type="button" className="secondary-action" onClick={() => void handleResetSettings()} disabled={isSavingSettings}> <button type="button" className="secondary-action" onClick={() => void handleResetSettings()} disabled={isSavingSettings}>
Reset To Defaults Reset To Defaults
@@ -634,11 +796,12 @@ export default function App(): JSX.Element {
</div> </div>
)} )}
</div> </div>
</div>
</header> </header>
{error && <p className="error-banner">{error}</p>} {error && <p className="error-banner">{error}</p>}
{screen === 'settings' && ( {screen === 'settings' && isAdmin && (
<SettingsScreen <SettingsScreen
settings={appSettings} settings={appSettings}
isSaving={isSavingSettings} isSaving={isSavingSettings}
@@ -762,7 +925,8 @@ export default function App(): JSX.Element {
requestConfirmation={requestConfirmation} requestConfirmation={requestConfirmation}
/> />
</section> </section>
{processingLogError && <p className="error-banner">{processingLogError}</p>} {isAdmin && processingLogError && <p className="error-banner">{processingLogError}</p>}
{isAdmin && (
<ProcessingLogPanel <ProcessingLogPanel
entries={processingLogs} entries={processingLogs}
isLoading={isLoadingLogs} isLoading={isLoadingLogs}
@@ -772,6 +936,7 @@ export default function App(): JSX.Element {
typingAnimationEnabled={typingAnimationEnabled} typingAnimationEnabled={typingAnimationEnabled}
onClear={() => void handleClearProcessingLogs()} onClear={() => void handleClearProcessingLogs()}
/> />
)}
</> </>
)} )}

View File

@@ -19,6 +19,47 @@ import type { DmsDocument, DmsDocumentDetail } from '../types';
import PathInput from './PathInput'; import PathInput from './PathInput';
import TagInput from './TagInput'; import TagInput from './TagInput';
/** Raster image MIME types the viewer will render inline via an <img> tag. */
const SAFE_IMAGE_PREVIEW_MIME_TYPES = new Set<string>([
  'image/bmp',
  'image/gif',
  'image/jpeg',
  'image/jpg',
  'image/png',
  'image/webp',
]);
/** Document MIME types the viewer will render inside a sandboxed iframe. */
const SAFE_IFRAME_PREVIEW_MIME_TYPES = new Set<string>([
  'application/json',
  'application/pdf',
  'text/csv',
  'text/markdown',
  'text/plain',
]);
/**
 * Normalizes a MIME value for comparison: drops any ";parameter" suffix,
 * trims surrounding whitespace, and lowercases. Nullish/blank input yields ''.
 */
function normalizeMimeType(mimeType: string | null | undefined): string {
  const baseType = mimeType?.split(';')[0]?.trim();
  return baseType ? baseType.toLowerCase() : '';
}
/**
 * Reports whether a normalized MIME type may be shown as an image preview.
 */
function isSafeImagePreviewMimeType(mimeType: string): boolean {
  return SAFE_IMAGE_PREVIEW_MIME_TYPES.has(mimeType);
}
/**
 * Reports whether a normalized MIME type may be shown in the sandboxed iframe preview.
 */
function isSafeIframePreviewMimeType(mimeType: string): boolean {
  return SAFE_IFRAME_PREVIEW_MIME_TYPES.has(mimeType);
}
/** /**
* Defines props for the selected document viewer panel. * Defines props for the selected document viewer panel.
*/ */
@@ -60,6 +101,30 @@ export default function DocumentViewer({
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
const previewObjectUrlRef = useRef<string | null>(null); const previewObjectUrlRef = useRef<string | null>(null);
/**
* Resolves normalized MIME type used by preview safety checks.
*/
const previewMimeType = useMemo(() => normalizeMimeType(document?.mime_type), [document?.mime_type]);
/**
* Resolves whether selected document should render as a safe image element in preview.
*/
const isImageDocument = useMemo(() => {
return isSafeImagePreviewMimeType(previewMimeType);
}, [previewMimeType]);
/**
* Resolves whether selected document should render in sandboxed iframe preview.
*/
const canRenderIframePreview = useMemo(() => {
return isSafeIframePreviewMimeType(previewMimeType);
}, [previewMimeType]);
/**
* Resolves whether selected document supports any inline preview mode.
*/
const canRenderInlinePreview = isImageDocument || canRenderIframePreview;
/** /**
* Syncs editable metadata fields whenever selection changes. * Syncs editable metadata fields whenever selection changes.
*/ */
@@ -100,6 +165,12 @@ export default function DocumentViewer({
setIsLoadingPreview(false); setIsLoadingPreview(false);
return; return;
} }
if (!canRenderInlinePreview) {
revokePreviewObjectUrl();
setPreviewObjectUrl(null);
setIsLoadingPreview(false);
return;
}
let cancelled = false; let cancelled = false;
setIsLoadingPreview(true); setIsLoadingPreview(true);
@@ -131,7 +202,7 @@ export default function DocumentViewer({
cancelled = true; cancelled = true;
revokePreviewObjectUrl(); revokePreviewObjectUrl();
}; };
}, [document?.id]); }, [document?.id, canRenderInlinePreview]);
/** /**
* Refreshes editable metadata from list updates only while form is clean. * Refreshes editable metadata from list updates only while form is clean.
@@ -183,16 +254,6 @@ export default function DocumentViewer({
}; };
}, [document?.id]); }, [document?.id]);
/**
* Resolves whether selected document should render as an image element in preview.
*/
const isImageDocument = useMemo(() => {
if (!document) {
return false;
}
return document.mime_type.startsWith('image/');
}, [document]);
/** /**
* Extracts provider/transcription errors from document metadata for user visibility. * Extracts provider/transcription errors from document metadata for user visibility.
*/ */
@@ -482,11 +543,22 @@ export default function DocumentViewer({
{previewObjectUrl ? ( {previewObjectUrl ? (
isImageDocument ? ( isImageDocument ? (
<img src={previewObjectUrl} alt={document.original_filename} /> <img src={previewObjectUrl} alt={document.original_filename} />
) : canRenderIframePreview ? (
<iframe
src={previewObjectUrl}
title={document.original_filename}
sandbox=""
referrerPolicy="no-referrer"
allow="clipboard-read 'none'; clipboard-write 'none'; geolocation 'none'; microphone 'none'; camera 'none'; payment 'none'; usb 'none'; fullscreen 'none'"
loading="lazy"
/>
) : ( ) : (
<iframe src={previewObjectUrl} title={document.original_filename} /> <p className="small">Preview blocked for this file type. Download to inspect safely.</p>
) )
) : isLoadingPreview ? ( ) : isLoadingPreview ? (
<p className="small">Loading preview...</p> <p className="small">Loading preview...</p>
) : !canRenderInlinePreview ? (
<p className="small">Preview blocked for this file type. Download to inspect safely.</p>
) : ( ) : (
<p className="small">Preview unavailable for this document.</p> <p className="small">Preview unavailable for this document.</p>
)} )}

View File

@@ -0,0 +1,71 @@
/**
 * Login screen for session-based authentication before loading protected application views.
 */
import { FormEvent, useState } from 'react';
import type { JSX } from 'react';
interface LoginScreenProps {
  // Error message from the most recent failed login attempt, or null when clean.
  error: string | null;
  // True while a login request is in flight; disables form controls.
  isSubmitting: boolean;
  // Parent-supplied handler that performs the credential exchange.
  onSubmit: (username: string, password: string) => Promise<void>;
}
/**
 * Renders credential form used to issue per-user API sessions.
 *
 * @param error - Message shown beneath the form after a failed login.
 * @param isSubmitting - Disables inputs and swaps the button label while pending.
 * @param onSubmit - Async credential handler owned by the parent component.
 */
export default function LoginScreen({
  error,
  isSubmitting,
  onSubmit,
}: LoginScreenProps): JSX.Element {
  // Controlled state for the credential input fields.
  const [username, setUsername] = useState<string>('');
  const [password, setPassword] = useState<string>('');
  /**
   * Submits credentials and leaves result handling to parent application orchestration.
   * Re-entrant submits are ignored while a request is already pending.
   */
  const handleSubmit = (event: FormEvent<HTMLFormElement>): void => {
    event.preventDefault();
    if (isSubmitting) {
      return;
    }
    // Fire-and-forget: success/failure is reflected back via props by the parent.
    void onSubmit(username, password);
  };
  return (
    <main className="auth-shell">
      <section className="auth-card">
        <h1>LedgerDock</h1>
        <p>Sign in with your account to access documents and role-scoped controls.</p>
        <form onSubmit={handleSubmit} className="auth-form">
          <label>
            Username
            <input
              type="text"
              value={username}
              onChange={(event) => setUsername(event.target.value)}
              autoComplete="username"
              required
              disabled={isSubmitting}
            />
          </label>
          <label>
            Password
            <input
              type="password"
              value={password}
              onChange={(event) => setPassword(event.target.value)}
              autoComplete="current-password"
              required
              disabled={isSubmitting}
            />
          </label>
          <button type="submit" disabled={isSubmitting}>
            {isSubmitting ? 'Signing In...' : 'Sign In'}
          </button>
        </form>
        {error && <p className="error-banner">{error}</p>}
      </section>
    </main>
  );
}

View File

@@ -1,5 +1,14 @@
// @ts-expect-error Node strip-types runtime requires explicit .ts extension in ESM imports. // @ts-ignore Node strip-types runtime requires explicit .ts extension in ESM imports.
import { downloadDocumentContentMarkdown, downloadDocumentFile, getDocumentPreviewBlob, getDocumentThumbnailBlob } from './api.ts'; import {
downloadDocumentContentMarkdown,
downloadDocumentFile,
getCurrentAuthSession,
getDocumentPreviewBlob,
getDocumentThumbnailBlob,
loginWithPassword,
logoutCurrentSession,
updateDocumentMetadata,
} from './api.ts';
/** /**
* Throws when a test condition is false. * Throws when a test condition is false.
@@ -25,15 +34,35 @@ async function assertRejects(action: () => Promise<unknown>, expectedMessage: st
} }
/** /**
* Runs API helper tests for authenticated media and download flows. * Converts fetch inputs into a URL string for assertions.
*/
/**
 * Converts fetch inputs (string, URL, or Request) into a URL string for assertions.
 */
function toRequestUrl(input: RequestInfo | URL): string {
  if (input instanceof URL) {
    return input.toString();
  }
  return typeof input === 'string' ? input : input.url;
}
/**
* Runs API helper tests for authenticated media and auth session workflows.
*/ */
async function runApiTests(): Promise<void> { async function runApiTests(): Promise<void> {
const originalFetch = globalThis.fetch; const originalFetch = globalThis.fetch;
const globalWithDocument = globalThis as typeof globalThis & { document?: { cookie?: string } };
const originalDocument = globalWithDocument.document;
try { try {
const requestUrls: string[] = []; const requestUrls: string[] = [];
globalThis.fetch = (async (input: RequestInfo | URL): Promise<Response> => { const requestAuthHeaders: Array<string | null> = [];
requestUrls.push(typeof input === 'string' ? input : input.toString()); const requestCsrfHeaders: Array<string | null> = [];
globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
requestUrls.push(toRequestUrl(input));
const normalizedHeaders = new Headers(init?.headers);
requestAuthHeaders.push(normalizedHeaders.get('Authorization'));
requestCsrfHeaders.push(normalizedHeaders.get('x-csrf-token'));
return new Response('preview-bytes', { status: 200 }); return new Response('preview-bytes', { status: 200 });
}) as typeof fetch; }) as typeof fetch;
@@ -50,6 +79,68 @@ async function runApiTests(): Promise<void> {
requestUrls[1] === 'http://localhost:8000/api/v1/documents/doc-1/preview', requestUrls[1] === 'http://localhost:8000/api/v1/documents/doc-1/preview',
`Unexpected preview URL ${requestUrls[1]}`, `Unexpected preview URL ${requestUrls[1]}`,
); );
assert(requestAuthHeaders[0] === null, `Expected no auth header for thumbnail request, got "${requestAuthHeaders[0]}"`);
assert(requestAuthHeaders[1] === null, `Expected no auth header for preview request, got "${requestAuthHeaders[1]}"`);
assert(requestCsrfHeaders[0] === null, `Expected no CSRF header for thumbnail request, got "${requestCsrfHeaders[0]}"`);
assert(requestCsrfHeaders[1] === null, `Expected no CSRF header for preview request, got "${requestCsrfHeaders[1]}"`);
globalWithDocument.document = {
cookie: 'dcm_csrf=csrf-session-token',
};
let metadataCsrfHeader: string | null = null;
let metadataContentType: string | null = null;
let metadataAuthHeader: string | null = null;
globalThis.fetch = (async (_input: RequestInfo | URL, init?: RequestInit): Promise<Response> => {
const headers = new Headers(init?.headers);
metadataCsrfHeader = headers.get('x-csrf-token');
metadataAuthHeader = headers.get('Authorization');
metadataContentType = headers.get('Content-Type');
return new Response('{}', { status: 200 });
}) as typeof fetch;
await updateDocumentMetadata('doc-headers', { original_filename: 'renamed.pdf' });
assert(metadataContentType === 'application/json', `Expected JSON content type to be preserved, got "${metadataContentType}"`);
assert(metadataAuthHeader === null, `Expected no auth header, got "${metadataAuthHeader}"`);
assert(metadataCsrfHeader === 'csrf-session-token', `Expected CSRF header, got "${metadataCsrfHeader}"`);
globalThis.fetch = (async (): Promise<Response> => {
return new Response(
JSON.stringify({
access_token: 'issued-session-token',
token_type: 'bearer',
expires_at: '2026-03-01T10:30:00Z',
user: {
id: '3a42f5e0-b1ad-4f68-b2f4-3fa8c2fb31c9',
username: 'admin',
role: 'admin',
},
}),
{ status: 200, headers: { 'Content-Type': 'application/json' } },
);
}) as typeof fetch;
const loginPayload = await loginWithPassword('admin', 'password');
assert(loginPayload.access_token === 'issued-session-token', 'Unexpected issued session token in login payload');
assert(loginPayload.user.username === 'admin', 'Unexpected login user payload');
globalThis.fetch = (async (): Promise<Response> => {
return new Response(
JSON.stringify({
expires_at: '2026-03-01T10:30:00Z',
user: {
id: '3a42f5e0-b1ad-4f68-b2f4-3fa8c2fb31c9',
username: 'admin',
role: 'admin',
},
}),
{ status: 200, headers: { 'Content-Type': 'application/json' } },
);
}) as typeof fetch;
const sessionPayload = await getCurrentAuthSession();
assert(sessionPayload.user.role === 'admin', 'Expected admin role from auth session payload');
globalThis.fetch = (async (): Promise<Response> => {
return new Response('{}', { status: 200, headers: { 'Content-Type': 'application/json' } });
}) as typeof fetch;
await logoutCurrentSession();
globalThis.fetch = (async (): Promise<Response> => { globalThis.fetch = (async (): Promise<Response> => {
return new Response('file-bytes', { return new Response('file-bytes', {
@@ -79,6 +170,11 @@ async function runApiTests(): Promise<void> {
await assertRejects(async () => downloadDocumentContentMarkdown('doc-4'), 'Failed to download document markdown'); await assertRejects(async () => downloadDocumentContentMarkdown('doc-4'), 'Failed to download document markdown');
} finally { } finally {
globalThis.fetch = originalFetch; globalThis.fetch = originalFetch;
if (originalDocument !== undefined) {
globalWithDocument.document = originalDocument;
} else {
delete globalWithDocument.document;
}
} }
} }

View File

@@ -4,6 +4,8 @@
import type { import type {
AppSettings, AppSettings,
AppSettingsUpdate, AppSettingsUpdate,
AuthLoginResponse,
AuthSessionInfo,
DocumentListResponse, DocumentListResponse,
DmsDocument, DmsDocument,
DmsDocumentDetail, DmsDocumentDetail,
@@ -14,43 +16,140 @@ import type {
} from '../types'; } from '../types';
/** /**
* Resolves backend base URL from environment with localhost fallback. * Resolves backend base URL from environment with same-origin proxy fallback.
*/ */
const API_BASE = import.meta.env?.VITE_API_BASE ?? 'http://localhost:8000/api/v1'; function resolveApiBase(): string {
const envValue = import.meta.env?.VITE_API_BASE;
if (typeof envValue === 'string') {
const trimmed = envValue.trim().replace(/\/+$/, '');
if (trimmed) {
return trimmed;
}
}
if (typeof window !== 'undefined' && window.location?.origin) {
return '/api/v1';
}
return 'http://localhost:8000/api/v1';
}
const API_BASE = resolveApiBase();
/** /**
* Optional bearer token used for authenticated backend routes. * CSRF cookie contract used by authenticated requests.
*/ */
const API_TOKEN = import.meta.env?.VITE_API_TOKEN?.trim(); const CSRF_COOKIE_NAME = "dcm_csrf";
// Request header carrying the double-submit CSRF token to the backend.
const CSRF_HEADER_NAME = "x-csrf-token";
// Read-only HTTP methods that never mutate state and so skip the CSRF header.
const CSRF_SAFE_METHODS = new Set(["GET", "HEAD", "OPTIONS"]);
// sessionStorage key holding a fallback copy of the CSRF token, used when
// the cookie value is unavailable (see resolveCsrfToken's fallback path).
const CSRF_SESSION_STORAGE_KEY = "dcm_csrf_token";
type ApiRequestInit = Omit<RequestInit, 'headers'> & { headers?: HeadersInit }; type ApiRequestInit = Omit<RequestInit, 'headers'> & { headers?: HeadersInit };
type ApiErrorPayload = { detail?: string } | null;
/** /**
* Merges request headers and appends bearer authorization when configured. * Returns a cookie value by name for the active browser runtime.
*/ */
function buildRequestHeaders(headers?: HeadersInit): Headers | undefined { function getCookieValue(name: string): string | undefined {
if (!API_TOKEN && !headers) { if (typeof document === "undefined") {
return undefined; return undefined;
} }
const rawCookie = document.cookie ?? "";
return rawCookie
.split(";")
.map((entry) => entry.trim())
.find((entry) => entry.startsWith(`${name}=`))
?.slice(name.length + 1);
}
/**
 * Resolves the CSRF token for outgoing requests: prefers the live browser
 * cookie and falls back to the sessionStorage copy when the cookie is
 * absent or empty.
 */
function resolveCsrfToken(): string | undefined {
  return getCookieValue(CSRF_COOKIE_NAME) || loadStoredCsrfToken();
}
/**
 * Reads the persisted CSRF token from browser session storage.
 * Returns undefined outside a browser runtime or when the stored value is
 * missing or blank after trimming.
 */
function loadStoredCsrfToken(): string | undefined {
  if (typeof window === "undefined") {
    return undefined;
  }
  const storedValue = window.sessionStorage.getItem(CSRF_SESSION_STORAGE_KEY)?.trim();
  return storedValue ? storedValue : undefined;
}
/**
 * Stores a CSRF token in browser session storage, or clears the entry.
 *
 * @param token Token to persist; null, undefined, or whitespace-only
 *              values remove any previously stored token.
 */
function persistCsrfToken(token: string | undefined | null): void {
  if (typeof window === "undefined") {
    return;
  }
  const trimmed = typeof token === "string" ? token.trim() : "";
  if (trimmed) {
    window.sessionStorage.setItem(CSRF_SESSION_STORAGE_KEY, trimmed);
  } else {
    window.sessionStorage.removeItem(CSRF_SESSION_STORAGE_KEY);
  }
}
/**
 * Determines whether a request method must carry the CSRF header.
 *
 * @param method HTTP method name in any casing.
 * @returns True for state-changing methods outside the CSRF-safe set.
 */
function requiresCsrfHeader(method: string): boolean {
  return !CSRF_SAFE_METHODS.has(method.toUpperCase());
}
/**
* Merges request headers and appends CSRF metadata for state-changing requests.
*/
function buildRequestHeaders(method: string, headers?: HeadersInit): Headers | undefined {
const requestHeaders = new Headers(headers); const requestHeaders = new Headers(headers);
if (API_TOKEN) { if (method && requiresCsrfHeader(method)) {
requestHeaders.set('Authorization', `Bearer ${API_TOKEN}`); const csrfToken = resolveCsrfToken();
if (csrfToken) {
requestHeaders.set(CSRF_HEADER_NAME, csrfToken);
}
} }
return requestHeaders; return requestHeaders;
} }
/** /**
* Executes an API request with centralized auth-header handling. * Executes an API request with shared fetch options and CSRF handling.
*/ */
function apiRequest(input: string, init: ApiRequestInit = {}): Promise<Response> { function apiRequest(input: string, init: ApiRequestInit = {}): Promise<Response> {
const headers = buildRequestHeaders(init.headers); const method = init.method ?? "GET";
const headers = buildRequestHeaders(method, init.headers);
return fetch(input, { return fetch(input, {
...init, ...init,
credentials: 'include',
...(headers ? { headers } : {}), ...(headers ? { headers } : {}),
}); });
} }
/**
 * Extracts the backend `detail` error text from a JSON response body.
 *
 * @param response Failed fetch response whose body may contain JSON.
 * @returns Trimmed detail text, or an empty string when unavailable.
 */
async function responseErrorDetail(response: Response): Promise<string> {
  let payload: ApiErrorPayload;
  try {
    payload = (await response.json()) as ApiErrorPayload;
  } catch {
    // Non-JSON bodies (HTML error pages, empty bodies) carry no detail.
    return '';
  }
  const detail = payload?.detail;
  if (typeof detail === 'string' && detail.trim()) {
    return detail.trim();
  }
  return '';
}
/** /**
* Encodes query parameters while skipping undefined and null values. * Encodes query parameters while skipping undefined and null values.
*/ */
@@ -94,6 +193,65 @@ export function downloadBlobFile(blob: Blob, filename: string): void {
}, 0); }, 0);
} }
/**
 * Authenticates one user and returns authenticated session metadata.
 *
 * Uses a bare fetch (not apiRequest) because no CSRF token exists yet
 * before login; the issued token from the payload is persisted locally.
 *
 * @param username Account name; surrounding whitespace is stripped.
 * @param password Plain-text password forwarded to the login endpoint.
 * @returns Session metadata issued for the authenticated user.
 * @throws Error with backend detail text, or a generic message, on failure.
 */
export async function loginWithPassword(username: string, password: string): Promise<AuthLoginResponse> {
  const requestBody = JSON.stringify({ username: username.trim(), password });
  const response = await fetch(`${API_BASE}/auth/login`, {
    method: 'POST',
    credentials: 'include',
    headers: { 'Content-Type': 'application/json' },
    body: requestBody,
  });
  if (!response.ok) {
    const detail = await responseErrorDetail(response);
    throw new Error(detail || 'Login failed');
  }
  const payload = (await response.json()) as AuthLoginResponse;
  persistCsrfToken(payload.csrf_token);
  return payload;
}
/**
 * Loads currently authenticated user session metadata.
 *
 * Also refreshes the locally persisted CSRF token from the payload.
 *
 * @returns Active session metadata for the current user.
 * @throws Error with backend detail text, or a generic message, on failure.
 */
export async function getCurrentAuthSession(): Promise<AuthSessionInfo> {
  const response = await apiRequest(`${API_BASE}/auth/me`);
  if (!response.ok) {
    const detail = await responseErrorDetail(response);
    throw new Error(detail || 'Failed to load authentication session');
  }
  const payload = (await response.json()) as AuthSessionInfo;
  persistCsrfToken(payload.csrf_token);
  return payload;
}
/**
 * Revokes the current authenticated session.
 *
 * The local CSRF token is cleared regardless of the response so a stale
 * token never outlives the session; a 401 is treated as already logged out.
 *
 * @throws Error with backend detail text, or a generic message, on failure.
 */
export async function logoutCurrentSession(): Promise<void> {
  const response = await apiRequest(`${API_BASE}/auth/logout`, { method: 'POST' });
  persistCsrfToken(undefined);
  if (response.ok || response.status === 401) {
    return;
  }
  const detail = await responseErrorDetail(response);
  throw new Error(detail || 'Failed to logout');
}
/** /**
* Loads documents from the backend list endpoint. * Loads documents from the backend list endpoint.
*/ */
@@ -495,7 +653,8 @@ export async function updateAppSettings(payload: AppSettingsUpdate): Promise<App
body: JSON.stringify(payload), body: JSON.stringify(payload),
}); });
if (!response.ok) { if (!response.ok) {
throw new Error('Failed to update settings'); const detail = await responseErrorDetail(response);
throw new Error(detail ? `Failed to update settings: ${detail}` : 'Failed to update settings');
} }
return response.json() as Promise<AppSettings>; return response.json() as Promise<AppSettings>;
} }

View File

@@ -4,11 +4,58 @@
.app-shell { .app-shell {
width: min(1820px, 100% - 2rem); width: min(1820px, 100% - 2rem);
margin: 0 auto; margin: 0 auto;
padding: var(--space-3) 0 var(--space-4); padding: 0 0 var(--space-4);
display: grid; display: grid;
gap: var(--space-3); gap: var(--space-3);
} }
/* Full-viewport wrapper that centers the login card. */
.auth-shell {
min-height: 100vh;
display: grid;
place-items: center;
padding: var(--space-4) var(--space-2);
}
/* Login card container: capped width, soft gradient panel. */
.auth-card {
width: min(430px, 100%);
display: grid;
gap: var(--space-2);
padding: var(--space-3);
border: 1px solid var(--color-border-strong);
border-radius: var(--radius-lg);
background: linear-gradient(180deg, rgba(28, 42, 63, 0.95) 0%, rgba(20, 30, 47, 0.95) 100%);
box-shadow: var(--shadow-soft);
}
/* Card title scales with viewport width. */
.auth-card h1 {
margin: 0;
font-family: var(--font-display);
font-size: clamp(1.4rem, 2.1vw, 2rem);
}
/* Muted helper text under the title. */
.auth-card p {
margin: 0;
color: var(--color-text-muted);
font-size: 0.88rem;
}
/* Vertical stack for the credential fields. */
.auth-form {
display: grid;
gap: var(--space-2);
}
/* Labels stack caption above input. */
.auth-form label {
display: grid;
gap: 0.35rem;
font-size: 0.8rem;
color: var(--color-text-muted);
}
/* Submit button with a comfortable tap target. */
.auth-form button {
margin-top: 0.25rem;
min-height: 2.1rem;
}
.app-shell > * { .app-shell > * {
animation: rise-in 220ms ease both; animation: rise-in 220ms ease both;
} }
@@ -23,18 +70,33 @@
.topbar { .topbar {
position: sticky; position: sticky;
top: var(--space-2); top: 0;
z-index: 50; z-index: 50;
left: 0;
width: 100vw;
margin-left: calc(50% - 50vw);
margin-right: calc(50% - 50vw);
padding: 0;
border: 1px solid var(--color-border-strong);
border-radius: 0;
background: linear-gradient(180deg, rgba(28, 42, 63, 0.96) 0%, rgba(20, 30, 47, 0.96) 100%);
box-shadow: var(--shadow-soft);
backdrop-filter: blur(10px);
}
.topbar-inner {
width: min(1820px, 100% - 2rem);
margin: 0 auto;
display: grid; display: grid;
grid-template-columns: minmax(260px, 1fr) auto; grid-template-columns: minmax(260px, 1fr) auto;
gap: var(--space-3); gap: var(--space-3);
align-items: start; align-items: start;
padding: var(--space-3); padding: var(--space-3);
border: 1px solid var(--color-border-strong); }
border-radius: var(--radius-lg);
background: linear-gradient(180deg, rgba(28, 42, 63, 0.96) 0%, rgba(20, 30, 47, 0.96) 100%); .topbar-brand {
box-shadow: var(--shadow-soft); display: grid;
backdrop-filter: blur(10px); gap: 0;
} }
.topbar h1 { .topbar h1 {
@@ -50,12 +112,39 @@
font-size: 0.85rem; font-size: 0.85rem;
} }
/* Inline "signed in as" status row under the brand. */
.topbar-auth-status {
display: inline-flex;
align-items: center;
gap: 0.35rem;
margin-top: 0.45rem;
color: var(--color-text-muted);
font-size: 0.76rem;
}
/* Small user glyph next to the status text. */
.topbar-user-icon {
width: 0.85rem;
height: 0.85rem;
}
/* Current username rendered in monospace for emphasis. */
.topbar-current-username {
color: var(--color-text);
font-family: var(--font-mono);
font-size: 0.76rem;
}
.topbar-controls { .topbar-controls {
display: grid; display: grid;
gap: var(--space-2); gap: var(--space-2);
justify-items: end; justify-items: end;
} }
.topbar-primary-row {
display: flex;
align-items: center;
justify-content: flex-end;
gap: var(--space-2);
}
.topbar-nav-group, .topbar-nav-group,
.topbar-document-group, .topbar-document-group,
.topbar-settings-group { .topbar-settings-group {
@@ -65,6 +154,21 @@
gap: var(--space-2); gap: var(--space-2);
} }
/* Square icon-only button (e.g. sign-out) in the topbar. */
.topbar-icon-action {
width: 2.05rem;
min-height: 2.05rem;
padding: 0;
display: inline-flex;
align-items: center;
justify-content: center;
border-radius: var(--radius-xs);
}
/* Glyph sized to sit inside the square icon button. */
.topbar-signout-icon {
width: 0.92rem;
height: 0.92rem;
}
.topbar-document-group .upload-actions-inline { .topbar-document-group .upload-actions-inline {
display: flex; display: flex;
gap: var(--space-2); gap: var(--space-2);
@@ -1244,6 +1348,12 @@ button:disabled {
} }
.topbar { .topbar {
width: 100%;
margin-left: 0;
margin-right: 0;
}
.topbar-inner {
grid-template-columns: 1fr; grid-template-columns: 1fr;
} }
@@ -1252,10 +1362,16 @@ button:disabled {
} }
.topbar-nav-group, .topbar-nav-group,
.topbar-primary-row,
.topbar-document-group, .topbar-document-group,
.topbar-settings-group { .topbar-settings-group {
justify-content: flex-start; justify-content: flex-start;
} }
.topbar-primary-row {
justify-content: space-between;
width: 100%;
}
} }
@media (max-width: 1040px) { @media (max-width: 1040px) {
@@ -1340,12 +1456,14 @@ button:disabled {
@media (max-width: 560px) { @media (max-width: 560px) {
.topbar-nav-group, .topbar-nav-group,
.topbar-primary-row,
.topbar-document-group, .topbar-document-group,
.topbar-settings-group { .topbar-settings-group {
width: 100%; width: 100%;
} }
.topbar-nav-group button, .topbar-nav-group button,
.topbar-primary-row button,
.topbar-document-group button, .topbar-document-group button,
.topbar-settings-group button { .topbar-settings-group button {
flex: 1; flex: 1;

View File

@@ -58,6 +58,33 @@ export interface SearchResponse {
items: DmsDocument[]; items: DmsDocument[];
} }
/**
 * Represents one authenticated user identity returned by backend auth endpoints.
 */
export interface AuthUser {
  /** Unique identifier for the account. */
  id: string;
  /** Login name of the account. */
  username: string;
  /** Role assigned to the account. */
  role: 'admin' | 'user';
}
/**
 * Represents active authentication session metadata.
 */
export interface AuthSessionInfo {
  /** Authenticated user owning the session. */
  user: AuthUser;
  /** Session expiry timestamp string as returned by the backend. */
  expires_at: string;
  /** CSRF token to echo on state-changing requests; may be omitted. */
  csrf_token?: string;
}
/**
 * Represents login response payload with issued session metadata.
 * `user`, `expires_at`, and `csrf_token` are inherited from AuthSessionInfo,
 * so the previous duplicate `csrf_token` declaration here was removed.
 */
export interface AuthLoginResponse extends AuthSessionInfo {
  /** Optional access token issued at login. */
  access_token?: string;
  /** Token scheme; the backend always issues bearer tokens. */
  token_type: 'bearer';
}
/** /**
* Represents distinct document type values available for filter controls. * Represents distinct document type values available for filter controls.
*/ */

View File

@@ -15,5 +15,6 @@
"noFallthroughCasesInSwitch": true, "noFallthroughCasesInSwitch": true,
"types": ["vite/client", "react", "react-dom"] "types": ["vite/client", "react", "react-dom"]
}, },
"include": ["src"] "include": ["src"],
"exclude": ["src/**/*.test.ts", "src/**/*.test.tsx"]
} }

View File

@@ -1,14 +1,93 @@
/** /**
* Vite configuration for the DMS frontend application. * Vite configuration for the DMS frontend application.
*/ */
import { defineConfig } from 'vite'; import { defineConfig, loadEnv } from 'vite';
/**
 * Parses a comma-separated environment value into normalized entries.
 *
 * @param rawValue Raw comma-separated value.
 * @returns List of non-empty normalized entries.
 */
function parseCsvList(rawValue: string | undefined): string[] {
  const entries: string[] = [];
  for (const piece of (rawValue ?? '').split(',')) {
    const normalized = piece.trim();
    if (normalized.length > 0) {
      entries.push(normalized);
    }
  }
  return entries;
}
/**
 * Extracts hostnames from CORS origin values.
 *
 * Accepts either a JSON array string (the backend's native format) or a
 * plain comma-separated origin list; entries that are not absolute URLs
 * are ignored.
 *
 * @param rawValue JSON array string or comma-separated origin list.
 * @returns Hostnames parsed from valid origins.
 */
function parseCorsOriginHosts(rawValue: string | undefined): string[] {
  if (!rawValue) {
    return [];
  }
  let origins: string[];
  try {
    const parsed: unknown = JSON.parse(rawValue);
    if (Array.isArray(parsed)) {
      origins = parsed.filter((entry): entry is string => typeof entry === 'string');
    } else if (typeof parsed === 'string') {
      origins = [parsed];
    } else {
      origins = [];
    }
  } catch {
    // Not valid JSON — fall back to a comma-separated list.
    origins = parseCsvList(rawValue);
  }
  const hosts: string[] = [];
  for (const origin of origins) {
    try {
      const { hostname } = new URL(origin);
      if (hostname) {
        hosts.push(hostname);
      }
    } catch {
      // Skip entries that do not parse as absolute URLs.
    }
  }
  return hosts;
}
/**
* Builds the Vite allowed host list from environment-driven inputs.
*
* @param env Environment variable key-value map.
* @returns De-duplicated hostnames, or undefined to keep Vite defaults.
*/
function buildAllowedHosts(env: Record<string, string>): string[] | undefined {
const explicitHosts = parseCsvList(env.VITE_ALLOWED_HOSTS);
const corsOriginHosts = parseCorsOriginHosts(env.CORS_ORIGINS);
const mergedHosts = Array.from(new Set([...explicitHosts, ...corsOriginHosts]));
return mergedHosts.length > 0 ? mergedHosts : undefined;
}
/** /**
* Exports frontend build and dev-server settings. * Exports frontend build and dev-server settings.
*/ */
export default defineConfig({ export default defineConfig(({ mode }) => {
const env = loadEnv(mode, process.cwd(), '');
const allowedHosts = buildAllowedHosts(env);
const apiProxyTarget = env.VITE_API_PROXY_TARGET?.trim() || 'http://localhost:8000';
return {
server: { server: {
host: '0.0.0.0', host: '0.0.0.0',
port: 5173, port: 5173,
proxy: {
'/api': {
target: apiProxyTarget,
changeOrigin: false,
secure: false,
}, },
},
...(allowedHosts ? { allowedHosts } : {}),
},
};
}); });