"""Authenticated document CRUD, lifecycle, metadata, file access, and content export endpoints."""

import io
import re
import unicodedata
import zipfile
from datetime import datetime, time
from pathlib import Path
from typing import Annotated, Literal
from uuid import UUID

from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse, Response, StreamingResponse
from sqlalchemy import func, or_, select
from sqlalchemy.orm import Session

from app.core.config import get_settings, is_inline_preview_mime_type_safe
from app.db.base import get_session
from app.models.document import Document, DocumentStatus
from app.schemas.documents import (
    ContentExportRequest,
    DocumentDetailResponse,
    DocumentResponse,
    DocumentsListResponse,
    DocumentUpdateRequest,
    UploadConflict,
    UploadResponse,
)
from app.services.app_settings import read_predefined_paths_settings, read_predefined_tags_settings
from app.services.extractor import sniff_mime
from app.services.handwriting_style import delete_many_handwriting_style_documents
from app.services.processing_logs import log_processing_event, set_processing_log_autocommit
from app.services.storage import absolute_path, compute_sha256, store_bytes
from app.services.typesense_index import delete_many_documents_index, upsert_document_index
from app.worker.queue import get_processing_queue

router = APIRouter()
settings = get_settings()


def _parse_csv(value: str | None) -> list[str]:
    """Parses comma-separated query values into a normalized non-empty list."""
    if not value:
        return []
    return [part.strip() for part in value.split(",") if part.strip()]


def _parse_date(value: str | None) -> datetime | None:
    """Parses ISO date strings into UTC-naive midnight datetimes.

    Tries full ISO-8601 first, then a plain ``YYYY-MM-DD`` date; returns
    ``None`` for empty or unparseable values instead of raising.
    """
    if not value:
        return None
    try:
        return datetime.fromisoformat(value)
    except ValueError:
        pass
    try:
        date_value = datetime.strptime(value, "%Y-%m-%d").date()
        return datetime.combine(date_value, time.min)
    except ValueError:
        return None


def _apply_discovery_filters(
    statement,
    *,
    path_filter: str | None,
    tag_filter: str | None,
    type_filter: str | None,
    processed_from: str | None,
    processed_to: str | None,
):
    """Applies optional path/tag/type/date filters to list and search statements."""
    if path_filter and path_filter.strip():
        statement = statement.where(Document.logical_path.ilike(f"{path_filter.strip()}%"))
    tags = _parse_csv(tag_filter)
    if tags:
        statement = statement.where(Document.tags.overlap(tags))
    types = _parse_csv(type_filter)
    if types:
        # A type value may match the extension, MIME type, or image text type.
        type_clauses = []
        for value in types:
            lowered = value.lower()
            type_clauses.append(Document.extension.ilike(lowered))
            type_clauses.append(Document.mime_type.ilike(lowered))
            type_clauses.append(Document.image_text_type.ilike(lowered))
        statement = statement.where(or_(*type_clauses))
    processed_from_dt = _parse_date(processed_from)
    if processed_from_dt is not None:
        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at >= processed_from_dt)
    processed_to_dt = _parse_date(processed_to)
    if processed_to_dt is not None:
        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at <= processed_to_dt)
    return statement


def _summary_for_index(document: Document) -> str:
    """Resolves best-available summary text for semantic index updates outside worker pipeline."""
    candidate = document.metadata_json.get("summary_text")
    if isinstance(candidate, str) and candidate.strip():
        return candidate.strip()
    extracted = document.extracted_text.strip()
    if extracted:
        # Cap raw extracted text so index payloads stay bounded.
        return extracted[:12000]
    return f"{document.original_filename}\n{document.mime_type}\n{document.logical_path}"


def _normalize_tags(raw_tags: str | None) -> list[str]:
    """Parses comma-separated tags into a cleaned unique list (order preserved, max 50)."""
    if not raw_tags:
        return []
    tags = [tag.strip() for tag in raw_tags.split(",") if tag.strip()]
    return list(dict.fromkeys(tags))[:50]


def _sanitize_filename(filename: str) -> str:
    """Normalizes user-supplied filenames while preserving readability and extensions."""
    base = filename.strip().replace("\\", " ").replace("/", " ")
    base = re.sub(r"\s+", " ", base)
    return base[:512] or "document"


def _slugify_segment(value: str) -> str:
    """Creates a filesystem-safe slug for path segments and markdown file names."""
    normalized = unicodedata.normalize("NFKD", value)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    cleaned = re.sub(r"[^a-zA-Z0-9._ -]+", "", ascii_text).strip()
    compact = re.sub(r"\s+", "-", cleaned)
    compact = compact.strip(".-_")
    return compact[:120] or "document"


def _markdown_for_document(document: Document) -> str:
    """Builds a markdown representation of extracted document content and metadata."""
    lines = [
        f"# {document.original_filename}",
        "",
        f"- Document ID: `{document.id}`",
        f"- Logical Path: `{document.logical_path}`",
        f"- Source Path: `{document.source_relative_path}`",
        f"- Tags: {', '.join(document.tags) if document.tags else '(none)'}",
        "",
        "## Extracted Content",
        "",
    ]
    if document.extracted_text.strip():
        lines.append(document.extracted_text)
    else:
        lines.append("_No extracted text available for this document._")
    return "\n".join(lines).strip() + "\n"


def _markdown_filename(document: Document) -> str:
    """Builds a deterministic markdown filename for a single document export."""
    stem = Path(document.original_filename).stem or document.original_filename
    slug = _slugify_segment(stem)
    return f"{slug}-{str(document.id)[:8]}.md"


def _zip_entry_name(document: Document, used_names: set[str]) -> str:
    """Builds a unique zip entry path for a document markdown export.

    Collisions are resolved by appending an incrementing numeric suffix to the
    filename stem; ``used_names`` is mutated to record the chosen entry.
    """
    path_segments = [segment for segment in document.logical_path.split("/") if segment]
    sanitized_segments = [_slugify_segment(segment) for segment in path_segments]
    filename = _markdown_filename(document)
    base_entry = "/".join([*sanitized_segments, filename]) if sanitized_segments else filename
    entry = base_entry
    suffix = 1
    while entry in used_names:
        stem = Path(filename).stem
        ext = Path(filename).suffix
        candidate = f"{stem}-{suffix}{ext}"
        entry = "/".join([*sanitized_segments, candidate]) if sanitized_segments else candidate
        suffix += 1
    used_names.add(entry)
    return entry


def _resolve_previous_status(metadata_json: dict, fallback_status: DocumentStatus) -> DocumentStatus:
    """Resolves the status to restore from trash using recorded metadata."""
    raw_status = metadata_json.get("status_before_trash")
    if isinstance(raw_status, str):
        try:
            parsed = DocumentStatus(raw_status)
            # Never restore back into TRASHED; that would make restore a no-op.
            if parsed != DocumentStatus.TRASHED:
                return parsed
        except ValueError:
            pass
    return fallback_status


def _build_document_list_statement(
    only_trashed: bool,
    include_trashed: bool,
    path_prefix: str | None,
):
    """Builds a base SQLAlchemy select statement with lifecycle and path filters."""
    statement = select(Document)
    if only_trashed:
        statement = statement.where(Document.status == DocumentStatus.TRASHED)
    elif not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    if path_prefix:
        trimmed_prefix = path_prefix.strip()
        if trimmed_prefix:
            statement = statement.where(Document.logical_path.ilike(f"{trimmed_prefix}%"))
    return statement


def _enforce_upload_shape(files: list[UploadFile]) -> None:
    """Validates upload request shape against configured file-count bounds.

    Raises:
        HTTPException: 400 for empty requests, 413 when the file count exceeds
            ``settings.max_upload_files_per_request``.
    """
    if not files:
        raise HTTPException(status_code=400, detail="Upload request must include at least one file")
    if len(files) > settings.max_upload_files_per_request:
        raise HTTPException(
            status_code=413,
            detail=(
                "Upload request exceeds file count limit "
                f"({len(files)} > {settings.max_upload_files_per_request})"
            ),
        )


async def _read_upload_bytes(file: UploadFile, max_bytes: int) -> bytes:
    """Reads one upload file while enforcing per-file byte limits."""
    # Read one extra byte so an exact-limit file passes but an over-limit file is detected.
    data = await file.read(max_bytes + 1)
    if len(data) > max_bytes:
        raise HTTPException(
            status_code=413,
            detail=f"File '{file.filename or 'upload'}' exceeds per-file limit of {max_bytes} bytes",
        )
    return data


def _collect_document_tree(session: Session, root_document_id: UUID) -> list[tuple[int, Document]]:
    """Collects a document and all descendants for recursive permanent deletion.

    Returns (depth, document) pairs sorted deepest-first so children can be
    deleted before their parents.
    """
    queue: list[tuple[UUID, int]] = [(root_document_id, 0)]
    visited: set[UUID] = set()
    collected: list[tuple[int, Document]] = []
    while queue:
        current_id, depth = queue.pop(0)
        if current_id in visited:
            continue
        visited.add(current_id)
        document = session.execute(select(Document).where(Document.id == current_id)).scalar_one_or_none()
        if document is None:
            continue
        collected.append((depth, document))
        child_ids = session.execute(
            select(Document.id).where(Document.parent_document_id == current_id)
        ).scalars().all()
        for child_id in child_ids:
            queue.append((child_id, depth + 1))
    collected.sort(key=lambda item: item[0], reverse=True)
    return collected


@router.get("", response_model=DocumentsListResponse)
def list_documents(
    offset: int = Query(default=0, ge=0),
    limit: int = Query(default=50, ge=1, le=200),
    include_trashed: bool = Query(default=False),
    only_trashed: bool = Query(default=False),
    path_prefix: str | None = Query(default=None),
    path_filter: str | None = Query(default=None),
    tag_filter: str | None = Query(default=None),
    type_filter: str | None = Query(default=None),
    processed_from: str | None = Query(default=None),
    processed_to: str | None = Query(default=None),
    session: Session = Depends(get_session),
) -> DocumentsListResponse:
    """Returns paginated documents ordered by newest upload timestamp."""
    base_statement = _build_document_list_statement(
        only_trashed=only_trashed,
        include_trashed=include_trashed,
        path_prefix=path_prefix,
    )
    base_statement = _apply_discovery_filters(
        base_statement,
        path_filter=path_filter,
        tag_filter=tag_filter,
        type_filter=type_filter,
        processed_from=processed_from,
        processed_to=processed_to,
    )
    statement = base_statement.order_by(Document.created_at.desc()).offset(offset).limit(limit)
    items = session.execute(statement).scalars().all()
    # Count over the filtered subquery so totals match the applied filters.
    count_statement = select(func.count()).select_from(base_statement.subquery())
    total = session.execute(count_statement).scalar_one()
    return DocumentsListResponse(total=total, items=[DocumentResponse.model_validate(item) for item in items])


@router.get("/tags")
def list_tags(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Returns distinct tags currently assigned across all matching documents."""
    statement = select(Document.tags)
    if not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    rows = session.execute(statement).scalars().all()
    tags = {tag for row in rows for tag in row if tag}
    # Merge in admin-configured predefined tags so they appear even when unused.
    tags.update(
        str(item.get("value", "")).strip()
        for item in read_predefined_tags_settings()
        if str(item.get("value", "")).strip()
    )
    return {"tags": sorted(tags)}


@router.get("/paths")
def list_paths(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Returns distinct logical paths currently assigned across all matching documents."""
    statement = select(Document.logical_path)
    if not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    rows = session.execute(statement).scalars().all()
    paths = {row for row in rows if row}
    # Merge in admin-configured predefined paths so they appear even when unused.
    paths.update(
        str(item.get("value", "")).strip()
        for item in read_predefined_paths_settings()
        if str(item.get("value", "")).strip()
    )
    return {"paths": sorted(paths)}


@router.get("/types")
def list_types(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Returns distinct document type values from extension, MIME, and image text type."""
    statement = select(Document.extension, Document.mime_type, Document.image_text_type)
    if not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    rows = session.execute(statement).all()
    values: set[str] = set()
    for extension, mime_type, image_text_type in rows:
        for candidate in (extension, mime_type, image_text_type):
            normalized = str(candidate).strip().lower() if isinstance(candidate, str) else ""
            if normalized:
                values.add(normalized)
    return {"types": sorted(values)}


@router.post("/content-md/export")
def export_contents_markdown(
    payload: ContentExportRequest,
    session: Session = Depends(get_session),
) -> StreamingResponse:
    """Exports extracted contents for selected documents as individual markdown files in a ZIP archive."""
    has_document_ids = len(payload.document_ids) > 0
    has_path_prefix = bool(payload.path_prefix and payload.path_prefix.strip())
    if not has_document_ids and not has_path_prefix:
        raise HTTPException(status_code=400, detail="Provide document_ids or path_prefix for export")
    statement = select(Document)
    if has_document_ids:
        statement = statement.where(Document.id.in_(payload.document_ids))
    if has_path_prefix:
        statement = statement.where(Document.logical_path.ilike(f"{payload.path_prefix.strip()}%"))
    if payload.only_trashed:
        statement = statement.where(Document.status == DocumentStatus.TRASHED)
    elif not payload.include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    documents = session.execute(
        statement.order_by(Document.logical_path.asc(), Document.created_at.asc())
    ).scalars().all()
    if not documents:
        raise HTTPException(status_code=404, detail="No matching documents found for export")
    archive_buffer = io.BytesIO()
    used_entries: set[str] = set()
    with zipfile.ZipFile(archive_buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for document in documents:
            entry_name = _zip_entry_name(document, used_entries)
            archive.writestr(entry_name, _markdown_for_document(document))
    archive_buffer.seek(0)
    headers = {"Content-Disposition": 'attachment; filename="document-contents-md.zip"'}
    return StreamingResponse(archive_buffer, media_type="application/zip", headers=headers)


@router.get("/{document_id}", response_model=DocumentDetailResponse)
def get_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentDetailResponse:
    """Returns one document by unique identifier."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    return DocumentDetailResponse.model_validate(document)


@router.get("/{document_id}/download")
def download_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Downloads original document bytes for the requested document identifier."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    file_path = absolute_path(document.stored_relative_path)
    return FileResponse(path=file_path, filename=document.original_filename, media_type=document.mime_type)


@router.get("/{document_id}/preview")
def preview_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Streams trusted-safe MIME types inline and forces attachment for active script-capable types."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    original_path = absolute_path(document.stored_relative_path)
    common_headers = {"X-Content-Type-Options": "nosniff"}
    if not is_inline_preview_mime_type_safe(document.mime_type):
        # Unsafe types are downgraded to an opaque attachment download.
        return FileResponse(
            path=original_path,
            filename=document.original_filename,
            media_type="application/octet-stream",
            headers=common_headers,
        )
    return FileResponse(path=original_path, media_type=document.mime_type, headers=common_headers)


@router.get("/{document_id}/thumbnail")
def thumbnail_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Returns a generated thumbnail image for dashboard card previews."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    preview_relative_path = document.metadata_json.get("preview_relative_path")
    if not preview_relative_path:
        raise HTTPException(status_code=404, detail="Thumbnail not available")
    preview_path = absolute_path(preview_relative_path)
    if not preview_path.exists():
        raise HTTPException(status_code=404, detail="Thumbnail file not found")
    return FileResponse(path=preview_path)


@router.get("/{document_id}/content-md")
def download_document_content_markdown(document_id: UUID, session: Session = Depends(get_session)) -> Response:
    """Downloads extracted content for one document as a markdown file."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    markdown_content = _markdown_for_document(document)
    filename = _markdown_filename(document)
    # BUGFIX: previously the computed filename was discarded and the header
    # hardcoded a placeholder; use the deterministic per-document filename.
    headers = {"Content-Disposition": f'attachment; filename="{filename}"'}
    return Response(content=markdown_content, media_type="text/markdown; charset=utf-8", headers=headers)


@router.post("/upload", response_model=UploadResponse)
async def upload_documents(
    files: Annotated[list[UploadFile], File(description="Files to upload")],
    relative_paths: Annotated[list[str] | None, Form()] = None,
    logical_path: Annotated[str, Form()] = "Inbox",
    tags: Annotated[str | None, Form()] = None,
    conflict_mode: Annotated[Literal["ask", "replace", "duplicate"], Form()] = "ask",
    session: Session = Depends(get_session),
) -> UploadResponse:
    """Uploads files, records metadata, and enqueues asynchronous extraction tasks.

    In ``ask`` mode, SHA-256 duplicates are reported as conflicts and nothing
    is persisted; in ``replace``/``duplicate`` mode all files are stored.
    """
    _enforce_upload_shape(files)
    set_processing_log_autocommit(session, True)
    normalized_tags = _normalize_tags(tags)
    queue = get_processing_queue()
    uploaded: list[DocumentResponse] = []
    conflicts: list[UploadConflict] = []
    total_request_bytes = 0
    indexed_relative_paths = relative_paths or []
    prepared_uploads: list[dict[str, object]] = []
    # Phase 1: read, validate, and hash every file before persisting anything.
    for idx, file in enumerate(files):
        filename = file.filename or f"uploaded_{idx}"
        data = await _read_upload_bytes(file, settings.max_upload_file_size_bytes)
        total_request_bytes += len(data)
        if total_request_bytes > settings.max_upload_request_size_bytes:
            raise HTTPException(
                status_code=413,
                detail=(
                    "Upload request exceeds total size limit "
                    f"({total_request_bytes} > {settings.max_upload_request_size_bytes} bytes)"
                ),
            )
        sha256 = compute_sha256(data)
        source_relative_path = indexed_relative_paths[idx] if idx < len(indexed_relative_paths) else filename
        extension = Path(filename).suffix.lower()
        detected_mime = sniff_mime(data)
        log_processing_event(
            session=session,
            stage="upload",
            event="Upload request received",
            level="info",
            document_filename=filename,
            payload_json={
                "source_relative_path": source_relative_path,
                "logical_path": logical_path,
                "tags": normalized_tags,
                "mime_type": detected_mime,
                "size_bytes": len(data),
                "conflict_mode": conflict_mode,
            },
        )
        prepared_uploads.append(
            {
                "filename": filename,
                "data": data,
                "sha256": sha256,
                "source_relative_path": source_relative_path,
                "extension": extension,
                "mime_type": detected_mime,
            }
        )
        existing = session.execute(select(Document).where(Document.sha256 == sha256)).scalar_one_or_none()
        if existing and conflict_mode == "ask":
            log_processing_event(
                session=session,
                stage="upload",
                event="Upload conflict detected",
                level="warning",
                document_id=existing.id,
                document_filename=filename,
                payload_json={
                    "sha256": sha256,
                    "existing_document_id": str(existing.id),
                },
            )
            conflicts.append(
                UploadConflict(
                    original_filename=filename,
                    sha256=sha256,
                    existing_document_id=existing.id,
                )
            )
    if conflicts and conflict_mode == "ask":
        # Commit only the logged events; no documents are stored on conflict.
        session.commit()
        return UploadResponse(uploaded=[], conflicts=conflicts)
    # Phase 2: persist files, create records, and enqueue processing.
    for prepared in prepared_uploads:
        existing = session.execute(
            select(Document).where(Document.sha256 == str(prepared["sha256"]))
        ).scalar_one_or_none()
        replaces_document_id = existing.id if existing and conflict_mode == "replace" else None
        stored_relative_path = store_bytes(str(prepared["filename"]), bytes(prepared["data"]))
        document = Document(
            original_filename=str(prepared["filename"]),
            source_relative_path=str(prepared["source_relative_path"]),
            stored_relative_path=stored_relative_path,
            mime_type=str(prepared["mime_type"]),
            extension=str(prepared["extension"]),
            sha256=str(prepared["sha256"]),
            size_bytes=len(bytes(prepared["data"])),
            logical_path=logical_path,
            tags=list(normalized_tags),
            replaces_document_id=replaces_document_id,
            metadata_json={"upload": "web"},
        )
        session.add(document)
        session.flush()
        queue.enqueue("app.worker.tasks.process_document_task", str(document.id))
        log_processing_event(
            session=session,
            stage="upload",
            event="Document record created and queued",
            level="info",
            document=document,
            payload_json={
                "source_relative_path": document.source_relative_path,
                "stored_relative_path": document.stored_relative_path,
                "logical_path": document.logical_path,
                "tags": list(document.tags),
                "replaces_document_id": str(replaces_document_id) if replaces_document_id is not None else None,
            },
        )
        uploaded.append(DocumentResponse.model_validate(document))
    session.commit()
    return UploadResponse(uploaded=uploaded, conflicts=conflicts)


@router.patch("/{document_id}", response_model=DocumentResponse)
def update_document(
    document_id: UUID,
    payload: DocumentUpdateRequest,
    session: Session = Depends(get_session),
) -> DocumentResponse:
    """Updates document metadata and refreshes semantic index representation."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if payload.original_filename is not None:
        document.original_filename = _sanitize_filename(payload.original_filename)
    if payload.logical_path is not None:
        document.logical_path = payload.logical_path.strip() or "Inbox"
    if payload.tags is not None:
        document.tags = list(dict.fromkeys([tag.strip() for tag in payload.tags if tag.strip()]))[:50]
    try:
        # Best-effort: index refresh failures must not block the metadata update.
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        pass
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)


@router.post("/{document_id}/trash", response_model=DocumentResponse)
def trash_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Marks a document as trashed without deleting files from storage."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status != DocumentStatus.TRASHED:
        # Remember the prior status so restore can return the document to it.
        document.metadata_json = {
            **document.metadata_json,
            "status_before_trash": document.status.value,
        }
        document.status = DocumentStatus.TRASHED
    try:
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        pass
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)


@router.post("/{document_id}/restore", response_model=DocumentResponse)
def restore_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Restores a trashed document to its previous lifecycle status."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status == DocumentStatus.TRASHED:
        fallback = DocumentStatus.PROCESSED if document.processed_at else DocumentStatus.QUEUED
        restored_status = _resolve_previous_status(document.metadata_json, fallback)
        document.status = restored_status
        metadata_json = dict(document.metadata_json)
        metadata_json.pop("status_before_trash", None)
        document.metadata_json = metadata_json
    try:
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        pass
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)


@router.delete("/{document_id}")
def delete_document(document_id: UUID, session: Session = Depends(get_session)) -> dict[str, int]:
    """Permanently deletes a document and all descendant archive members including stored files."""
    root = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if root is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if root.status != DocumentStatus.TRASHED:
        raise HTTPException(status_code=400, detail="Move document to trash before permanent deletion")
    document_tree = _collect_document_tree(session=session, root_document_id=document_id)
    document_ids = [document.id for _, document in document_tree]
    try:
        delete_many_documents_index([str(current_id) for current_id in document_ids])
    except Exception:
        pass
    try:
        delete_many_handwriting_style_documents([str(current_id) for current_id in document_ids])
    except Exception:
        pass
    deleted_files = 0
    # Tree is sorted deepest-first, so children are removed before parents.
    for _, document in document_tree:
        source_path = absolute_path(document.stored_relative_path)
        if source_path.exists() and source_path.is_file():
            source_path.unlink(missing_ok=True)
            deleted_files += 1
        preview_relative_path = document.metadata_json.get("preview_relative_path")
        if isinstance(preview_relative_path, str):
            preview_path = absolute_path(preview_relative_path)
            if preview_path.exists() and preview_path.is_file():
                preview_path.unlink(missing_ok=True)
        session.delete(document)
    session.commit()
    return {"deleted_documents": len(document_tree), "deleted_files": deleted_files}


@router.post("/{document_id}/reprocess", response_model=DocumentResponse)
def reprocess_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Re-enqueues a document for extraction and suggestion processing."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status == DocumentStatus.TRASHED:
        raise HTTPException(status_code=400, detail="Restore document before reprocessing")
    queue = get_processing_queue()
    document.status = DocumentStatus.QUEUED
    try:
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        pass
    session.commit()
    queue.enqueue("app.worker.tasks.process_document_task", str(document.id))
    session.refresh(document)
    return DocumentResponse.model_validate(document)