# Source listing metadata (non-code residue preserved as comments):
# ledgerdock/backend/app/api/routes_documents.py
# 2026-02-21 09:44:18 -03:00
# 726 lines
# 28 KiB
# Python
"""Document CRUD, lifecycle, metadata, file access, and content export endpoints."""
import io
import re
import unicodedata
import zipfile
from datetime import datetime, time
from pathlib import Path
from typing import Annotated, Literal
from uuid import UUID
from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse, Response, StreamingResponse
from sqlalchemy import or_, func, select
from sqlalchemy.orm import Session
from app.services.app_settings import read_predefined_paths_settings, read_predefined_tags_settings
from app.db.base import get_session
from app.models.document import Document, DocumentStatus
from app.schemas.documents import (
ContentExportRequest,
DocumentDetailResponse,
DocumentResponse,
DocumentsListResponse,
DocumentUpdateRequest,
UploadConflict,
UploadResponse,
)
from app.services.extractor import sniff_mime
from app.services.handwriting_style import delete_many_handwriting_style_documents
from app.services.processing_logs import log_processing_event, set_processing_log_autocommit
from app.services.storage import absolute_path, compute_sha256, store_bytes
from app.services.typesense_index import delete_many_documents_index, upsert_document_index
from app.worker.queue import get_processing_queue
router = APIRouter()
def _parse_csv(value: str | None) -> list[str]:
"""Parses comma-separated query values into a normalized non-empty list."""
if not value:
return []
return [part.strip() for part in value.split(",") if part.strip()]
def _parse_date(value: str | None) -> datetime | None:
"""Parses ISO date strings into UTC-naive midnight datetimes."""
if not value:
return None
try:
parsed = datetime.fromisoformat(value)
return parsed
except ValueError:
pass
try:
date_value = datetime.strptime(value, "%Y-%m-%d").date()
return datetime.combine(date_value, time.min)
except ValueError:
return None
def _apply_discovery_filters(
    statement,
    *,
    path_filter: str | None,
    tag_filter: str | None,
    type_filter: str | None,
    processed_from: str | None,
    processed_to: str | None,
):
    """Adds optional path-prefix, tag-overlap, type, and processed-at range filters to a statement."""
    prefix = path_filter.strip() if path_filter else ""
    if prefix:
        statement = statement.where(Document.logical_path.ilike(f"{prefix}%"))
    tag_values = _parse_csv(tag_filter)
    if tag_values:
        statement = statement.where(Document.tags.overlap(tag_values))
    type_values = _parse_csv(type_filter)
    if type_values:
        # Each requested type may match extension, MIME type, or image text type.
        clauses = []
        for entry in type_values:
            lowered = entry.lower()
            clauses.extend(
                [
                    Document.extension.ilike(lowered),
                    Document.mime_type.ilike(lowered),
                    Document.image_text_type.ilike(lowered),
                ]
            )
        statement = statement.where(or_(*clauses))
    lower_bound = _parse_date(processed_from)
    if lower_bound is not None:
        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at >= lower_bound)
    upper_bound = _parse_date(processed_to)
    if upper_bound is not None:
        statement = statement.where(Document.processed_at.is_not(None), Document.processed_at <= upper_bound)
    return statement
def _summary_for_index(document: Document) -> str:
"""Resolves best-available summary text for semantic index updates outside worker pipeline."""
candidate = document.metadata_json.get("summary_text")
if isinstance(candidate, str) and candidate.strip():
return candidate.strip()
extracted = document.extracted_text.strip()
if extracted:
return extracted[:12000]
return f"{document.original_filename}\n{document.mime_type}\n{document.logical_path}"
def _normalize_tags(raw_tags: str | None) -> list[str]:
"""Parses comma-separated tags into a cleaned unique list."""
if not raw_tags:
return []
tags = [tag.strip() for tag in raw_tags.split(",") if tag.strip()]
return list(dict.fromkeys(tags))[:50]
def _sanitize_filename(filename: str) -> str:
"""Normalizes user-supplied filenames while preserving readability and extensions."""
base = filename.strip().replace("\\", " ").replace("/", " ")
base = re.sub(r"\s+", " ", base)
return base[:512] or "document"
def _slugify_segment(value: str) -> str:
"""Creates a filesystem-safe slug for path segments and markdown file names."""
normalized = unicodedata.normalize("NFKD", value)
ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
cleaned = re.sub(r"[^a-zA-Z0-9._ -]+", "", ascii_text).strip()
compact = re.sub(r"\s+", "-", cleaned)
compact = compact.strip(".-_")
return compact[:120] or "document"
def _markdown_for_document(document: Document) -> str:
"""Builds a markdown representation of extracted document content and metadata."""
lines = [
f"# {document.original_filename}",
"",
f"- Document ID: `{document.id}`",
f"- Logical Path: `{document.logical_path}`",
f"- Source Path: `{document.source_relative_path}`",
f"- Tags: {', '.join(document.tags) if document.tags else '(none)' }",
"",
"## Extracted Content",
"",
]
if document.extracted_text.strip():
lines.append(document.extracted_text)
else:
lines.append("_No extracted text available for this document._")
return "\n".join(lines).strip() + "\n"
def _markdown_filename(document: Document) -> str:
    """Derives a stable markdown filename from the original name's stem plus an id prefix."""
    original = document.original_filename
    stem = Path(original).stem
    if not stem:
        # Dot-files and similar have no stem; fall back to the full name.
        stem = original
    return f"{_slugify_segment(stem)}-{str(document.id)[:8]}.md"
def _zip_entry_name(document: Document, used_names: set[str]) -> str:
    """Produces a unique archive entry path mirroring the document's logical path.

    Mutates used_names by recording the chosen entry.
    """
    folders = [_slugify_segment(part) for part in document.logical_path.split("/") if part]
    filename = _markdown_filename(document)

    def joined(name: str) -> str:
        return "/".join([*folders, name]) if folders else name

    entry = joined(filename)
    stem, ext = Path(filename).stem, Path(filename).suffix
    counter = 1
    # Append an increasing numeric suffix until the entry is unique.
    while entry in used_names:
        entry = joined(f"{stem}-{counter}{ext}")
        counter += 1
    used_names.add(entry)
    return entry
def _resolve_previous_status(metadata_json: dict, fallback_status: DocumentStatus) -> DocumentStatus:
    """Returns the pre-trash status recorded in metadata, or the fallback when absent/invalid."""
    recorded = metadata_json.get("status_before_trash")
    if not isinstance(recorded, str):
        return fallback_status
    try:
        previous = DocumentStatus(recorded)
    except ValueError:
        return fallback_status
    # Never restore back into the trashed state itself.
    return previous if previous != DocumentStatus.TRASHED else fallback_status
def _build_document_list_statement(
    only_trashed: bool,
    include_trashed: bool,
    path_prefix: str | None,
):
    """Starts a Document select filtered by trash lifecycle and an optional logical path prefix."""
    statement = select(Document)
    if only_trashed:
        statement = statement.where(Document.status == DocumentStatus.TRASHED)
    elif not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    prefix = (path_prefix or "").strip()
    if prefix:
        statement = statement.where(Document.logical_path.ilike(f"{prefix}%"))
    return statement
def _collect_document_tree(session: Session, root_document_id: UUID) -> list[tuple[int, Document]]:
    """Collects a document and all descendants for recursive permanent deletion.

    Performs a breadth-first walk over parent_document_id links, then returns
    (depth, document) pairs sorted deepest-first so children can be deleted
    before their parents.

    Args:
        session: Active database session.
        root_document_id: Identifier of the tree root to collect.

    Returns:
        List of (depth, Document) tuples, deepest entries first.
    """
    # Index cursor instead of list.pop(0): popping from the front of a list
    # is O(n) per pop, making the original traversal accidentally quadratic.
    pending: list[tuple[UUID, int]] = [(root_document_id, 0)]
    cursor = 0
    visited: set[UUID] = set()
    collected: list[tuple[int, Document]] = []
    while cursor < len(pending):
        current_id, depth = pending[cursor]
        cursor += 1
        if current_id in visited:
            # Guard against cycles or duplicate parent links.
            continue
        visited.add(current_id)
        document = session.execute(select(Document).where(Document.id == current_id)).scalar_one_or_none()
        if document is None:
            continue
        collected.append((depth, document))
        child_ids = session.execute(
            select(Document.id).where(Document.parent_document_id == current_id)
        ).scalars().all()
        pending.extend((child_id, depth + 1) for child_id in child_ids)
    # Deepest-first ordering lets deletion proceed leaf-to-root.
    collected.sort(key=lambda item: item[0], reverse=True)
    return collected
@router.get("", response_model=DocumentsListResponse)
def list_documents(
    offset: int = Query(default=0, ge=0),
    limit: int = Query(default=50, ge=1, le=200),
    include_trashed: bool = Query(default=False),
    only_trashed: bool = Query(default=False),
    path_prefix: str | None = Query(default=None),
    path_filter: str | None = Query(default=None),
    tag_filter: str | None = Query(default=None),
    type_filter: str | None = Query(default=None),
    processed_from: str | None = Query(default=None),
    processed_to: str | None = Query(default=None),
    session: Session = Depends(get_session),
) -> DocumentsListResponse:
    """Returns a filtered, paginated page of documents, newest uploads first."""
    filtered = _apply_discovery_filters(
        _build_document_list_statement(
            only_trashed=only_trashed,
            include_trashed=include_trashed,
            path_prefix=path_prefix,
        ),
        path_filter=path_filter,
        tag_filter=tag_filter,
        type_filter=type_filter,
        processed_from=processed_from,
        processed_to=processed_to,
    )
    page_statement = filtered.order_by(Document.created_at.desc()).offset(offset).limit(limit)
    documents = session.execute(page_statement).scalars().all()
    # Total is counted over the filtered set, independent of pagination.
    total = session.execute(select(func.count()).select_from(filtered.subquery())).scalar_one()
    return DocumentsListResponse(
        total=total,
        items=[DocumentResponse.model_validate(document) for document in documents],
    )
@router.get("/tags")
def list_tags(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Returns the sorted union of assigned document tags and predefined tag settings."""
    statement = select(Document.tags)
    if not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    collected: set[str] = set()
    for row in session.execute(statement).scalars().all():
        for tag in row:
            if tag:
                collected.add(tag)
    # Merge in predefined tags configured via app settings.
    for item in read_predefined_tags_settings():
        predefined = str(item.get("value", "")).strip()
        if predefined:
            collected.add(predefined)
    return {"tags": sorted(collected)}
@router.get("/paths")
def list_paths(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Returns the sorted union of assigned logical paths and predefined path settings."""
    statement = select(Document.logical_path)
    if not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    collected: set[str] = set()
    for row in session.execute(statement).scalars().all():
        if row:
            collected.add(row)
    # Merge in predefined paths configured via app settings.
    for item in read_predefined_paths_settings():
        predefined = str(item.get("value", "")).strip()
        if predefined:
            collected.add(predefined)
    return {"paths": sorted(collected)}
@router.get("/types")
def list_types(
    include_trashed: bool = Query(default=False),
    session: Session = Depends(get_session),
) -> dict[str, list[str]]:
    """Returns distinct lowercase type values drawn from extension, MIME type, and image text type."""
    statement = select(Document.extension, Document.mime_type, Document.image_text_type)
    if not include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    values: set[str] = set()
    for row in session.execute(statement).all():
        for candidate in row:
            if not isinstance(candidate, str):
                continue
            normalized = candidate.strip().lower()
            if normalized:
                values.add(normalized)
    return {"types": sorted(values)}
@router.post("/content-md/export")
def export_contents_markdown(
    payload: ContentExportRequest,
    session: Session = Depends(get_session),
) -> StreamingResponse:
    """Exports extracted contents for selected documents as per-document markdown files in a ZIP archive."""
    wants_ids = bool(payload.document_ids)
    prefix = payload.path_prefix.strip() if payload.path_prefix else ""
    if not wants_ids and not prefix:
        raise HTTPException(status_code=400, detail="Provide document_ids or path_prefix for export")
    statement = select(Document)
    if wants_ids:
        statement = statement.where(Document.id.in_(payload.document_ids))
    if prefix:
        statement = statement.where(Document.logical_path.ilike(f"{prefix}%"))
    if payload.only_trashed:
        statement = statement.where(Document.status == DocumentStatus.TRASHED)
    elif not payload.include_trashed:
        statement = statement.where(Document.status != DocumentStatus.TRASHED)
    ordered = statement.order_by(Document.logical_path.asc(), Document.created_at.asc())
    documents = session.execute(ordered).scalars().all()
    if not documents:
        raise HTTPException(status_code=404, detail="No matching documents found for export")
    # Build the whole archive in memory, then stream it back.
    buffer = io.BytesIO()
    used_entries: set[str] = set()
    with zipfile.ZipFile(buffer, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for document in documents:
            archive.writestr(_zip_entry_name(document, used_entries), _markdown_for_document(document))
    buffer.seek(0)
    headers = {"Content-Disposition": 'attachment; filename="document-contents-md.zip"'}
    return StreamingResponse(buffer, media_type="application/zip", headers=headers)
@router.get("/{document_id}", response_model=DocumentDetailResponse)
def get_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentDetailResponse:
    """Returns one document by unique identifier, or 404 when it does not exist."""
    result = session.execute(select(Document).where(Document.id == document_id))
    document = result.scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    return DocumentDetailResponse.model_validate(document)
@router.get("/{document_id}/download")
def download_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Serves the stored original bytes as an attachment named after the original file."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    return FileResponse(
        path=absolute_path(document.stored_relative_path),
        filename=document.original_filename,
        media_type=document.mime_type,
    )
@router.get("/{document_id}/preview")
def preview_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Streams the stored original inline (no attachment disposition) for browser preview."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    stored = absolute_path(document.stored_relative_path)
    return FileResponse(path=stored, media_type=document.mime_type)
@router.get("/{document_id}/thumbnail")
def thumbnail_document(document_id: UUID, session: Session = Depends(get_session)) -> FileResponse:
    """Returns the generated thumbnail image, or 404 when it is missing from metadata or disk."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    relative = document.metadata_json.get("preview_relative_path")
    if not relative:
        raise HTTPException(status_code=404, detail="Thumbnail not available")
    thumbnail_path = absolute_path(relative)
    if not thumbnail_path.exists():
        raise HTTPException(status_code=404, detail="Thumbnail file not found")
    return FileResponse(path=thumbnail_path)
@router.get("/{document_id}/content-md")
def download_document_content_markdown(document_id: UUID, session: Session = Depends(get_session)) -> Response:
    """Downloads extracted content for one document as a markdown file.

    Raises:
        HTTPException: 404 when the document does not exist.
    """
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    markdown_content = _markdown_for_document(document)
    filename = _markdown_filename(document)
    # Bug fix: the computed per-document filename was previously discarded in
    # favor of a hard-coded "(unknown)" placeholder, so every export downloaded
    # under the same bogus name. Use the slugged, ASCII-safe filename instead.
    headers = {"Content-Disposition": f'attachment; filename="{filename}"'}
    return Response(content=markdown_content, media_type="text/markdown; charset=utf-8", headers=headers)
@router.post("/upload", response_model=UploadResponse)
async def upload_documents(
    files: Annotated[list[UploadFile], File(description="Files to upload")],
    relative_paths: Annotated[list[str] | None, Form()] = None,
    logical_path: Annotated[str, Form()] = "Inbox",
    tags: Annotated[str | None, Form()] = None,
    conflict_mode: Annotated[Literal["ask", "replace", "duplicate"], Form()] = "ask",
    session: Session = Depends(get_session),
) -> UploadResponse:
    """Uploads files, records metadata, and enqueues asynchronous extraction tasks.

    Runs in two phases: phase one reads and hashes every file and detects
    sha256 conflicts; phase two (skipped entirely when conflict_mode is "ask"
    and any conflict was found) stores the bytes, creates Document rows, and
    enqueues processing.

    Args:
        files: Uploaded multipart file parts.
        relative_paths: Optional per-file source paths, matched to files by index;
            files beyond the list length fall back to their bare filename.
        logical_path: Logical folder applied to every uploaded document.
        tags: Comma-separated tag string applied to every uploaded document.
        conflict_mode: "ask" reports duplicates without storing anything,
            "replace" records a replaces_document_id link to the existing
            document, "duplicate" stores a second copy regardless.
        session: Database session dependency.

    Returns:
        UploadResponse with the created documents, or with only conflicts when
        conflict_mode is "ask" and at least one duplicate hash was detected.
    """
    # Autocommit keeps processing-log rows durable independent of this request's transaction.
    set_processing_log_autocommit(session, True)
    normalized_tags = _normalize_tags(tags)
    queue = get_processing_queue()
    uploaded: list[DocumentResponse] = []
    conflicts: list[UploadConflict] = []
    indexed_relative_paths = relative_paths or []
    # Per-file payloads gathered in phase one, consumed in phase two.
    prepared_uploads: list[dict[str, object]] = []
    # Phase one: read, hash, log, and detect conflicts without persisting documents.
    for idx, file in enumerate(files):
        filename = file.filename or f"uploaded_{idx}"
        data = await file.read()
        sha256 = compute_sha256(data)
        # Fall back to the bare filename when no per-file source path was supplied.
        source_relative_path = indexed_relative_paths[idx] if idx < len(indexed_relative_paths) else filename
        extension = Path(filename).suffix.lower()
        detected_mime = sniff_mime(data)
        log_processing_event(
            session=session,
            stage="upload",
            event="Upload request received",
            level="info",
            document_filename=filename,
            payload_json={
                "source_relative_path": source_relative_path,
                "logical_path": logical_path,
                "tags": normalized_tags,
                "mime_type": detected_mime,
                "size_bytes": len(data),
                "conflict_mode": conflict_mode,
            },
        )
        prepared_uploads.append(
            {
                "filename": filename,
                "data": data,
                "sha256": sha256,
                "source_relative_path": source_relative_path,
                "extension": extension,
                "mime_type": detected_mime,
            }
        )
        # A document with the same content hash marks a potential duplicate.
        existing = session.execute(select(Document).where(Document.sha256 == sha256)).scalar_one_or_none()
        if existing and conflict_mode == "ask":
            log_processing_event(
                session=session,
                stage="upload",
                event="Upload conflict detected",
                level="warning",
                document_id=existing.id,
                document_filename=filename,
                payload_json={
                    "sha256": sha256,
                    "existing_document_id": str(existing.id),
                },
            )
            conflicts.append(
                UploadConflict(
                    original_filename=filename,
                    sha256=sha256,
                    existing_document_id=existing.id,
                )
            )
    # In "ask" mode any conflict aborts the whole batch: nothing is stored,
    # and the caller must resubmit with an explicit conflict_mode.
    if conflicts and conflict_mode == "ask":
        session.commit()
        return UploadResponse(uploaded=[], conflicts=conflicts)
    # Phase two: persist bytes, create Document rows, and queue extraction.
    for prepared in prepared_uploads:
        existing = session.execute(
            select(Document).where(Document.sha256 == str(prepared["sha256"]))
        ).scalar_one_or_none()
        # Only "replace" links the new row to the document it supersedes.
        replaces_document_id = existing.id if existing and conflict_mode == "replace" else None
        stored_relative_path = store_bytes(str(prepared["filename"]), bytes(prepared["data"]))
        document = Document(
            original_filename=str(prepared["filename"]),
            source_relative_path=str(prepared["source_relative_path"]),
            stored_relative_path=stored_relative_path,
            mime_type=str(prepared["mime_type"]),
            extension=str(prepared["extension"]),
            sha256=str(prepared["sha256"]),
            size_bytes=len(bytes(prepared["data"])),
            logical_path=logical_path,
            tags=list(normalized_tags),
            replaces_document_id=replaces_document_id,
            metadata_json={"upload": "web"},
        )
        session.add(document)
        # Flush assigns the document id needed for the queue payload and log entry.
        session.flush()
        queue.enqueue("app.worker.tasks.process_document_task", str(document.id))
        log_processing_event(
            session=session,
            stage="upload",
            event="Document record created and queued",
            level="info",
            document=document,
            payload_json={
                "source_relative_path": document.source_relative_path,
                "stored_relative_path": document.stored_relative_path,
                "logical_path": document.logical_path,
                "tags": list(document.tags),
                "replaces_document_id": str(replaces_document_id) if replaces_document_id is not None else None,
            },
        )
        uploaded.append(DocumentResponse.model_validate(document))
    session.commit()
    return UploadResponse(uploaded=uploaded, conflicts=conflicts)
@router.patch("/{document_id}", response_model=DocumentResponse)
def update_document(
    document_id: UUID,
    payload: DocumentUpdateRequest,
    session: Session = Depends(get_session),
) -> DocumentResponse:
    """Applies filename/path/tag edits and best-effort refreshes the semantic index."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if payload.original_filename is not None:
        document.original_filename = _sanitize_filename(payload.original_filename)
    if payload.logical_path is not None:
        trimmed = payload.logical_path.strip()
        document.logical_path = trimmed if trimmed else "Inbox"
    if payload.tags is not None:
        # Deduplicate while preserving order, capped at 50 tags.
        unique_tags: dict[str, None] = {}
        for raw_tag in payload.tags:
            tag = raw_tag.strip()
            if tag:
                unique_tags.setdefault(tag, None)
        document.tags = list(unique_tags)[:50]
    try:
        # Index refresh is best-effort; metadata updates must not fail on search errors.
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        pass
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)
@router.post("/{document_id}/trash", response_model=DocumentResponse)
def trash_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Moves a document to the trash, remembering its prior status for later restore."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status != DocumentStatus.TRASHED:
        # Record the current status so restore can bring it back.
        updated_metadata = dict(document.metadata_json)
        updated_metadata["status_before_trash"] = document.status.value
        document.metadata_json = updated_metadata
        document.status = DocumentStatus.TRASHED
        try:
            # Index refresh is best-effort and must not block trashing.
            upsert_document_index(document=document, summary_text=_summary_for_index(document))
        except Exception:
            pass
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)
@router.post("/{document_id}/restore", response_model=DocumentResponse)
def restore_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Restores a trashed document to the status it held before trashing."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status == DocumentStatus.TRASHED:
        # When no pre-trash status was recorded, infer from whether processing finished.
        if document.processed_at:
            fallback = DocumentStatus.PROCESSED
        else:
            fallback = DocumentStatus.QUEUED
        document.status = _resolve_previous_status(document.metadata_json, fallback)
        cleaned_metadata = dict(document.metadata_json)
        cleaned_metadata.pop("status_before_trash", None)
        document.metadata_json = cleaned_metadata
        try:
            # Index refresh is best-effort and must not block restoring.
            upsert_document_index(document=document, summary_text=_summary_for_index(document))
        except Exception:
            pass
    session.commit()
    session.refresh(document)
    return DocumentResponse.model_validate(document)
@router.delete("/{document_id}")
def delete_document(document_id: UUID, session: Session = Depends(get_session)) -> dict[str, int]:
    """Permanently deletes a trashed document tree plus stored originals and thumbnails."""
    root = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if root is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if root.status != DocumentStatus.TRASHED:
        raise HTTPException(status_code=400, detail="Move document to trash before permanent deletion")
    document_tree = _collect_document_tree(session=session, root_document_id=document_id)
    all_ids = [str(document.id) for _, document in document_tree]
    # External index/style cleanups are best-effort; deletion proceeds regardless.
    for cleanup in (delete_many_documents_index, delete_many_handwriting_style_documents):
        try:
            cleanup(all_ids)
        except Exception:
            pass
    removed_files = 0
    for _, document in document_tree:
        stored = absolute_path(document.stored_relative_path)
        if stored.exists() and stored.is_file():
            stored.unlink(missing_ok=True)
            removed_files += 1
        preview_relative = document.metadata_json.get("preview_relative_path")
        if isinstance(preview_relative, str):
            preview_path = absolute_path(preview_relative)
            if preview_path.exists() and preview_path.is_file():
                preview_path.unlink(missing_ok=True)
        session.delete(document)
    session.commit()
    return {"deleted_documents": len(document_tree), "deleted_files": removed_files}
@router.post("/{document_id}/reprocess", response_model=DocumentResponse)
def reprocess_document(document_id: UUID, session: Session = Depends(get_session)) -> DocumentResponse:
    """Resets a non-trashed document to queued and re-enqueues the extraction task."""
    document = session.execute(select(Document).where(Document.id == document_id)).scalar_one_or_none()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")
    if document.status == DocumentStatus.TRASHED:
        raise HTTPException(status_code=400, detail="Restore document before reprocessing")
    queue = get_processing_queue()
    document.status = DocumentStatus.QUEUED
    try:
        # Index refresh is best-effort and must not block re-queuing.
        upsert_document_index(document=document, summary_text=_summary_for_index(document))
    except Exception:
        pass
    session.commit()
    # Enqueue only after the status change is durably committed.
    queue.enqueue("app.worker.tasks.process_document_task", str(document.id))
    session.refresh(document)
    return DocumentResponse.model_validate(document)