Harden security controls from REPORT findings
This commit is contained in:
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.db.base import SessionLocal
|
||||
from app.models.document import Document, DocumentStatus
|
||||
from app.services.app_settings import (
|
||||
@@ -37,6 +38,13 @@ from app.services.storage import absolute_path, compute_sha256, store_bytes, wri
|
||||
from app.worker.queue import get_processing_queue
|
||||
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
ARCHIVE_ROOT_ID_METADATA_KEY = "archive_root_document_id"
|
||||
ARCHIVE_DEPTH_METADATA_KEY = "archive_depth"
|
||||
ARCHIVE_DESCENDANT_COUNT_METADATA_KEY = "archive_descendant_count"
|
||||
|
||||
|
||||
def _cleanup_processing_logs_with_settings(session: Session) -> None:
|
||||
"""Applies configured processing log retention while trimming old log entries."""
|
||||
|
||||
@@ -48,13 +56,80 @@ def _cleanup_processing_logs_with_settings(session: Session) -> None:
|
||||
)
|
||||
|
||||
|
||||
def _metadata_non_negative_int(value: object, fallback: int = 0) -> int:
|
||||
"""Parses metadata values as non-negative integers with safe fallback behavior."""
|
||||
|
||||
try:
|
||||
parsed = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return fallback
|
||||
return max(0, parsed)
|
||||
|
||||
|
||||
def _metadata_uuid(value: object) -> uuid.UUID | None:
|
||||
"""Parses metadata values as UUIDs while tolerating malformed legacy values."""
|
||||
|
||||
if not isinstance(value, str) or not value.strip():
|
||||
return None
|
||||
try:
|
||||
return uuid.UUID(value.strip())
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_archive_lineage(session: Session, document: Document) -> tuple[uuid.UUID, int]:
    """Determine the archive root document id and nesting depth for *document*.

    Lineage already recorded in the document's own metadata wins; otherwise
    the parent chain is walked upward (with cycle protection) to find the
    topmost reachable ancestor. A document that is not an archive member is
    its own root at depth zero.
    """
    meta = dict(document.metadata_json)
    recorded_root = _metadata_uuid(meta.get(ARCHIVE_ROOT_ID_METADATA_KEY))
    if recorded_root is not None:
        recorded_depth = _metadata_non_negative_int(
            meta.get(ARCHIVE_DEPTH_METADATA_KEY), fallback=0
        )
        return recorded_root, recorded_depth

    if not document.is_archive_member:
        return document.id, 0

    # Walk toward the root, guarding against self/circular parent links.
    seen: set[uuid.UUID] = {document.id}
    root_id = document.id
    hops = 0
    next_parent_id = document.parent_document_id
    while next_parent_id is not None and next_parent_id not in seen:
        seen.add(next_parent_id)
        parent_row = session.execute(
            select(Document).where(Document.id == next_parent_id)
        ).scalar_one_or_none()
        if parent_row is None:
            # Dangling parent reference: stop at the last known ancestor.
            break
        hops += 1
        root_id = parent_row.id
        next_parent_id = parent_row.parent_document_id

    return root_id, hops
|
||||
|
||||
|
||||
def _merge_archive_metadata(document: Document, **updates: object) -> None:
    """Merge *updates* into the document's metadata, keeping unrelated keys.

    A brand-new dict is assigned (instead of mutating in place) so the ORM
    reliably detects the attribute change.
    """
    document.metadata_json = {**dict(document.metadata_json), **updates}
|
||||
|
||||
|
||||
def _load_archive_root_for_update(session: Session, root_document_id: uuid.UUID) -> Document | None:
    """Fetch the archive root row under ``SELECT ... FOR UPDATE``.

    The row lock serializes concurrent updates to the root's descendant-count
    budget across workers. Returns ``None`` if the root row no longer exists.
    """
    locked_query = (
        select(Document)
        .where(Document.id == root_document_id)
        .with_for_update()
    )
    return session.execute(locked_query).scalar_one_or_none()
|
||||
|
||||
|
||||
def _create_archive_member_document(
|
||||
parent: Document,
|
||||
member_name: str,
|
||||
member_data: bytes,
|
||||
mime_type: str,
|
||||
archive_root_document_id: uuid.UUID,
|
||||
archive_depth: int,
|
||||
) -> Document:
|
||||
"""Creates a child document entity for a file extracted from an uploaded archive."""
|
||||
"""Creates child document entities with lineage metadata for recursive archive processing."""
|
||||
|
||||
extension = Path(member_name).suffix.lower()
|
||||
stored_relative_path = store_bytes(member_name, member_data)
|
||||
@@ -68,7 +143,12 @@ def _create_archive_member_document(
|
||||
size_bytes=len(member_data),
|
||||
logical_path=parent.logical_path,
|
||||
tags=list(parent.tags),
|
||||
metadata_json={"origin": "archive", "parent": str(parent.id)},
|
||||
metadata_json={
|
||||
"origin": "archive",
|
||||
"parent": str(parent.id),
|
||||
ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
|
||||
ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
|
||||
},
|
||||
is_archive_member=True,
|
||||
archived_member_path=member_name,
|
||||
parent_document_id=parent.id,
|
||||
@@ -110,16 +190,46 @@ def process_document_task(document_id: str) -> None:
|
||||
|
||||
if document.extension == ".zip":
|
||||
child_ids: list[str] = []
|
||||
archive_root_document_id, archive_depth = _resolve_archive_lineage(session=session, document=document)
|
||||
_merge_archive_metadata(
|
||||
document,
|
||||
**{
|
||||
ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
|
||||
ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
|
||||
},
|
||||
)
|
||||
root_document = _load_archive_root_for_update(session=session, root_document_id=archive_root_document_id)
|
||||
if root_document is None:
|
||||
root_document = document
|
||||
|
||||
root_metadata_json = dict(root_document.metadata_json)
|
||||
existing_descendant_count = _metadata_non_negative_int(
|
||||
root_metadata_json.get(ARCHIVE_DESCENDANT_COUNT_METADATA_KEY),
|
||||
fallback=0,
|
||||
)
|
||||
max_descendants_per_root = max(0, int(settings.max_zip_descendants_per_root))
|
||||
remaining_descendant_budget = max(0, max_descendants_per_root - existing_descendant_count)
|
||||
extraction_member_cap = remaining_descendant_budget
|
||||
|
||||
log_processing_event(
|
||||
session=session,
|
||||
stage="archive",
|
||||
event="Archive extraction started",
|
||||
level="info",
|
||||
document=document,
|
||||
payload_json={"size_bytes": len(data)},
|
||||
payload_json={
|
||||
"size_bytes": len(data),
|
||||
"archive_root_document_id": str(archive_root_document_id),
|
||||
"archive_depth": archive_depth,
|
||||
"remaining_descendant_budget": remaining_descendant_budget,
|
||||
},
|
||||
)
|
||||
try:
|
||||
members = extract_archive_members(data)
|
||||
members = extract_archive_members(
|
||||
data,
|
||||
depth=archive_depth,
|
||||
max_members=extraction_member_cap,
|
||||
)
|
||||
for member in members:
|
||||
mime_type = sniff_mime(member.data)
|
||||
child = _create_archive_member_document(
|
||||
@@ -127,6 +237,8 @@ def process_document_task(document_id: str) -> None:
|
||||
member_name=member.name,
|
||||
member_data=member.data,
|
||||
mime_type=mime_type,
|
||||
archive_root_document_id=archive_root_document_id,
|
||||
archive_depth=archive_depth + 1,
|
||||
)
|
||||
session.add(child)
|
||||
session.flush()
|
||||
@@ -142,8 +254,27 @@ def process_document_task(document_id: str) -> None:
|
||||
"member_name": member.name,
|
||||
"member_size_bytes": len(member.data),
|
||||
"mime_type": mime_type,
|
||||
"archive_root_document_id": str(archive_root_document_id),
|
||||
"archive_depth": archive_depth + 1,
|
||||
},
|
||||
)
|
||||
|
||||
updated_root_metadata = dict(root_document.metadata_json)
|
||||
updated_root_metadata[ARCHIVE_ROOT_ID_METADATA_KEY] = str(archive_root_document_id)
|
||||
updated_root_metadata[ARCHIVE_DEPTH_METADATA_KEY] = 0
|
||||
updated_root_metadata[ARCHIVE_DESCENDANT_COUNT_METADATA_KEY] = existing_descendant_count + len(child_ids)
|
||||
root_document.metadata_json = updated_root_metadata
|
||||
|
||||
limit_flags: dict[str, object] = {}
|
||||
if archive_depth >= settings.max_zip_depth:
|
||||
limit_flags["max_depth_reached"] = True
|
||||
if remaining_descendant_budget <= 0:
|
||||
limit_flags["max_descendants_reached"] = True
|
||||
elif len(child_ids) >= remaining_descendant_budget:
|
||||
limit_flags["max_descendants_reached"] = True
|
||||
if limit_flags:
|
||||
_merge_archive_metadata(document, **limit_flags)
|
||||
|
||||
document.status = DocumentStatus.PROCESSED
|
||||
document.extracted_text = f"archive with {len(members)} files"
|
||||
log_processing_event(
|
||||
@@ -152,7 +283,13 @@ def process_document_task(document_id: str) -> None:
|
||||
event="Archive extraction completed",
|
||||
level="info",
|
||||
document=document,
|
||||
payload_json={"member_count": len(members)},
|
||||
payload_json={
|
||||
"member_count": len(members),
|
||||
"archive_root_document_id": str(archive_root_document_id),
|
||||
"archive_depth": archive_depth,
|
||||
"descendant_count": existing_descendant_count + len(child_ids),
|
||||
"remaining_descendant_budget": max(0, remaining_descendant_budget - len(child_ids)),
|
||||
},
|
||||
)
|
||||
except Exception as exc:
|
||||
document.status = DocumentStatus.ERROR
|
||||
@@ -231,7 +368,10 @@ def process_document_task(document_id: str) -> None:
|
||||
event="Archive child job enqueued",
|
||||
level="info",
|
||||
document_id=uuid.UUID(child_id),
|
||||
payload_json={"parent_document_id": str(document.id)},
|
||||
payload_json={
|
||||
"parent_document_id": str(document.id),
|
||||
"archive_root_document_id": str(archive_root_document_id),
|
||||
},
|
||||
)
|
||||
session.commit()
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user