Harden security controls from REPORT findings

2026-03-01 13:32:08 -03:00
parent da5cbc2c01
commit bdd97d1c62
20 changed files with 1455 additions and 97 deletions

View File

@@ -3,16 +3,17 @@
 from redis import Redis
 from rq import Queue
-from app.core.config import get_settings
+from app.core.config import get_settings, validate_redis_url_security
 settings = get_settings()
 def get_redis() -> Redis:
-    """Creates a Redis connection from configured URL."""
+    """Creates a Redis connection after enforcing URL security policy checks."""
-    return Redis.from_url(settings.redis_url)
+    secure_redis_url = validate_redis_url_security(settings.redis_url)
+    return Redis.from_url(secure_redis_url)
 def get_processing_queue() -> Queue:
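
The change relies on `validate_redis_url_security`, which is imported from `app.core.config` but whose body is not part of this commit view. Below is a minimal sketch of what such a check could look like, assuming the policy is to allow only known schemes and to refuse unauthenticated plaintext connections to remote hosts; the names and rules here are illustrative, not the project's actual implementation:

    from urllib.parse import urlparse

    def validate_redis_url_security(redis_url: str) -> str:
        """Illustrative stand-in for the helper imported from app.core.config."""
        parsed = urlparse(redis_url)
        # Accept only URL schemes the Redis client understands.
        if parsed.scheme not in {"redis", "rediss", "unix"}:
            raise ValueError(f"unsupported Redis URL scheme: {parsed.scheme!r}")
        local_hosts = {"localhost", "127.0.0.1", "::1"}
        # Treat a plaintext redis:// URL to a remote host without credentials as insecure.
        if parsed.scheme == "redis" and parsed.hostname not in local_hosts and not parsed.password:
            raise ValueError("remote plaintext Redis URL must carry credentials or use rediss://")
        return redis_url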

View File

@@ -7,6 +7,7 @@ from pathlib import Path
 from sqlalchemy import select
 from sqlalchemy.orm import Session
+from app.core.config import get_settings
 from app.db.base import SessionLocal
 from app.models.document import Document, DocumentStatus
 from app.services.app_settings import (
@@ -37,6 +38,13 @@ from app.services.storage import absolute_path, compute_sha256, store_bytes, wri
 from app.worker.queue import get_processing_queue
+settings = get_settings()
+ARCHIVE_ROOT_ID_METADATA_KEY = "archive_root_document_id"
+ARCHIVE_DEPTH_METADATA_KEY = "archive_depth"
+ARCHIVE_DESCENDANT_COUNT_METADATA_KEY = "archive_descendant_count"
 def _cleanup_processing_logs_with_settings(session: Session) -> None:
     """Applies configured processing log retention while trimming old log entries."""
@@ -48,13 +56,80 @@ def _cleanup_processing_logs_with_settings(session: Session) -> None:
     )
+def _metadata_non_negative_int(value: object, fallback: int = 0) -> int:
+    """Parses metadata values as non-negative integers with safe fallback behavior."""
+    try:
+        parsed = int(value)
+    except (TypeError, ValueError):
+        return fallback
+    return max(0, parsed)
+def _metadata_uuid(value: object) -> uuid.UUID | None:
+    """Parses metadata values as UUIDs while tolerating malformed legacy values."""
+    if not isinstance(value, str) or not value.strip():
+        return None
+    try:
+        return uuid.UUID(value.strip())
+    except ValueError:
+        return None
+def _resolve_archive_lineage(session: Session, document: Document) -> tuple[uuid.UUID, int]:
+    """Resolves archive root document id and depth for metadata propagation compatibility."""
+    metadata_json = dict(document.metadata_json)
+    metadata_root = _metadata_uuid(metadata_json.get(ARCHIVE_ROOT_ID_METADATA_KEY))
+    metadata_depth = _metadata_non_negative_int(metadata_json.get(ARCHIVE_DEPTH_METADATA_KEY), fallback=0)
+    if metadata_root is not None:
+        return metadata_root, metadata_depth
+    if not document.is_archive_member:
+        return document.id, 0
+    depth = 0
+    root_document_id = document.id
+    parent_document_id = document.parent_document_id
+    visited: set[uuid.UUID] = {document.id}
+    while parent_document_id is not None and parent_document_id not in visited:
+        visited.add(parent_document_id)
+        parent_document = session.execute(select(Document).where(Document.id == parent_document_id)).scalar_one_or_none()
+        if parent_document is None:
+            break
+        depth += 1
+        root_document_id = parent_document.id
+        parent_document_id = parent_document.parent_document_id
+    return root_document_id, depth
+def _merge_archive_metadata(document: Document, **updates: object) -> None:
+    """Applies archive metadata updates while preserving unrelated document metadata keys."""
+    metadata_json = dict(document.metadata_json)
+    metadata_json.update(updates)
+    document.metadata_json = metadata_json
+def _load_archive_root_for_update(session: Session, root_document_id: uuid.UUID) -> Document | None:
+    """Loads archive root row with write lock to serialize descendant-count budget updates."""
+    return session.execute(
+        select(Document).where(Document.id == root_document_id).with_for_update()
+    ).scalar_one_or_none()
 def _create_archive_member_document(
     parent: Document,
     member_name: str,
     member_data: bytes,
     mime_type: str,
+    archive_root_document_id: uuid.UUID,
+    archive_depth: int,
 ) -> Document:
-    """Creates a child document entity for a file extracted from an uploaded archive."""
+    """Creates child document entities with lineage metadata for recursive archive processing."""
     extension = Path(member_name).suffix.lower()
     stored_relative_path = store_bytes(member_name, member_data)
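
`_load_archive_root_for_update` takes a row-level lock on the archive root so that concurrent workers extracting members of the same archive serialize their descendant-count updates instead of both reading a stale counter. A small self-contained sketch, assuming a PostgreSQL backend (the project's database is not visible in this diff), showing the FOR UPDATE clause that `with_for_update()` adds:

    from sqlalchemy import Column, String, select
    from sqlalchemy.dialects import postgresql
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class Doc(Base):
        # Stand-in for the project's Document model; only the primary key matters here.
        __tablename__ = "documents"
        id = Column(String, primary_key=True)

    # The second worker selecting the same archive root blocks until the first
    # transaction commits its updated descendant count.
    stmt = select(Doc).where(Doc.id == "root-id").with_for_update()
    print(stmt.compile(dialect=postgresql.dialect()))
    # SELECT documents.id FROM documents WHERE documents.id = %(id_1)s FOR UPDATE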
@@ -68,7 +143,12 @@ def _create_archive_member_document(
         size_bytes=len(member_data),
         logical_path=parent.logical_path,
         tags=list(parent.tags),
-        metadata_json={"origin": "archive", "parent": str(parent.id)},
+        metadata_json={
+            "origin": "archive",
+            "parent": str(parent.id),
+            ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
+            ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
+        },
         is_archive_member=True,
         archived_member_path=member_name,
         parent_document_id=parent.id,
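
For illustration, a member pulled directly out of a root archive (depth 1) would now carry lineage keys along these lines; the UUID is invented for the example. For a first-level member the `parent` and the archive root coincide, while deeper members keep the root id as `parent` changes:

    # Hypothetical metadata_json for a first-level archive member (depth 1);
    # the UUID value is made up for illustration.
    {
        "origin": "archive",
        "parent": "2d1c6a9e-7b34-4f9c-9b1a-0c5e8f1d2a3b",
        "archive_root_document_id": "2d1c6a9e-7b34-4f9c-9b1a-0c5e8f1d2a3b",
        "archive_depth": 1,
    }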
@@ -110,16 +190,46 @@ def process_document_task(document_id: str) -> None:
         if document.extension == ".zip":
             child_ids: list[str] = []
+            archive_root_document_id, archive_depth = _resolve_archive_lineage(session=session, document=document)
+            _merge_archive_metadata(
+                document,
+                **{
+                    ARCHIVE_ROOT_ID_METADATA_KEY: str(archive_root_document_id),
+                    ARCHIVE_DEPTH_METADATA_KEY: archive_depth,
+                },
+            )
+            root_document = _load_archive_root_for_update(session=session, root_document_id=archive_root_document_id)
+            if root_document is None:
+                root_document = document
+            root_metadata_json = dict(root_document.metadata_json)
+            existing_descendant_count = _metadata_non_negative_int(
+                root_metadata_json.get(ARCHIVE_DESCENDANT_COUNT_METADATA_KEY),
+                fallback=0,
+            )
+            max_descendants_per_root = max(0, int(settings.max_zip_descendants_per_root))
+            remaining_descendant_budget = max(0, max_descendants_per_root - existing_descendant_count)
+            extraction_member_cap = remaining_descendant_budget
             log_processing_event(
                 session=session,
                 stage="archive",
                 event="Archive extraction started",
                 level="info",
                 document=document,
-                payload_json={"size_bytes": len(data)},
+                payload_json={
+                    "size_bytes": len(data),
+                    "archive_root_document_id": str(archive_root_document_id),
+                    "archive_depth": archive_depth,
+                    "remaining_descendant_budget": remaining_descendant_budget,
+                },
             )
             try:
-                members = extract_archive_members(data)
+                members = extract_archive_members(
+                    data,
+                    depth=archive_depth,
+                    max_members=extraction_member_cap,
+                )
                 for member in members:
                     mime_type = sniff_mime(member.data)
                     child = _create_archive_member_document(
@@ -127,6 +237,8 @@ def process_document_task(document_id: str) -> None:
                         member_name=member.name,
                         member_data=member.data,
                         mime_type=mime_type,
+                        archive_root_document_id=archive_root_document_id,
+                        archive_depth=archive_depth + 1,
                     )
                     session.add(child)
                     session.flush()
@@ -142,8 +254,27 @@ def process_document_task(document_id: str) -> None:
"member_name": member.name,
"member_size_bytes": len(member.data),
"mime_type": mime_type,
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth + 1,
},
)
updated_root_metadata = dict(root_document.metadata_json)
updated_root_metadata[ARCHIVE_ROOT_ID_METADATA_KEY] = str(archive_root_document_id)
updated_root_metadata[ARCHIVE_DEPTH_METADATA_KEY] = 0
updated_root_metadata[ARCHIVE_DESCENDANT_COUNT_METADATA_KEY] = existing_descendant_count + len(child_ids)
root_document.metadata_json = updated_root_metadata
limit_flags: dict[str, object] = {}
if archive_depth >= settings.max_zip_depth:
limit_flags["max_depth_reached"] = True
if remaining_descendant_budget <= 0:
limit_flags["max_descendants_reached"] = True
elif len(child_ids) >= remaining_descendant_budget:
limit_flags["max_descendants_reached"] = True
if limit_flags:
_merge_archive_metadata(document, **limit_flags)
document.status = DocumentStatus.PROCESSED
document.extracted_text = f"archive with {len(members)} files"
log_processing_event(
@@ -152,7 +283,13 @@ def process_document_task(document_id: str) -> None:
event="Archive extraction completed",
level="info",
document=document,
payload_json={"member_count": len(members)},
payload_json={
"member_count": len(members),
"archive_root_document_id": str(archive_root_document_id),
"archive_depth": archive_depth,
"descendant_count": existing_descendant_count + len(child_ids),
"remaining_descendant_budget": max(0, remaining_descendant_budget - len(child_ids)),
},
)
except Exception as exc:
document.status = DocumentStatus.ERROR
@@ -231,7 +368,10 @@ def process_document_task(document_id: str) -> None:
event="Archive child job enqueued",
level="info",
document_id=uuid.UUID(child_id),
payload_json={"parent_document_id": str(document.id)},
payload_json={
"parent_document_id": str(document.id),
"archive_root_document_id": str(archive_root_document_id),
},
)
session.commit()
return
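
Taken together, the budget bookkeeping works roughly as follows; a worked example assuming `max_zip_descendants_per_root` is configured as 100 (the actual configured value is not visible in this diff):

    # Worked example of the per-root descendant budget; the limit of 100 is assumed,
    # not taken from the project's configuration.
    max_descendants_per_root = 100
    existing_descendant_count = 97  # counter already stored on the archive root's metadata
    remaining_descendant_budget = max(0, max_descendants_per_root - existing_descendant_count)  # 3

    # extract_archive_members() is capped at 3 members, so a nested zip with 10 entries
    # yields only 3 new children here.
    child_ids = ["c1", "c2", "c3"]

    # The root's counter advances to the cap and the limit flag is recorded on the nested archive.
    new_descendant_count = existing_descendant_count + len(child_ids)        # 100
    max_descendants_reached = len(child_ids) >= remaining_descendant_budget  # True
    print(new_descendant_count, max_descendants_reached)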