Files
ledgerdock/backend/app/services/processing_logs.py
2026-02-21 09:44:18 -03:00

193 lines
6.8 KiB
Python

"""Persistence helpers for writing and querying processing pipeline log events."""
from typing import Any
from uuid import UUID
from sqlalchemy import delete, func, select
from sqlalchemy.orm import Session
from app.models.document import Document
from app.models.processing_log import ProcessingLogEntry
# Per-column character caps enforced by _trim() before persistence, matching
# the ProcessingLogEntry column sizes.
MAX_STAGE_LENGTH = 64
MAX_EVENT_LENGTH = 256
MAX_LEVEL_LENGTH = 16
MAX_PROVIDER_LENGTH = 128
MAX_MODEL_LENGTH = 256
MAX_DOCUMENT_FILENAME_LENGTH = 512
# Generous caps for prompt/response transcripts (presumably model I/O, given
# the provider/model fields — confirm against ProcessingLogEntry schema).
MAX_PROMPT_LENGTH = 200000
MAX_RESPONSE_LENGTH = 200000
# Retention defaults consumed by cleanup_processing_logs().
DEFAULT_KEEP_DOCUMENT_SESSIONS = 2
DEFAULT_KEEP_UNBOUND_ENTRIES = 80
# Session.info key that toggles immediate commit after each logged event.
PROCESSING_LOG_AUTOCOMMIT_SESSION_KEY = "processing_log_autocommit"
def _trim(value: str | None, max_length: int) -> str | None:
"""Normalizes and truncates text values for safe log persistence."""
if value is None:
return None
normalized = value.strip()
if not normalized:
return None
if len(normalized) <= max_length:
return normalized
return normalized[: max_length - 3] + "..."
def _safe_payload(payload_json: dict[str, Any] | None) -> dict[str, Any]:
"""Ensures payload values are persisted as dictionaries."""
return payload_json if isinstance(payload_json, dict) else {}
def set_processing_log_autocommit(session: Session, enabled: bool) -> None:
    """Toggles per-session immediate commit behavior for processing log events."""
    # Stored in Session.info so the flag travels with this session only.
    flag = bool(enabled)
    session.info[PROCESSING_LOG_AUTOCOMMIT_SESSION_KEY] = flag
def is_processing_log_autocommit_enabled(session: Session) -> bool:
    """Returns whether processing logs are committed immediately for the current session."""
    # Defaults to False when the flag was never set for this session.
    stored = session.info.get(PROCESSING_LOG_AUTOCOMMIT_SESSION_KEY, False)
    return bool(stored)
def log_processing_event(
    session: Session,
    stage: str,
    event: str,
    *,
    level: str = "info",
    document: Document | None = None,
    document_id: UUID | None = None,
    document_filename: str | None = None,
    provider_id: str | None = None,
    model_name: str | None = None,
    prompt_text: str | None = None,
    response_text: str | None = None,
    payload_json: dict[str, Any] | None = None,
) -> None:
    """Persists one processing log entry linked to an optional document context.

    When *document* is provided, its id and original filename take precedence
    over *document_id*/*document_filename*. All text fields are normalized and
    truncated via ``_trim``; the entry is committed immediately when the
    per-session autocommit flag is enabled.
    """
    if document is not None:
        target_id = document.id
        target_filename = document.original_filename
    else:
        target_id = document_id
        target_filename = document_filename
    session.add(
        ProcessingLogEntry(
            level=_trim(level, MAX_LEVEL_LENGTH) or "info",
            stage=_trim(stage, MAX_STAGE_LENGTH) or "pipeline",
            event=_trim(event, MAX_EVENT_LENGTH) or "event",
            document_id=target_id,
            document_filename=_trim(target_filename, MAX_DOCUMENT_FILENAME_LENGTH),
            provider_id=_trim(provider_id, MAX_PROVIDER_LENGTH),
            model_name=_trim(model_name, MAX_MODEL_LENGTH),
            prompt_text=_trim(prompt_text, MAX_PROMPT_LENGTH),
            response_text=_trim(response_text, MAX_RESPONSE_LENGTH),
            payload_json=_safe_payload(payload_json),
        )
    )
    if is_processing_log_autocommit_enabled(session):
        session.commit()
def count_processing_logs(session: Session, document_id: UUID | None = None) -> int:
    """Counts persisted processing logs, optionally restricted to one document."""
    query = select(func.count()).select_from(ProcessingLogEntry)
    if document_id is not None:
        query = query.where(ProcessingLogEntry.document_id == document_id)
    total = session.execute(query).scalar_one()
    return int(total)
def list_processing_logs(
    session: Session,
    *,
    limit: int,
    offset: int,
    document_id: UUID | None = None,
) -> list[ProcessingLogEntry]:
    """Lists processing logs ordered by newest-first with optional document filter.

    Args:
        session: Active database session.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip (pagination).
        document_id: When given, restricts results to that document's entries.

    Returns:
        Entries ordered by ``created_at`` descending, with ``id`` descending
        as a deterministic tie-breaker for equal timestamps.
    """
    statement = select(ProcessingLogEntry)
    if document_id is not None:
        statement = statement.where(ProcessingLogEntry.document_id == document_id)
    statement = (
        statement.order_by(
            ProcessingLogEntry.created_at.desc(),
            ProcessingLogEntry.id.desc(),
        )
        .offset(offset)
        .limit(limit)
    )
    # ScalarResult.all() is typed as Sequence in SQLAlchemy 2.x; materialize a
    # real list so the declared return type holds under type checking.
    return list(session.execute(statement).scalars())
def cleanup_processing_logs(
    session: Session,
    *,
    keep_document_sessions: int = DEFAULT_KEEP_DOCUMENT_SESSIONS,
    keep_unbound_entries: int = DEFAULT_KEEP_UNBOUND_ENTRIES,
) -> dict[str, int]:
    """Deletes old log entries while keeping recent document sessions and unbound events.

    Retains every entry of the ``keep_document_sessions`` most recently active
    documents plus the ``keep_unbound_entries`` newest entries that have no
    document. Negative arguments are treated as zero (keep nothing).
    """
    sessions_to_keep = max(0, keep_document_sessions)
    unbound_to_keep = max(0, keep_unbound_entries)

    # Documents ranked by the timestamp of their most recent log entry.
    recent_rows = session.execute(
        select(
            ProcessingLogEntry.document_id,
            func.max(ProcessingLogEntry.created_at).label("last_seen"),
        )
        .where(ProcessingLogEntry.document_id.is_not(None))
        .group_by(ProcessingLogEntry.document_id)
        .order_by(func.max(ProcessingLogEntry.created_at).desc())
        .limit(sessions_to_keep)
    ).all()
    retained_document_ids = [doc_id for doc_id, _ in recent_rows if doc_id is not None]

    # Delete document-bound entries outside the retained set; when nothing is
    # retained, drop every document-bound entry (notin_ with an empty list is
    # avoided deliberately).
    doc_stmt = delete(ProcessingLogEntry).where(ProcessingLogEntry.document_id.is_not(None))
    if retained_document_ids:
        doc_stmt = doc_stmt.where(ProcessingLogEntry.document_id.notin_(retained_document_ids))
    removed_document_entries = int(session.execute(doc_stmt).rowcount or 0)

    # Newest document-less entries to preserve, tie-broken by id.
    retained_unbound_ids = [
        entry_id
        for (entry_id,) in session.execute(
            select(ProcessingLogEntry.id)
            .where(ProcessingLogEntry.document_id.is_(None))
            .order_by(ProcessingLogEntry.created_at.desc(), ProcessingLogEntry.id.desc())
            .limit(unbound_to_keep)
        ).all()
    ]
    unbound_stmt = delete(ProcessingLogEntry).where(ProcessingLogEntry.document_id.is_(None))
    if retained_unbound_ids:
        unbound_stmt = unbound_stmt.where(ProcessingLogEntry.id.notin_(retained_unbound_ids))
    removed_unbound_entries = int(session.execute(unbound_stmt).rowcount or 0)

    return {
        "deleted_document_entries": removed_document_entries,
        "deleted_unbound_entries": removed_unbound_entries,
    }
def clear_processing_logs(session: Session) -> dict[str, int]:
    """Deletes all persisted processing log entries and returns deletion count."""
    result = session.execute(delete(ProcessingLogEntry))
    removed = int(result.rowcount or 0)
    return {"deleted_entries": removed}