Files
ledgerdock/backend/app/models/processing_log.py

152 lines
5.4 KiB
Python

"""Data model representing one persisted processing pipeline log entry."""
import uuid
from datetime import UTC, datetime
import re
from typing import Any
from sqlalchemy import BigInteger, DateTime, ForeignKey, String, Text
from sqlalchemy.dialects.postgresql import JSONB, UUID
from sqlalchemy.orm import Mapped, mapped_column, validates
from app.core.config import get_settings
from app.db.base import Base
# Application settings, looked up once when this module is imported. The
# sanitizers below read `processing_log_max_payload_chars` and
# `processing_log_max_text_chars` from it.
settings = get_settings()
# Lowercase fragments that mark a payload key as sensitive. They are matched
# as substrings of the stripped, lowercased key, so e.g. "max_tokens" also
# matches through "token".
SENSITIVE_KEY_MARKERS = (
"api_key",
"apikey",
"authorization",
"bearer",
"token",
"secret",
"password",
"credential",
"cookie",
)
# Regexes for credential-looking fragments inside free text. They are applied
# one after another, in this order, and every match is replaced by
# REDACTED_TEXT.
SENSITIVE_TEXT_PATTERNS = (
# "authorization: bearer <value>" / "authorization=bearer <value>".
re.compile(r"(?i)\bauthorization\b\s*[:=]\s*bearer\s+[a-z0-9._~+/\-]+=*"),
# A bare "bearer <value>".
re.compile(r"(?i)\bbearer\s+[a-z0-9._~+/\-]+=*"),
# Three dot-separated runs of 8+ [a-z0-9_-] characters (the shape of,
# e.g., a JWT).
re.compile(r"\b[a-z0-9_-]{8,}\.[a-z0-9_-]{8,}\.[a-z0-9_-]{8,}\b", flags=re.IGNORECASE),
# "sk-" followed by at least 16 alphanumeric characters.
re.compile(r"(?i)\bsk-[a-z0-9]{16,}\b"),
# key/value assignments such as `api_key=...`, `token: ...`, `secret="..."`,
# `password='...'`; the whole assignment, key included, is matched.
re.compile(r"(?i)\b(api[_-]?key|token|secret|password)\b\s*[:=]\s*['\"]?[^\s,'\";]+['\"]?"),
)
# Placeholder substituted for every redacted value or text fragment.
REDACTED_TEXT = "[REDACTED]"
# Maximum number of entries kept from any single dict in a payload.
MAX_PAYLOAD_KEYS = 80
# Maximum number of items kept from any single list or tuple in a payload.
MAX_PAYLOAD_LIST_ITEMS = 80
def _truncate(value: str, limit: int) -> str:
"""Truncates long log fields to configured bounds with stable suffix marker."""
normalized = value.strip()
if len(normalized) <= limit:
return normalized
return normalized[: max(0, limit - 3)] + "..."
def _is_sensitive_key(key: str) -> bool:
    """Report whether a payload key name looks like it holds a credential.

    The key is stripped and lowercased, then checked for any entry of
    ``SENSITIVE_KEY_MARKERS`` as a substring. Because the match is
    substring-based, a key such as ``max_tokens`` is also flagged.
    """
    lowered = key.strip().lower()
    for marker in SENSITIVE_KEY_MARKERS:
        if marker in lowered:
            return True
    return False
def _redact_sensitive_text(value: str) -> str:
    """Replace every credential-looking fragment of ``value`` with the redaction marker.

    Each pattern in ``SENSITIVE_TEXT_PATTERNS`` is applied in order to the
    output of the previous one; text that matches no pattern is left as is.
    """

    def _mask(_match: re.Match[str]) -> str:
        # A callable replacement means the marker is inserted literally,
        # with no backslash or group-reference processing.
        return REDACTED_TEXT

    scrubbed = value
    for matcher in SENSITIVE_TEXT_PATTERNS:
        scrubbed = matcher.sub(_mask, scrubbed)
    return scrubbed
def sanitize_processing_log_payload_value(value: Any, *, parent_key: str | None = None) -> Any:
    """Return a redacted, size-bounded copy of a log payload value.

    The value is walked recursively:

    * Anything stored under a sensitive key (see ``_is_sensitive_key``) is
      replaced wholesale by ``REDACTED_TEXT``, whatever its type.
    * Dicts keep their first ``MAX_PAYLOAD_KEYS`` entries; keys are converted
      with ``str`` and each value is sanitized under its own key.
    * Lists and tuples keep their first ``MAX_PAYLOAD_LIST_ITEMS`` items and
      are returned as lists; items inherit the container's ``parent_key``.
    * Strings are redacted and then truncated to
      ``settings.processing_log_max_payload_chars``.
    * ``int``, ``float``, ``bool`` and ``None`` pass through unchanged.
    * Any other object is converted with ``str`` and treated like a string.

    Args:
        value: Payload value to sanitize.
        parent_key: Key under which ``value`` is stored, if any.

    Returns:
        The sanitized value, built from dicts, lists, strings and the scalar
        types listed above.
    """
    if parent_key and _is_sensitive_key(parent_key):
        return REDACTED_TEXT

    if isinstance(value, dict):
        sanitized: dict[str, Any] = {}
        for index, (raw_key, raw_value) in enumerate(value.items()):
            if index >= MAX_PAYLOAD_KEYS:
                break
            key = str(raw_key)
            sanitized[key] = sanitize_processing_log_payload_value(raw_value, parent_key=key)
        return sanitized

    if isinstance(value, (list, tuple)):
        return [
            sanitize_processing_log_payload_value(item, parent_key=parent_key)
            for item in value[:MAX_PAYLOAD_LIST_ITEMS]
        ]

    if isinstance(value, (int, float, bool)) or value is None:
        return value

    # Redact BEFORE truncating, for strings and stringified objects alike.
    # Truncating first can cut a credential short so that its pattern no
    # longer matches, and redacting afterwards can lengthen the text past
    # the configured bound.
    text = value if isinstance(value, str) else str(value)
    return _truncate(_redact_sensitive_text(text), settings.processing_log_max_payload_chars)
def sanitize_processing_log_text(value: str | None) -> str | None:
    """Redact and clamp a free-text log field such as a prompt or response.

    Returns ``None`` when ``value`` is ``None`` or contains only whitespace.
    Otherwise the stripped text is redacted and then truncated to
    ``settings.processing_log_max_text_chars``.
    """
    stripped = value.strip() if value is not None else ""
    if not stripped:
        return None
    return _truncate(_redact_sensitive_text(stripped), settings.processing_log_max_text_chars)
class ProcessingLogEntry(Base):
    """ORM row for one processing event in the ``processing_logs`` table.

    Free-text fields (``prompt_text``, ``response_text``) and the structured
    ``payload_json`` are passed through the module's sanitizers by the
    validators below, so credentials are redacted and sizes are bounded.
    """

    __tablename__ = "processing_logs"

    # Auto-incrementing surrogate key.
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    # Timezone-aware creation time; defaults to the current UTC time.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(UTC),
    )
    # Level label of the entry; "info" unless set explicitly.
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="info")
    # Required stage and event identifiers for the entry.
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    event: Mapped[str] = mapped_column(String(256), nullable=False)
    # Optional link to documents.id; set to NULL when that row is deleted.
    document_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )
    document_filename: Mapped[str | None] = mapped_column(String(512), nullable=True)
    provider_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    model_name: Mapped[str | None] = mapped_column(String(256), nullable=True)
    # Optional free text, sanitized by _validate_text_fields.
    prompt_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    response_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Structured details, sanitized by _validate_payload_json; defaults to {}.
    payload_json: Mapped[dict] = mapped_column(JSONB, nullable=False, default=dict)

    @validates("prompt_text", "response_text")
    def _validate_text_fields(self, key: str, value: str | None) -> str | None:
        """Return the sanitized form of a prompt/response value being set."""
        cleaned = sanitize_processing_log_text(value)
        return cleaned

    @validates("payload_json")
    def _validate_payload_json(self, key: str, value: dict[str, Any] | None) -> dict[str, Any]:
        """Return the sanitized payload, or an empty dict for any non-dict value."""
        return sanitize_processing_log_payload_value(value) if isinstance(value, dict) else {}