"""Data model representing one persisted processing pipeline log entry."""

import re
import uuid
from datetime import UTC, datetime
from typing import Any

from sqlalchemy import BigInteger, DateTime, ForeignKey, String, Text
from sqlalchemy.dialects.postgresql import JSONB, UUID
from sqlalchemy.orm import Mapped, mapped_column, validates

from app.core.config import get_settings
from app.db.base import Base


settings = get_settings()

# Substrings that flag a payload key as credential-bearing. Matching is done
# on the stripped, lower-cased key, so any key *containing* one of these is
# treated as sensitive.
SENSITIVE_KEY_MARKERS = (
    "api_key",
    "apikey",
    "authorization",
    "bearer",
    "token",
    "secret",
    "password",
    "credential",
    "cookie",
)

# Patterns for credential-looking segments embedded in free text. They are
# applied one after another, in this order, to the same string.
SENSITIVE_TEXT_PATTERNS = (
    # "authorization: bearer <token>" / "authorization=bearer <token>"
    re.compile(r"(?i)\bauthorization\b\s*[:=]\s*bearer\s+[a-z0-9._~+/\-]+=*"),
    # bare "bearer <token>"
    re.compile(r"(?i)\bbearer\s+[a-z0-9._~+/\-]+=*"),
    # three dot-separated segments of 8+ word/dash characters (the shape of a JWT)
    re.compile(r"\b[a-z0-9_-]{8,}\.[a-z0-9_-]{8,}\.[a-z0-9_-]{8,}\b", flags=re.IGNORECASE),
    # "sk-" followed by 16+ alphanumerics
    re.compile(r"(?i)\bsk-[a-z0-9]{16,}\b"),
    # "api_key=...", "token: ...", "secret=...", "password: ..." assignments
    re.compile(r"(?i)\b(api[_-]?key|token|secret|password)\b\s*[:=]\s*['\"]?[^\s,'\";]+['\"]?"),
)

# Replacement text substituted for every redacted value or text segment.
REDACTED_TEXT = "[REDACTED]"

# Upper bounds on how many dict entries / sequence items are kept per level.
MAX_PAYLOAD_KEYS = 80
MAX_PAYLOAD_LIST_ITEMS = 80


def _truncate(value: str, limit: int) -> str:
    """Strips a log field and clamps it to at most ``limit`` characters.

    Args:
        value: Text to normalize; leading and trailing whitespace is removed
            before the length is measured.
        limit: Maximum number of characters in the returned string.

    Returns:
        The stripped text unchanged when it already fits. Otherwise the text
        cut so that, together with a trailing ``"..."`` marker, it is exactly
        ``limit`` characters long. When ``limit`` is too small to hold the
        marker, the text is hard-cut to ``limit`` characters without a marker,
        and a non-positive ``limit`` yields an empty string.
    """
    normalized = value.strip()
    if len(normalized) <= limit:
        return normalized

    suffix = "..."
    if limit <= 0:
        return ""
    if limit < len(suffix):
        # The marker alone would exceed the bound, so cut without it rather
        # than return a string longer than the caller asked for.
        return normalized[:limit]
    return normalized[: limit - len(suffix)] + suffix
def _is_sensitive_key(key: str) -> bool:
    """Returns whether a payload key likely contains sensitive credential data.

    The key is stripped and lower-cased, then tested for any entry of
    ``SENSITIVE_KEY_MARKERS`` as a substring, so keys such as ``"X-Api_Key"``
    or ``"refresh_token"`` are both reported as sensitive.
    """
    normalized = key.strip().lower()
    return any(marker in normalized for marker in SENSITIVE_KEY_MARKERS)


def _redact_sensitive_text(value: str) -> str:
    """Redacts token-like segments from log text while retaining other context.

    Every pattern in ``SENSITIVE_TEXT_PATTERNS`` is applied in order and each
    match is replaced with ``REDACTED_TEXT``.
    """
    redacted = value
    for pattern in SENSITIVE_TEXT_PATTERNS:
        # A callable replacement inserts REDACTED_TEXT verbatim, bypassing the
        # backslash/group-reference processing applied to string templates.
        redacted = pattern.sub(lambda _match: REDACTED_TEXT, redacted)
    return redacted


def sanitize_processing_log_payload_value(value: Any, *, parent_key: str | None = None) -> Any:
    """Sanitizes payload structures by redacting sensitive fields and bounding size.

    Args:
        value: Arbitrary payload value (mapping, sequence, scalar or object).
        parent_key: Name of the dict key this value was stored under, if any.
            When the key looks sensitive the whole value is replaced.

    Returns:
        * ``REDACTED_TEXT`` when ``parent_key`` is a sensitive key;
        * a new dict with stringified keys, limited to the first
          ``MAX_PAYLOAD_KEYS`` entries, each value sanitized recursively;
        * a new list for lists and tuples, limited to the first
          ``MAX_PAYLOAD_LIST_ITEMS`` items, each sanitized recursively;
        * ``None``, ints, floats and bools unchanged;
        * for strings and any other object, its text form redacted and then
          truncated to ``settings.processing_log_max_payload_chars``.
    """
    if parent_key and _is_sensitive_key(parent_key):
        return REDACTED_TEXT

    if isinstance(value, dict):
        sanitized: dict[str, Any] = {}
        for index, (raw_key, raw_value) in enumerate(value.items()):
            if index >= MAX_PAYLOAD_KEYS:
                break
            key = str(raw_key)
            sanitized[key] = sanitize_processing_log_payload_value(raw_value, parent_key=key)
        return sanitized

    if isinstance(value, (list, tuple)):
        # Tuples are emitted as lists; items inherit the enclosing key.
        return [
            sanitize_processing_log_payload_value(item, parent_key=parent_key)
            for item in value[:MAX_PAYLOAD_LIST_ITEMS]
        ]

    if isinstance(value, str):
        text = value
    elif value is None or isinstance(value, (int, float)):
        # bool is a subclass of int, so it is covered here as well.
        return value
    else:
        text = str(value)

    # Redact BEFORE truncating: cutting first can shorten a credential below a
    # pattern's minimum length and leave its prefix unredacted in the log.
    return _truncate(_redact_sensitive_text(text), settings.processing_log_max_payload_chars)


def sanitize_processing_log_text(value: str | None) -> str | None:
    """Sanitizes prompt and response fields by redacting credentials and clamping length.

    Args:
        value: Raw text, or ``None``.

    Returns:
        ``None`` when the input is ``None`` or blank after stripping;
        otherwise the stripped text with sensitive segments redacted and the
        result truncated to ``settings.processing_log_max_text_chars``.
    """
    if value is None:
        return None
    normalized = value.strip()
    if not normalized:
        return None
    redacted = _redact_sensitive_text(normalized)
    return _truncate(redacted, settings.processing_log_max_text_chars)
class ProcessingLogEntry(Base):
    """Persisted record of a single processing event.

    Each row carries a timezone-aware creation timestamp, a level, the stage
    and event names, optional document and model references, optional prompt
    and response text, and a JSONB payload. Text fields and the payload are
    passed through the module's sanitizers by the validators below.
    """

    __tablename__ = "processing_logs"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)

    # Creation time; defaults to the current UTC time on the Python side.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(UTC),
    )

    # Event classification.
    level: Mapped[str] = mapped_column(String(16), nullable=False, default="info")
    stage: Mapped[str] = mapped_column(String(64), nullable=False)
    event: Mapped[str] = mapped_column(String(256), nullable=False)

    # Optional link to documents.id; the reference is set to NULL when the
    # referenced row is deleted.
    document_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )
    document_filename: Mapped[str | None] = mapped_column(String(512), nullable=True)

    # Optional provider / model identification.
    provider_id: Mapped[str | None] = mapped_column(String(128), nullable=True)
    model_name: Mapped[str | None] = mapped_column(String(256), nullable=True)

    # Optional free-text prompt and response.
    prompt_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    response_text: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Structured payload; defaults to an empty dict.
    payload_json: Mapped[dict] = mapped_column(JSONB, nullable=False, default=dict)

    @validates("prompt_text", "response_text")
    def _validate_text_fields(self, key: str, value: str | None) -> str | None:
        """Returns the sanitized form of a prompt/response value.

        Delegates to ``sanitize_processing_log_text``; ``key`` is the name of
        the attribute being set and is not used.
        """
        cleaned = sanitize_processing_log_text(value)
        return cleaned

    @validates("payload_json")
    def _validate_payload_json(self, key: str, value: dict[str, Any] | None) -> dict[str, Any]:
        """Returns the sanitized payload, or an empty dict for non-dict input.

        Dict values are passed to ``sanitize_processing_log_payload_value``;
        anything else (including ``None``) is replaced by ``{}``.
        """
        if isinstance(value, dict):
            return sanitize_processing_log_payload_value(value)
        return {}