155 lines
5.6 KiB
Python
155 lines
5.6 KiB
Python
"""Data model representing one persisted processing pipeline log entry."""
|
|
|
|
import uuid
|
|
from datetime import UTC, datetime
|
|
import re
|
|
from typing import Any
|
|
|
|
from sqlalchemy import BigInteger, DateTime, ForeignKey, String, Text
|
|
from sqlalchemy.dialects.postgresql import JSONB, UUID
|
|
from sqlalchemy.orm import Mapped, mapped_column, validates
|
|
|
|
from app.core.config import get_settings
|
|
from app.db.base import Base
|
|
|
|
|
|
# Application settings, resolved once when this module is imported. The
# sanitizers below read their character limits from it
# (processing_log_max_payload_chars / processing_log_max_text_chars).
settings = get_settings()


# Lower-case substrings checked against payload keys by _is_sensitive_key();
# a key containing any of them has its whole value replaced, not inspected.
SENSITIVE_KEY_MARKERS = (
    "api_key",
    "apikey",
    "authorization",
    "bearer",
    "token",
    "secret",
    "password",
    "credential",
    "cookie",
)

# Patterns applied in this order by _redact_sensitive_text(); every match is
# replaced with REDACTED_TEXT. Order matters because later patterns run on
# the already-redacted output of earlier ones.
SENSITIVE_TEXT_PATTERNS = (
    # Quoted key/value pair: "authorization": "Bearer <value>".
    re.compile(r"(?i)[\"']authorization[\"']\s*:\s*[\"']bearer\s+[^\"']+[\"']"),
    # Quoted key/value pair whose key is literally "bearer".
    re.compile(r"(?i)[\"']bearer[\"']\s*:\s*[\"'][^\"']+[\"']"),
    # Quoted key/value pair for api key / token / secret / password keys.
    re.compile(r"(?i)[\"'](?:api[_-]?key|token|secret|password)[\"']\s*:\s*[\"'][^\"']+[\"']"),
    # Unquoted header form: authorization: bearer <value> (":" or "=").
    re.compile(r"(?i)\bauthorization\b\s*[:=]\s*bearer\s+[a-z0-9._~+/\-]+=*"),
    # Bare "bearer <value>" anywhere in the text.
    re.compile(r"(?i)\bbearer\s+[a-z0-9._~+/\-]+=*"),
    # Three dot-separated segments of 8+ [a-z0-9_-] characters each
    # (presumably aimed at JWT-shaped tokens -- confirm intent).
    re.compile(r"\b[a-z0-9_-]{8,}\.[a-z0-9_-]{8,}\.[a-z0-9_-]{8,}\b", flags=re.IGNORECASE),
    # "sk-" followed by 16+ alphanumerics.
    re.compile(r"(?i)\bsk-[a-z0-9]{16,}\b"),
    # Unquoted assignment form: api_key=..., token: ..., secret=..., password=...
    re.compile(r"(?i)\b(api[_-]?key|token|secret|password)\b\s*[:=]\s*['\"]?[^\s,'\";]+['\"]?"),
)

# Replacement text written wherever a sensitive value or match is removed.
REDACTED_TEXT = "[REDACTED]"

# Upper bounds on how many dict entries / sequence items of a payload are kept
# by sanitize_processing_log_payload_value(); the rest are dropped.
MAX_PAYLOAD_KEYS = 80
MAX_PAYLOAD_LIST_ITEMS = 80
|
|
|
|
|
|
def _truncate(value: str, limit: int) -> str:
|
|
"""Truncates long log fields to configured bounds with stable suffix marker."""
|
|
|
|
normalized = value.strip()
|
|
if len(normalized) <= limit:
|
|
return normalized
|
|
return normalized[: max(0, limit - 3)] + "..."
|
|
|
|
|
|
def _is_sensitive_key(key: str) -> bool:
    """Reports whether a payload key name looks like it holds credential data.

    The key is stripped and lower-cased, then tested for containing any entry
    of ``SENSITIVE_KEY_MARKERS`` as a substring.
    """

    lowered = key.strip().lower()
    for marker in SENSITIVE_KEY_MARKERS:
        if marker in lowered:
            return True
    return False
|
|
|
|
|
|
def _redact_sensitive_text(value: str) -> str:
    """Replaces credential-looking segments of ``value`` with ``REDACTED_TEXT``.

    Every pattern in ``SENSITIVE_TEXT_PATTERNS`` is applied in order, each one
    operating on the output of the previous substitution. Text that matches
    no pattern is returned unchanged.
    """

    def _mask(_match: "re.Match[str]") -> str:
        # A callable replacement keeps the marker literal (no escape handling).
        return REDACTED_TEXT

    scrubbed = value
    for matcher in SENSITIVE_TEXT_PATTERNS:
        scrubbed = matcher.sub(_mask, scrubbed)
    return scrubbed
|
|
|
|
|
|
def sanitize_processing_log_payload_value(value: Any, *, parent_key: str | None = None) -> Any:
    """Sanitizes payload structures by redacting sensitive fields and bounding size.

    Args:
        value: Payload value to sanitize; may be arbitrarily nested.
        parent_key: Name of the dict key this value was stored under, if any.
            Items of a list or tuple inherit the key of the sequence.

    Returns:
        * ``REDACTED_TEXT`` when ``parent_key`` is judged sensitive,
          regardless of the value's type.
        * For a dict: a new dict with keys coerced to ``str`` and values
          sanitized recursively; only the first ``MAX_PAYLOAD_KEYS`` entries
          (iteration order) are kept.
        * For a list or tuple: a new list of the first
          ``MAX_PAYLOAD_LIST_ITEMS`` items, sanitized recursively.
        * ``int``, ``float``, ``bool`` and ``None`` unchanged.
        * For anything else (strings, and other objects via ``str()``): the
          text with credentials redacted, then truncated to
          ``settings.processing_log_max_payload_chars``.
    """

    if parent_key and _is_sensitive_key(parent_key):
        return REDACTED_TEXT

    if isinstance(value, dict):
        sanitized: dict[str, Any] = {}
        for index, (raw_key, raw_value) in enumerate(value.items()):
            if index >= MAX_PAYLOAD_KEYS:
                break
            key = str(raw_key)
            sanitized[key] = sanitize_processing_log_payload_value(raw_value, parent_key=key)
        return sanitized

    if isinstance(value, (list, tuple)):
        return [
            sanitize_processing_log_payload_value(item, parent_key=parent_key)
            for item in value[:MAX_PAYLOAD_LIST_ITEMS]
        ]

    if isinstance(value, (int, float, bool)) or value is None:
        return value

    # Redact BEFORE truncating for every textual form. Truncating first can
    # cut a credential so that it no longer matches any pattern (leaking the
    # surviving fragment), and redacting afterwards can grow the text past
    # the configured limit.
    text = value if isinstance(value, str) else str(value)
    return _truncate(_redact_sensitive_text(text), settings.processing_log_max_payload_chars)
|
|
|
|
|
|
def sanitize_processing_log_text(value: str | None) -> str | None:
    """Cleans a prompt/response text field before it is stored.

    Returns ``None`` when ``value`` is ``None`` or contains only whitespace.
    Otherwise the stripped text is passed through credential redaction and
    then clamped to ``settings.processing_log_max_text_chars`` characters.
    """

    stripped = value.strip() if value is not None else ""
    if not stripped:
        return None
    return _truncate(_redact_sensitive_text(stripped), settings.processing_log_max_text_chars)
|
|
|
|
|
|
class ProcessingLogEntry(Base):
    """ORM row for one timestamped processing event.

    Free-text fields (``prompt_text``, ``response_text``) and the structured
    ``payload_json`` are passed through the module's sanitizers whenever they
    are assigned, so values are redacted and bounded before being persisted.
    """

    __tablename__ = "processing_logs"

    # Auto-incrementing surrogate primary key.
    id: Mapped[int] = mapped_column(
        BigInteger,
        primary_key=True,
        autoincrement=True,
    )
    # Timezone-aware timestamp; defaults to the current UTC time.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        default=lambda: datetime.now(UTC),
    )
    # Event classification columns.
    level: Mapped[str] = mapped_column(
        String(16),
        nullable=False,
        default="info",
    )
    stage: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
    )
    event: Mapped[str] = mapped_column(
        String(256),
        nullable=False,
    )
    # Optional link to documents.id; set to NULL when that row is deleted.
    document_id: Mapped[uuid.UUID | None] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )
    document_filename: Mapped[str | None] = mapped_column(
        String(512),
        nullable=True,
    )
    # Optional provider / model identifiers for the event.
    provider_id: Mapped[str | None] = mapped_column(
        String(128),
        nullable=True,
    )
    model_name: Mapped[str | None] = mapped_column(
        String(256),
        nullable=True,
    )
    # Optional free text; sanitized by _validate_text_fields on assignment.
    prompt_text: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
    )
    response_text: Mapped[str | None] = mapped_column(
        Text,
        nullable=True,
    )
    # Structured payload; sanitized by _validate_payload_json on assignment.
    payload_json: Mapped[dict] = mapped_column(
        JSONB,
        nullable=False,
        default=dict,
    )

    @validates("prompt_text", "response_text")
    def _validate_text_fields(self, key: str, value: str | None) -> str | None:
        """Returns the redacted, length-bounded form of an assigned text field."""

        cleaned = sanitize_processing_log_text(value)
        return cleaned

    @validates("payload_json")
    def _validate_payload_json(self, key: str, value: dict[str, Any] | None) -> dict[str, Any]:
        """Returns the sanitized payload, or an empty dict for non-dict input."""

        if isinstance(value, dict):
            return sanitize_processing_log_payload_value(value)
        return {}
|