Adrian Flores Cortes 3def230d58 Initial commit: local-llm-agent infrastructure project

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

2026-02-02 16:42:45 -06:00

17 KiB

Raw Blame History

Modelo de Datos y Definiciones de Objetos - Local LLM Agent

Version: 1.0.0 Fecha: 2026-01-20 Proyecto: local-llm-agent

1. VISION GENERAL

Este documento define los objetos de datos, interfaces y estructuras que componen el sistema Local LLM Agent.

1.1 Diagrama de Componentes

┌─────────────────────────────────────────────────────────────────────┐
│                         API GATEWAY (NestJS)                        │
│                                                                     │
│  ┌─────────────────────────────────────────────────────────────┐   │
│  │                    Request Pipeline                          │   │
│  │                                                              │   │
│  │  [Request] → [Validation] → [TierClassifier] → [Router]     │   │
│  │                                                     │        │   │
│  └─────────────────────────────────────────────────────┼────────┘   │
│                                                        │            │
│  ┌──────────────────┐  ┌──────────────────┐  ┌────────┴────────┐   │
│  │   ChatModule     │  │   ModelsModule   │  │  MCPToolsModule │   │
│  │                  │  │                  │  │                 │   │
│  │ ChatController   │  │ ModelsController │  │ MCPController   │   │
│  │ ChatService      │  │ ModelsService    │  │ MCPService      │   │
│  │ TierService      │  │                  │  │ ToolsRegistry   │   │
│  └────────┬─────────┘  └────────┬─────────┘  └────────┬────────┘   │
│           │                     │                     │            │
└───────────┼─────────────────────┼─────────────────────┼────────────┘
            │                     │                     │
            └─────────────────────┼─────────────────────┘
                                  │
                         [InferenceClient]
                                  │
                                  ▼
┌─────────────────────────────────────────────────────────────────────┐
│                    INFERENCE ENGINE (Python)                        │
│                                                                     │
│  ┌─────────────────────────────────────────────────────────────┐   │
│  │                    Backend Manager                           │   │
│  │                                                              │   │
│  │  [BackendFactory] → [OllamaBackend | VLLMBackend]           │   │
│  └─────────────────────────────────────────────────────────────┘   │
│                                  │                                  │
│  ┌──────────────────────────────┼──────────────────────────────┐   │
│  │                         Routes                               │   │
│  │                                                              │   │
│  │  [/v1/chat/completions]  [/v1/models]  [/health]            │   │
│  └──────────────────────────────────────────────────────────────┘   │
└─────────────────────────────────────────────────────────────────────┘

2. OBJETOS DE DOMINIO

2.1 Chat Completion

ChatMessage

/**
 * A single message in a conversation.
 */
interface ChatMessage {
  /** Role of the message sender */
  role: "system" | "user" | "assistant";

  /** Text content of the message */
  content: string;

  /** Optional name of the sender */
  name?: string;
}

ChatCompletionRequest

/**
 * Request to create a chat completion.
 * Compatible with the OpenAI API.
 */
interface ChatCompletionRequest {
  /** Identifier of the model to use */
  model: string;

  /** List of conversation messages */
  messages: ChatMessage[];

  /** Maximum number of tokens to generate (default: 512) */
  max_tokens?: number;

  /** Sampling temperature, 0.0-2.0 (default: 0.7) */
  temperature?: number;

  /** Top-p sampling, 0.0-1.0 (default: 0.9) */
  top_p?: number;

  /** Whether to return the response as a stream (default: false) */
  stream?: boolean;

  /** Forced tier (optional, normally auto-detected) */
  x_tier?: "small" | "main";
}

/**
 * Validation limits for the fields of ChatCompletionRequest.
 *
 * The values follow the request-validation rules of this document:
 * `model` must be present, non-empty and at most 100 characters;
 * `messages` must hold between 1 and 100 items; the numeric fields are
 * bounded by the inclusive `min`/`max` given here.
 *
 * Declared `as const` so the limits keep their literal types and are
 * read-only at the type level.
 */
const ChatCompletionRequestConstraints = {
  // minLength: 1 encodes the "non-empty" rule, which `required` alone
  // does not express (an empty string is still a present value).
  model: { required: true, minLength: 1, maxLength: 100 },
  messages: { required: true, minItems: 1, maxItems: 100 },
  max_tokens: { min: 1, max: 4096 },
  temperature: { min: 0.0, max: 2.0 },
  top_p: { min: 0.0, max: 1.0 },
} as const;

ChatCompletionResponse

/**
 * Chat completion response.
 * Compatible with the OpenAI API.
 */
interface ChatCompletionResponse {
  /** Unique ID of the completion */
  id: string;

  /** Object type (always the literal "chat.completion") */
  object: "chat.completion";

  /** Creation timestamp (Unix) */
  created: number;

  /** Model used */
  model: string;

  /** List of generated choices */
  choices: ChatCompletionChoice[];

  /** Token usage statistics */
  usage: TokenUsage;
}

/**
 * One generated choice inside a ChatCompletionResponse.
 */
interface ChatCompletionChoice {
  /** Index of the choice */
  index: number;

  /** Generated message */
  message: ChatMessage;

  /** Reason why generation finished */
  finish_reason: "stop" | "length" | "content_filter";
}

/**
 * Token usage statistics reported with a completion.
 */
interface TokenUsage {
  /** Tokens in the prompt */
  prompt_tokens: number;

  /** Tokens generated */
  completion_tokens: number;

  /** Total number of tokens */
  total_tokens: number;
}

2.2 Models

Model

/**
 * A model available for inference.
 */
interface Model {
  /** Unique identifier of the model */
  id: string;

  /** Object type (always the literal "model") */
  object: "model";

  /** Creation timestamp */
  created: number;

  /** Owner of the model */
  owned_by: string;

  /**
   * Permissions (empty for local models).
   * Typed as the empty tuple `[]`, so only an empty array is assignable.
   */
  permission: [];

  /** Root model */
  root: string;

  /** Parent model (null if this is a base model) */
  parent: string | null;
}

/**
 * Response wrapper holding a list of models.
 */
interface ModelsListResponse {
  /** Object type (always the literal "list") */
  object: "list";
  /** The models being listed */
  data: Model[];
}

2.3 MCP Tools

MCPTool

/**
 * Definition of an MCP tool.
 *
 * NOTE(review): JSONSchema7 is not declared in this document — presumably it
 * comes from the json-schema type definitions; confirm the import.
 */
interface MCPTool {
  /** Unique name of the tool */
  name: string;

  /** Human-readable description */
  description: string;

  /** JSON Schema of the parameters */
  parameters: JSONSchema7;

  /** Preferred tier for this tool */
  preferred_tier: "small" | "main";

  /** Version of the tool */
  version: string;
}

/**
 * Response wrapper holding a list of MCP tool definitions.
 */
interface MCPToolsListResponse {
  /** The tool definitions being listed */
  tools: MCPTool[];
}

MCPToolRequest

/**
 * Generic request to execute an MCP tool.
 */
interface MCPToolRequest {
  /** Name of the tool */
  tool: string;

  /** Parameters, shaped according to the tool's schema */
  parameters: Record<string, unknown>;

  /** Optional additional context */
  context?: string;
}

Herramientas Especificas

// Classify Tool
/** Input of the classify tool. */
interface ClassifyRequest {
  /** Input text */
  text: string;
  /** Candidate categories (presumably the result is one of these — confirm) */
  categories: string[];
  /** Optional additional context */
  context?: string;
}

/** Output of the classify tool. */
interface ClassifyResponse {
  /** Category assigned to the text */
  category: string;
  /** Confidence score; range not specified here (presumably 0-1 — confirm) */
  confidence: number;
  /** Optional explanation of the result */
  reasoning?: string;
}

// Extract Tool
/** Input of the extract tool. */
interface ExtractRequest {
  /** Input text */
  text: string;
  /** Description of the fields to extract */
  schema: {
    fields: Array<{
      /** Field name */
      name: string;
      /** Expected value type of the field */
      type: "string" | "number" | "date" | "boolean" | "array";
      /** Description of the field */
      description: string;
      /** Whether the field is required (optional flag) */
      required?: boolean;
    }>;
  };
}

/** Output of the extract tool. */
interface ExtractResponse {
  /** Extracted values (presumably keyed by field name — confirm) */
  data: Record<string, unknown>;
  /** Confidence score; range not specified here (presumably 0-1 — confirm) */
  confidence: number;
  /** Optional list of fields that are missing from the result */
  missing_fields?: string[];
}

// Summarize Tool
/** Input of the summarize tool. */
interface SummarizeRequest {
  /** Input text */
  text: string;
  /** Optional maximum length of the summary (unit not specified here — TODO confirm) */
  max_length?: number;
  /** Optional output format */
  format?: "paragraph" | "bullets";
}

/** Output of the summarize tool. */
interface SummarizeResponse {
  /** The generated summary */
  summary: string;
  /** Word count (presumably of the summary — confirm) */
  word_count: number;
  /** Optional list of key points */
  key_points?: string[];
}

// Rewrite Tool
/** Input of the rewrite tool. */
interface RewriteRequest {
  /** Input text */
  text: string;
  /** Target writing style */
  style: "formal" | "casual" | "technical" | "simple";
  /** Optional flag asking to preserve the length of the text */
  preserve_length?: boolean;
}

/** Output of the rewrite tool. */
interface RewriteResponse {
  /** The rewritten text */
  rewritten: string;
  /** Number of changes made (how a change is counted is not specified here) */
  changes_made: number;
}

2.4 System Objects

Health Status

/**
 * Health status of the system.
 */
interface HealthStatus {
  /** Overall status */
  status: "healthy" | "degraded" | "unhealthy";

  /** Timestamp of the check (string; presumably ISO 8601 — confirm) */
  timestamp: string;

  /** Version of the service */
  version: string;

  /** Status of the dependencies (redis is optional) */
  dependencies: {
    inference_engine: DependencyStatus;
    ollama: DependencyStatus;
    redis?: DependencyStatus;
  };

  /** System metrics (optional) */
  metrics?: {
    uptime_seconds: number;
    requests_total: number;
    requests_failed: number;
  };
}

/** Status of a single dependency, as used in HealthStatus.dependencies. */
type DependencyStatus = "up" | "down" | "degraded";

TierConfig

/**
 * Limits that apply to one processing tier.
 */
interface TierLimits {
  /** Maximum context size, in tokens */
  max_context: number;

  /** Maximum number of output tokens */
  max_tokens: number;

  /** Target latency, in milliseconds */
  latency_target_ms: number;
}

/**
 * Processing-tier configuration: one set of limits per tier.
 */
interface TierConfig {
  /** Small tier (defaults: 4096 context, 512 output tokens, 500 ms) */
  small: TierLimits;

  /** Main tier (defaults: 16384 context, 2048 output tokens, 2000 ms) */
  main: TierLimits;
}

// Default limits for each tier
const DEFAULT_TIER_CONFIG: TierConfig = {
  small: { max_context: 4_096, max_tokens: 512, latency_target_ms: 500 },
  main: { max_context: 16_384, max_tokens: 2_048, latency_target_ms: 2_000 },
};

Error Response

/**
 * Standardized error response.
 * Compatible with the OpenAI format.
 */
interface ErrorResponse {
  error: {
    /** Error code */
    code: string;

    /** Descriptive message */
    message: string;

    /** Error type */
    type: "invalid_request_error" | "authentication_error" | "rate_limit_error" | "server_error";

    /** Parameter that caused the error (if applicable) */
    param?: string;
  };
}

// Error codes (string enum; presumably the values carried in ErrorResponse.error.code — confirm)
enum ErrorCodes {
  INVALID_REQUEST = "invalid_request",
  MODEL_NOT_FOUND = "model_not_found",
  CONTEXT_TOO_LONG = "context_length_exceeded",
  INFERENCE_TIMEOUT = "inference_timeout",
  BACKEND_UNAVAILABLE = "backend_unavailable",
  RATE_LIMITED = "rate_limited",
  INTERNAL_ERROR = "internal_error",
}

3. MODELOS DE BACKEND (Python)

3.1 Backend Interface

from abc import ABC, abstractmethod
from typing import Any, Dict, List

class InferenceBackend(ABC):
    """
    Abstract contract for inference backends.

    Implemented by OllamaBackend and VLLMBackend. Every method is an
    abstract coroutine, so concrete backends must override all four.
    """

    @abstractmethod
    async def health_check(self) -> bool:
        """Check whether the backend is available."""
        ...

    @abstractmethod
    async def list_models(self) -> List[Dict[str, Any]]:
        """List the available models."""
        ...

    @abstractmethod
    async def chat_completion(
        self,
        model: str,
        messages: List[Dict[str, str]],
        max_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
    ) -> Dict[str, Any]:
        """
        Create a chat completion.

        Args:
            model: Identifier of the model.
            messages: List of messages, [{"role": str, "content": str}].
            max_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature.
            top_p: Top-p sampling value.

        Returns:
            Dict with id, content, usage, finish_reason.
        """
        ...

    @abstractmethod
    async def close(self) -> None:
        """Close the backend's connections."""
        ...

3.2 Pydantic Models

from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

class MessageRole(str, Enum):
    """Allowed sender roles of a chat message (string-valued enum)."""
    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"

class Message(BaseModel):
    """A single chat message: a sender role plus non-empty text content."""
    role: MessageRole
    # Required field; must contain at least one character (min_length=1).
    content: str = Field(..., min_length=1)

class ChatCompletionRequest(BaseModel):
    """
    Request body for a chat completion.

    Field limits follow the request-validation rules of this document:
    model is non-empty and at most 100 characters, messages holds 1-100
    items, max_tokens is 1-4096, temperature is 0.0-2.0, top_p is 0.0-1.0.
    """

    # Falls back to the default model when the caller sends none.
    model: str = Field(default="gpt-oss-20b", min_length=1, max_length=100)
    # NOTE(review): min_items/max_items are the Pydantic v1 names; Pydantic v2
    # deprecates them in favour of min_length/max_length — confirm the version.
    messages: List[Message] = Field(..., min_items=1, max_items=100)
    max_tokens: Optional[int] = Field(default=512, ge=1, le=4096)
    temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=0.9, ge=0.0, le=1.0)
    stream: Optional[bool] = Field(default=False)

class Usage(BaseModel):
    """Token counts reported with a completion."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int

class Choice(BaseModel):
    """One generated choice: its index, the message and the finish reason."""
    index: int
    message: Message
    # Free-form string here; no set of allowed values is enforced by this model.
    finish_reason: str

class ChatCompletionResponse(BaseModel):
    """Chat completion response: metadata, generated choices and token usage."""
    id: str
    # Defaults to "chat.completion" when not supplied.
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]
    usage: Usage

4. DIAGRAMA ENTIDAD-RELACION

Este proyecto NO usa una base de datos persistente en el MVP.

El flujo de datos es request-response:

[AgentRequest]
     │
     ▼
[ChatCompletionRequest] ──────> [ChatCompletionResponse]
                                        │
                                        ├── [ChatCompletionChoice]
                                        │         │
                                        │         └── [ChatMessage]
                                        │
                                        └── [TokenUsage]

Datos que se persistiran en el futuro (Fase 2+):
- Request logs (para analytics)
- Token usage metrics
- Model performance metrics

5. CONFIGURACION Y CONSTANTES

5.1 Environment Variables

// Gateway Configuration
/**
 * Environment variables of the API gateway, with their defaults.
 * NOTE(review): numeric fields are typed as number — presumably the raw
 * environment strings are parsed before this shape is used; confirm.
 */
interface GatewayConfig {
  GATEWAY_PORT: number;         // Default: 3160
  INFERENCE_HOST: string;       // Default: "localhost"
  INFERENCE_PORT: number;       // Default: 3161
  LOG_LEVEL: "debug" | "info" | "warn" | "error";  // Default: "info"
  CORS_ORIGINS: string;         // Default: "*"
}

// Inference Engine Configuration
/**
 * Environment variables of the inference engine, with their defaults.
 * LOG_LEVEL is a plain string here, unlike the literal union used in
 * GatewayConfig.
 */
interface InferenceConfig {
  INFERENCE_PORT: number;       // Default: 3161
  INFERENCE_BACKEND: "ollama" | "vllm";  // Default: "ollama"
  OLLAMA_HOST: string;          // Default: "http://localhost:11434"
  OLLAMA_MODEL: string;         // Default: "gpt-oss-20b"
  REQUEST_TIMEOUT_MS: number;   // Default: 60000
  LOG_LEVEL: string;            // Default: "info"
}

5.2 Constantes del Sistema

// System-wide limits
const SYSTEM_LIMITS = {
  MAX_MESSAGE_LENGTH: 100_000, // characters
  MAX_MESSAGES_PER_REQUEST: 100,
  MAX_CONTEXT_TOKENS: 16_384,
  MAX_OUTPUT_TOKENS: 4_096,
  REQUEST_TIMEOUT_MS: 60_000,
  HEALTH_CHECK_INTERVAL_MS: 30_000,
};

/**
 * Default values applied when a request omits the corresponding field.
 *
 * Declared `as const` so each entry keeps its literal type. In particular
 * TIER stays "small" instead of widening to string, which makes it
 * assignable to tier-typed fields such as `x_tier: "small" | "main"`.
 */
const DEFAULTS = {
  MODEL: "gpt-oss-20b",
  MAX_TOKENS: 512,
  TEMPERATURE: 0.7,
  TOP_P: 0.9,
  TIER: "small",
} as const;

// Assigned ports
const PORTS = {
  GATEWAY: 3160,
  INFERENCE_ENGINE: 3161,
  OLLAMA: 11434,
};

6. VALIDACIONES

6.1 Request Validations

Campo	Validacion	Error Code
model	No vacio, max 100 chars	invalid_request
messages	Array no vacio, max 100 items	invalid_request
messages[].role	Enum: system/user/assistant	invalid_request
messages[].content	No vacio	invalid_request
max_tokens	1-4096	invalid_request
temperature	0.0-2.0	invalid_request
top_p	0.0-1.0	invalid_request

6.2 Business Validations

Regla	Descripcion	Error Code
Contexto excedido	Total tokens > max_context del tier	context_length_exceeded
Modelo no existe	Modelo no disponible en backend	model_not_found
Backend no disponible	El backend de inferencia (Ollama o vLLM) no responde	backend_unavailable
Timeout	Request excede timeout	inference_timeout

7. REFERENCIAS

RF-REQUERIMIENTOS-FUNCIONALES.md
RNF-REQUERIMIENTOS-NO-FUNCIONALES.md
ADR-001: Runtime Selection
ADR-002: Model Selection

Documento Controlado

Autor: Requirements-Analyst Agent
Fecha: 2026-01-20

17 KiB Raw Blame History