-- =====================================================
-- LLM SCHEMA - EMBEDDINGS TABLE
-- =====================================================
-- Description: Vector embeddings for RAG and semantic search
-- Schema: llm
-- Author: Database Agent
-- Date: 2025-12-06
-- =====================================================

-- NOTE: Requires the pgvector extension
-- CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE llm.embeddings (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Content type
    content_type VARCHAR(50) NOT NULL, -- message, document, faq, tutorial, article

    -- Reference to the original content
    content_id UUID, -- ID of the message, document, etc.

    -- Content
    content TEXT NOT NULL,
    content_hash VARCHAR(64), -- SHA-256 for deduplication

    -- Content metadata
    title VARCHAR(500),
    description TEXT,

    -- Vector embedding (dimension depends on the model)
    --   OpenAI text-embedding-3-small: 1536 dims
    --   OpenAI text-embedding-3-large: 3072 dims
    --   Voyage AI: 1024 dims
    -- NOTE: pgvector can only index `vector` columns of up to 2,000
    -- dimensions, so moving to a 3072-dim model also requires revisiting
    -- the HNSW index defined below.
    embedding vector(1536), -- Adjust according to the model used

    -- Model used to generate the embedding
    embedding_model VARCHAR(100) NOT NULL, -- text-embedding-3-small, voyage-2, etc.

    -- Metadata for filtering
    user_id UUID REFERENCES auth.users(id) ON DELETE CASCADE, -- Set when the content is user-specific
    is_public BOOLEAN DEFAULT true,

    -- Categorization
    category VARCHAR(100), -- education, trading, market_news, platform_help
    subcategory VARCHAR(100),
    tags TEXT[] DEFAULT '{}',

    -- Relevance
    importance_score DECIMAL(3,2) DEFAULT 0.50,

    -- Additional context
    context_metadata JSONB, -- Extra metadata to improve retrieval

    -- Source
    source_url VARCHAR(500),
    source_type VARCHAR(50), -- internal, external, generated

    -- Validity
    is_active BOOLEAN DEFAULT true,
    expires_at TIMESTAMPTZ,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Indexes
CREATE INDEX idx_embeddings_type ON llm.embeddings(content_type);
CREATE INDEX idx_embeddings_user ON llm.embeddings(user_id) WHERE user_id IS NOT NULL;
CREATE INDEX idx_embeddings_category ON llm.embeddings(category);
CREATE INDEX idx_embeddings_tags ON llm.embeddings USING GIN(tags);
CREATE INDEX idx_embeddings_active ON llm.embeddings(is_active) WHERE is_active = true;
CREATE INDEX idx_embeddings_hash ON llm.embeddings(content_hash);

-- Vector search index (HNSW for better query performance)
-- Requires pgvector
CREATE INDEX idx_embeddings_vector_hnsw ON llm.embeddings
    USING hnsw (embedding vector_cosine_ops);

-- Alternative index: IVFFlat (faster to build, less accurate)
-- CREATE INDEX idx_embeddings_vector_ivfflat ON llm.embeddings
--     USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);

-- Comments
-- PostgreSQL stores a single comment per object and each COMMENT ON replaces
-- the previous one, so the table description and the semantic-search usage
-- example are kept together in one statement.
COMMENT ON TABLE llm.embeddings IS
'Vector embeddings for RAG and semantic search using pgvector.

Vector search example:
SELECT content, title, 1 - (embedding <=> query_embedding) AS similarity
FROM llm.embeddings
WHERE is_active = true AND category = ''education''
ORDER BY embedding <=> query_embedding
LIMIT 5;

Operators:
- <-> : L2 distance
- <#> : inner product
- <=> : cosine distance (recommended)';

COMMENT ON COLUMN llm.embeddings.content_type IS
    'Type of content: message, document, faq, tutorial, article';

COMMENT ON COLUMN llm.embeddings.content_id IS
    'Reference to original content (e.g., message ID)';

COMMENT ON COLUMN llm.embeddings.content IS
    'Text content that was embedded';

COMMENT ON COLUMN llm.embeddings.content_hash IS
    'SHA-256 hash for deduplication';

COMMENT ON COLUMN llm.embeddings.embedding IS
    'Vector embedding (dimension depends on model)';

COMMENT ON COLUMN llm.embeddings.embedding_model IS
    'Model used to generate embedding';

COMMENT ON COLUMN llm.embeddings.is_public IS
    'Whether embedding is accessible to all users or user-specific';

COMMENT ON COLUMN llm.embeddings.importance_score IS
    'Relevance score for retrieval prioritization';

-- Example of the context_metadata JSONB payload:
COMMENT ON COLUMN llm.embeddings.context_metadata IS
    'Example: { "language": "es", "difficulty_level": "beginner", "reading_time_minutes": 5, "author": "system", "last_updated": "2025-12-01", "related_symbols": ["BTCUSDT"], "related_topics": ["technical_analysis", "rsi"] }';