-- DDL schemas for Trading Platform: User management, Authentication, Payments, Education, ML predictions, Trading data
-- Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
-- 123 lines, 4.3 KiB, SQL
-- =====================================================
-- LLM SCHEMA - EMBEDDINGS TABLE
-- =====================================================
-- Description: Vector embeddings for RAG and semantic search
-- Schema: llm
-- Author: Database Agent
-- Date: 2025-12-06
-- =====================================================

-- NOTE: Requires the pgvector extension
-- CREATE EXTENSION IF NOT EXISTS vector;

-- One row per embedded piece of content, used for RAG and semantic search.
-- Depends on the pgvector extension (for the `vector` type), the `llm` schema
-- and the auth.users table.
CREATE TABLE llm.embeddings (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Content type: message, document, faq, tutorial, article
    content_type VARCHAR(50) NOT NULL,

    -- Reference to the original content (ID of the message, document, etc.)
    content_id UUID,

    -- Embedded text and its SHA-256 hash (used for deduplication)
    content TEXT NOT NULL,
    content_hash VARCHAR(64),

    -- Content metadata
    title VARCHAR(500),
    description TEXT,

    -- Vector embedding; the dimension depends on the model:
    --   OpenAI text-embedding-3-small: 1536 dims
    --   OpenAI text-embedding-3-large: 3072 dims
    --   Voyage AI: 1024 dims
    -- Adjust to the model in use. NOTE: pgvector can only index `vector`
    -- columns of up to 2,000 dimensions, so a 3072-dim column cannot keep the
    -- HNSW index defined on this column.
    embedding vector(1536),

    -- Model used to generate the embedding (text-embedding-3-small, voyage-2, etc.)
    embedding_model VARCHAR(100) NOT NULL,

    -- Filtering metadata. user_id is set when the content is user-specific;
    -- deleting the user deletes their embeddings (ON DELETE CASCADE).
    user_id UUID REFERENCES auth.users(id) ON DELETE CASCADE,
    -- NOT NULL: a NULL flag would be neither public nor private under SQL's
    -- three-valued logic.
    is_public BOOLEAN NOT NULL DEFAULT true,

    -- Categorization (category: education, trading, market_news, platform_help)
    category VARCHAR(100),
    subcategory VARCHAR(100),
    tags TEXT[] DEFAULT '{}',

    -- Relevance
    importance_score DECIMAL(3,2) DEFAULT 0.50,

    -- Additional context metadata to improve retrieval
    context_metadata JSONB,

    -- Source (source_type: internal, external, generated)
    source_url VARCHAR(500),
    source_type VARCHAR(50),

    -- Validity. NOT NULL: rows with a NULL flag would be skipped by
    -- `WHERE is_active = true` filters and by the partial index on is_active.
    is_active BOOLEAN NOT NULL DEFAULT true,
    expires_at TIMESTAMPTZ,

    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Indexes
-- Lookup indexes on the columns used to narrow down embeddings.
CREATE INDEX idx_embeddings_type
    ON llm.embeddings (content_type);

-- Partial: rows without a user_id are left out of the index.
CREATE INDEX idx_embeddings_user
    ON llm.embeddings (user_id)
    WHERE user_id IS NOT NULL;

CREATE INDEX idx_embeddings_category
    ON llm.embeddings (category);

-- GIN index over the tags array.
CREATE INDEX idx_embeddings_tags
    ON llm.embeddings
    USING GIN (tags);

-- Partial: only rows with is_active = true are indexed.
CREATE INDEX idx_embeddings_active
    ON llm.embeddings (is_active)
    WHERE is_active = true;

CREATE INDEX idx_embeddings_hash
    ON llm.embeddings (content_hash);

-- Vector search index (HNSW, chosen for better performance). Requires pgvector.
-- vector_cosine_ops is the operator class for the cosine-distance operator (<=>).
-- NOTE: pgvector indexes `vector` columns of up to 2,000 dimensions only.
CREATE INDEX idx_embeddings_vector_hnsw
    ON llm.embeddings
    USING hnsw (embedding vector_cosine_ops);

-- Alternative index: IVFFlat (faster to build, less accurate)
-- CREATE INDEX idx_embeddings_vector_ivfflat ON llm.embeddings
-- USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);

-- Comments
-- Catalog documentation for the table and its main columns.
-- NOTE: a table holds a single comment; the later COMMENT ON TABLE statement
-- in this file replaces the text set by the first statement below.
COMMENT ON TABLE llm.embeddings IS
    'Vector embeddings for RAG and semantic search using pgvector';

COMMENT ON COLUMN llm.embeddings.content_type IS
    'Type of content: message, document, faq, tutorial, article';

COMMENT ON COLUMN llm.embeddings.content_id IS
    'Reference to original content (e.g., message ID)';

COMMENT ON COLUMN llm.embeddings.content IS
    'Text content that was embedded';

COMMENT ON COLUMN llm.embeddings.content_hash IS
    'SHA-256 hash for deduplication';

COMMENT ON COLUMN llm.embeddings.embedding IS
    'Vector embedding (dimension depends on model)';

COMMENT ON COLUMN llm.embeddings.embedding_model IS
    'Model used to generate embedding';

COMMENT ON COLUMN llm.embeddings.is_public IS
    'Whether embedding is accessible to all users or user-specific';

COMMENT ON COLUMN llm.embeddings.importance_score IS
    'Relevance score for retrieval prioritization';

-- Usage example for semantic search:
-- Table comment: description plus a sample similarity query.
-- PostgreSQL stores one comment per table and each COMMENT ON TABLE replaces
-- the previous text, so this comment carries the table description as well as
-- the example. `query_embedding` in the example is a placeholder for the
-- caller-supplied query vector.
COMMENT ON TABLE llm.embeddings IS
'Vector embeddings for RAG and semantic search using pgvector.

Vector search example:
SELECT
    content,
    title,
    1 - (embedding <=> query_embedding) AS similarity
FROM llm.embeddings
WHERE is_active = true
    AND category = ''education''
ORDER BY embedding <=> query_embedding
LIMIT 5;

Operators:
- <-> : L2 distance
- <#> : negative inner product
- <=> : cosine distance (recommended)';

-- Example of context_metadata JSONB:
-- Attaches a sample JSON document as the column comment for context_metadata.
COMMENT ON COLUMN llm.embeddings.context_metadata
    IS 'Example: {
"language": "es",
"difficulty_level": "beginner",
"reading_time_minutes": 5,
"author": "system",
"last_updated": "2025-12-01",
"related_symbols": ["BTCUSDT"],
"related_topics": ["technical_analysis", "rsi"]
}';