trading-platform-ml-engine-v2/scripts/ingest_ohlcv_polygon.py
Adrian Flores Cortes d015e2b0f3 feat(ml-engine): Phase 4 - PostgreSQL migration, dynamic OOS, data pipeline
- Fix database.py: Add DatabaseConnection alias for backward compat
- Fix train_symbol_timeframe_models.py: Use PostgreSQLConnection + native queries
- Fix run_oos_backtest.py: Fix broken import + add dynamic OOS support
- Update data_splitter.py: split_dynamic_oos() method (from previous session)
- Update validation_oos.yaml: Dynamic OOS config + all 6 symbols enabled
- Create ingest_ohlcv_polygon.py: Standalone Polygon→PostgreSQL ingestion script
- Fix .gitignore: /data/ instead of data/ to not ignore src/data/
- Add untracked src/ modules: backtesting, data, llm, models (attention/metamodel/strategies)
- Add aiohttp, sqlalchemy, psycopg2-binary to requirements.txt

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-27 04:39:05 -06:00

439 lines
13 KiB
Python

#!/usr/bin/env python3
"""
OHLCV Data Ingestion from Polygon API → PostgreSQL
====================================================
Downloads historical 5-minute OHLCV data for all ML training symbols
and inserts into PostgreSQL market_data.ohlcv_5m table.
Symbols: XAUUSD, EURUSD, GBPUSD, BTCUSD, USDJPY, GBPJPY, AUDUSD
Usage:
# Set API key
export POLYGON_API_KEY="your_key_here"
# Download all symbols (default: 2020-01-01 to today)
python scripts/ingest_ohlcv_polygon.py
# Download specific symbols
python scripts/ingest_ohlcv_polygon.py --symbols XAUUSD EURUSD
# Custom date range
python scripts/ingest_ohlcv_polygon.py --start 2023-01-01 --end 2025-12-31
# Incremental mode (only fetch from last timestamp in DB)
python scripts/ingest_ohlcv_polygon.py --incremental
Author: ML-Specialist (NEXUS v4.0)
Created: 2026-01-27
"""
import argparse
import asyncio
import os
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path

import aiohttp
import psycopg2
import psycopg2.extras
from loguru import logger
# Configure logging: replace loguru's default handler with a concise
# stdout sink (time | level | message).
logger.remove()
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
# ============================================================================
# CONFIGURATION
# ============================================================================
# Polygon REST API root; the v2 aggregates path is appended per request.
POLYGON_BASE_URL = "https://api.polygon.io"
# Symbol → Polygon ticker prefix mapping.
# Forex pairs use the "C:" (currencies) prefix; crypto pairs use "X:".
SYMBOL_CONFIG = {
    "XAUUSD": {"polygon": "C:XAUUSD", "type": "forex"},
    "EURUSD": {"polygon": "C:EURUSD", "type": "forex"},
    "GBPUSD": {"polygon": "C:GBPUSD", "type": "forex"},
    "BTCUSD": {"polygon": "X:BTCUSD", "type": "crypto"},
    "USDJPY": {"polygon": "C:USDJPY", "type": "forex"},
    "GBPJPY": {"polygon": "C:GBPJPY", "type": "forex"},
    "AUDUSD": {"polygon": "C:AUDUSD", "type": "forex"},
}
# PostgreSQL connection (matches trading_platform config); every field can be
# overridden via environment variables.
# NOTE(review): the fallback password is a credential committed to source —
# consider requiring DB_PASSWORD from the environment with no default.
PG_CONFIG = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", "5432")),
    "dbname": os.getenv("DB_NAME", "trading_platform"),
    "user": os.getenv("DB_USER", "trading_user"),
    "password": os.getenv("DB_PASSWORD", "trading_dev_2026"),
}
# ============================================================================
# POLYGON API FETCHER
# ============================================================================
async def fetch_polygon_bars(
    api_key: str,
    polygon_symbol: str,
    start_date: datetime,
    end_date: datetime,
    multiplier: int = 5,
    timespan: str = "minute",
) -> list:
    """
    Fetch OHLCV bars from Polygon API with pagination and rate limiting.

    Requests are chunked into ~30-day windows so each response stays under
    Polygon's 50k-result limit, with a short pause between requests.

    Args:
        api_key: Polygon API key
        polygon_symbol: Full polygon symbol (e.g., 'C:XAUUSD')
        start_date: Start date
        end_date: End date
        multiplier: Timeframe multiplier (5 for 5-min)
        timespan: Timeframe span ('minute', 'hour', 'day')

    Returns:
        List of OHLCV bar dicts (Polygon 'results' entries), oldest first.
    """
    all_bars = []
    current_start = start_date
    # Chunk by month to respect Polygon's 50k result limit
    async with aiohttp.ClientSession() as session:
        while current_start < end_date:
            chunk_end = min(current_start + timedelta(days=30), end_date)
            start_str = current_start.strftime("%Y-%m-%d")
            end_str = chunk_end.strftime("%Y-%m-%d")
            endpoint = (
                f"{POLYGON_BASE_URL}/v2/aggs/ticker/{polygon_symbol}"
                f"/range/{multiplier}/{timespan}/{start_str}/{end_str}"
            )
            params = {
                "apiKey": api_key,
                "adjusted": "true",
                "sort": "asc",
                "limit": 50000,
            }
            try:
                async with session.get(endpoint, params=params) as response:
                    if response.status == 429:
                        # Honor the server's backoff hint, then retry this chunk.
                        retry_after = int(response.headers.get("Retry-After", 60))
                        logger.warning(f"Rate limited, waiting {retry_after}s...")
                        await asyncio.sleep(retry_after)
                        continue  # Retry same chunk
                    if response.status == 403:
                        # Bad/expired key: no point trying further chunks.
                        # (Fix: constant message needs no f-string prefix.)
                        logger.error("API key unauthorized (403). Check POLYGON_API_KEY.")
                        break
                    if response.status != 200:
                        # BUG FIX: this branch previously advanced the chunk and
                        # `continue`d past the rate-limit sleep at the bottom of
                        # the loop, hammering the API after every error response.
                        # Now it falls through to the shared advance + sleep.
                        text = await response.text()
                        logger.error(f"API error {response.status}: {text[:200]}")
                    else:
                        data = await response.json()
                        results = data.get("results", [])
                        if results:
                            all_bars.extend(results)
                            # BUG FIX: the chunk range was logged with the two
                            # dates fused together ({start_str}{end_str}).
                            logger.info(
                                f" {polygon_symbol} {start_str}{end_str}: "
                                f"{len(results)} bars (total: {len(all_bars)})"
                            )
                        else:
                            logger.debug(f" {polygon_symbol} {start_str}{end_str}: no data")
            except aiohttp.ClientError as e:
                # Network-level failure: log it and move on to the next chunk.
                logger.error(f"Request failed for {polygon_symbol}: {e}")
            # Advance to the next chunk and pace requests (~2 req/s).
            current_start = chunk_end
            await asyncio.sleep(0.5)
    return all_bars
# ============================================================================
# POSTGRESQL INSERTER
# ============================================================================
def get_ticker_id(conn, symbol: str) -> int:
    """
    Look up the primary key for *symbol* in market_data.tickers.

    Args:
        conn: Open psycopg2 connection.
        symbol: Symbol name, matched case-insensitively (e.g., 'XAUUSD').

    Returns:
        Ticker ID.

    Raises:
        ValueError: If no ticker row matches the symbol.
    """
    query = "SELECT id FROM market_data.tickers WHERE UPPER(symbol) = UPPER(%s)"
    with conn.cursor() as cursor:
        cursor.execute(query, (symbol,))
        match = cursor.fetchone()
    if match:
        return match[0]
    raise ValueError(f"Ticker '{symbol}' not found in market_data.tickers")
def get_last_timestamp(conn, ticker_id: int) -> datetime:
    """Return the newest ingested bar timestamp for *ticker_id*, or None.

    Used by incremental mode to resume ingestion right after the last
    bar already stored in market_data.ohlcv_5m.
    """
    sql = "SELECT MAX(timestamp) FROM market_data.ohlcv_5m WHERE ticker_id = %s"
    with conn.cursor() as cursor:
        cursor.execute(sql, (ticker_id,))
        result = cursor.fetchone()
    # MAX() over an empty table yields a row containing NULL — treat both
    # "no row" and "NULL value" as "no data yet".
    if result and result[0]:
        return result[0]
    return None
def insert_bars_to_postgres(conn, ticker_id: int, bars: list) -> int:
    """
    Insert OHLCV bars into PostgreSQL market_data.ohlcv_5m.

    Uses ON CONFLICT (ticker_id, timestamp) for idempotent upsert behavior
    and commits after each batch, so a mid-run failure loses at most one
    batch of work.

    Args:
        conn: psycopg2 connection
        ticker_id: Ticker ID from market_data.tickers
        bars: List of Polygon API bar dicts (keys: t, o, h, l, c, v, vw)

    Returns:
        Number of rows inserted/updated
    """
    if not bars:
        return 0
    insert_sql = """
    INSERT INTO market_data.ohlcv_5m
    (ticker_id, timestamp, open, high, low, close, volume, vwap, ts_epoch)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (ticker_id, timestamp) DO UPDATE SET
    open = EXCLUDED.open,
    high = EXCLUDED.high,
    low = EXCLUDED.low,
    close = EXCLUDED.close,
    volume = EXCLUDED.volume,
    vwap = EXCLUDED.vwap
    """
    # Convert Polygon bar dicts to parameter tuples.
    rows = []
    for bar in bars:
        ts_ms = bar["t"]  # bar start time, UTC epoch milliseconds
        # BUG FIX: datetime.fromtimestamp() without a tz argument converts via
        # the *server-local* timezone, so stored timestamps depended on the
        # host's TZ setting. Polygon epochs are UTC; convert explicitly, then
        # drop tzinfo so values stay comparable with the naive datetimes used
        # elsewhere in this script. ts_epoch preserves the exact instant.
        # NOTE(review): rows previously ingested under a non-UTC host TZ will
        # not collide with re-ingested rows on the (ticker_id, timestamp) key.
        timestamp = datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc).replace(tzinfo=None)
        rows.append((
            ticker_id,
            timestamp,
            bar["o"],  # open
            bar["h"],  # high
            bar["l"],  # low
            bar["c"],  # close
            bar.get("v", 0),  # volume (may be absent for some forex bars)
            bar.get("vw") or None,  # vwap (nullable; 0.0 also treated as NULL)
            ts_ms,  # ts_epoch in ms
        ))
    # Insert in batches; commit per batch to bound transaction size.
    batch_size = 5000
    total_inserted = 0
    with conn.cursor() as cur:
        for i in range(0, len(rows), batch_size):
            batch = rows[i : i + batch_size]
            psycopg2.extras.execute_batch(cur, insert_sql, batch, page_size=1000)
            conn.commit()
            total_inserted += len(batch)
    return total_inserted
# ============================================================================
# MAIN
# ============================================================================
async def ingest_symbol(
    api_key: str,
    conn,
    symbol: str,
    start_date: datetime,
    end_date: datetime,
    incremental: bool = False,
) -> dict:
    """
    Full ingestion pipeline for a single symbol: resolve ticker_id, fetch
    bars from Polygon, and upsert them into market_data.ohlcv_5m.

    Args:
        api_key: Polygon API key.
        conn: Open psycopg2 connection.
        symbol: Symbol name; must be a key of SYMBOL_CONFIG.
        start_date: Earliest bar to fetch.
        end_date: Latest bar to fetch.
        incremental: If True, resume from the last timestamp stored in the DB.

    Returns:
        Dict with results summary: 'symbol', 'status' ('success' | 'no_data' |
        'up_to_date' | 'error'), 'rows', and on success 'first_bar'/'last_bar'.
    """
    config = SYMBOL_CONFIG.get(symbol)
    if not config:
        logger.error(f"Unknown symbol: {symbol}")
        return {"symbol": symbol, "status": "error", "error": "Unknown symbol"}
    polygon_symbol = config["polygon"]
    # Get ticker ID from DB
    try:
        ticker_id = get_ticker_id(conn, symbol)
    except ValueError as e:
        logger.error(str(e))
        return {"symbol": symbol, "status": "error", "error": str(e)}
    # In incremental mode, start from last timestamp + 5 min.
    # NOTE(review): if the ohlcv_5m timestamp column is timestamptz, last_ts
    # comes back tz-aware and comparing against the naive end_date below would
    # raise TypeError — confirm the column is 'timestamp without time zone'.
    actual_start = start_date
    if incremental:
        last_ts = get_last_timestamp(conn, ticker_id)
        if last_ts:
            actual_start = last_ts + timedelta(minutes=5)
            logger.info(f" Incremental: continuing from {actual_start}")
    if actual_start >= end_date:
        logger.info(f" {symbol}: already up to date")
        return {"symbol": symbol, "status": "up_to_date", "rows": 0}
    # Fetch from Polygon
    logger.info(f"Fetching {symbol} ({polygon_symbol}) from {actual_start.date()} to {end_date.date()}...")
    bars = await fetch_polygon_bars(
        api_key=api_key,
        polygon_symbol=polygon_symbol,
        start_date=actual_start,
        end_date=end_date,
    )
    if not bars:
        logger.warning(f" {symbol}: no data received from API")
        return {"symbol": symbol, "status": "no_data", "rows": 0}
    # Show range.
    # BUG FIX: report bar times in UTC instead of server-local time —
    # Polygon 't' is a UTC epoch in milliseconds (matches the conversion
    # used when inserting).
    first_ts = datetime.fromtimestamp(bars[0]["t"] / 1000, tz=timezone.utc).replace(tzinfo=None)
    last_ts = datetime.fromtimestamp(bars[-1]["t"] / 1000, tz=timezone.utc).replace(tzinfo=None)
    logger.info(f" {symbol}: {len(bars)} bars from {first_ts} to {last_ts}")
    # Insert into PostgreSQL
    logger.info(f" Inserting {len(bars)} bars into market_data.ohlcv_5m...")
    inserted = insert_bars_to_postgres(conn, ticker_id, bars)
    logger.info(f" {symbol}: {inserted} rows inserted/updated")
    return {
        "symbol": symbol,
        "status": "success",
        "rows": inserted,
        "first_bar": str(first_ts),
        "last_bar": str(last_ts),
    }
async def main():
    """CLI entry point: parse arguments, connect to PostgreSQL, ingest each
    requested symbol, and print a per-symbol summary."""
    parser = argparse.ArgumentParser(description="Ingest OHLCV data from Polygon → PostgreSQL")
    parser.add_argument(
        "--symbols",
        nargs="+",
        default=list(SYMBOL_CONFIG.keys()),
        help=f"Symbols to ingest (default: all {len(SYMBOL_CONFIG)})",
    )
    parser.add_argument(
        "--start",
        type=str,
        default="2020-01-01",
        help="Start date YYYY-MM-DD (default: 2020-01-01)",
    )
    parser.add_argument(
        "--end",
        type=str,
        default=datetime.now().strftime("%Y-%m-%d"),
        help="End date YYYY-MM-DD (default: today)",
    )
    parser.add_argument(
        "--incremental",
        action="store_true",
        help="Only fetch data after last timestamp in DB",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=os.getenv("POLYGON_API_KEY", ""),
        help="Polygon API key (or set POLYGON_API_KEY env var)",
    )
    args = parser.parse_args()
    # Validate API key before doing any work.
    api_key = args.api_key
    if not api_key:
        logger.error("POLYGON_API_KEY is required. Set env var or use --api-key")
        sys.exit(1)
    start_date = datetime.strptime(args.start, "%Y-%m-%d")
    end_date = datetime.strptime(args.end, "%Y-%m-%d")
    logger.info("=" * 60)
    logger.info("OHLCV Data Ingestion: Polygon → PostgreSQL")
    logger.info("=" * 60)
    logger.info(f"Symbols: {args.symbols}")
    # BUG FIX: the two dates were logged fused together with no separator.
    logger.info(f"Date range: {start_date.date()}{end_date.date()}")
    logger.info(f"Incremental: {args.incremental}")
    logger.info(f"Database: {PG_CONFIG['host']}:{PG_CONFIG['port']}/{PG_CONFIG['dbname']}")
    # Connect to PostgreSQL
    try:
        conn = psycopg2.connect(**PG_CONFIG)
        logger.info("PostgreSQL connected")
    except Exception as e:
        logger.error(f"PostgreSQL connection failed: {e}")
        sys.exit(1)
    # Process each symbol.
    # BUG FIX: close the connection even if a symbol raises mid-run.
    results = []
    try:
        for symbol in args.symbols:
            logger.info(f"\n{'='*40}")
            logger.info(f"Processing {symbol}")
            logger.info(f"{'='*40}")
            result = await ingest_symbol(
                api_key=api_key,
                conn=conn,
                symbol=symbol,
                start_date=start_date,
                end_date=end_date,
                incremental=args.incremental,
            )
            results.append(result)
    finally:
        conn.close()
    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("INGESTION SUMMARY")
    logger.info("=" * 60)
    total_rows = 0
    for r in results:
        status_icon = {
            "success": "[OK]",
            "no_data": "[EMPTY]",
            "up_to_date": "[SKIP]",
            "error": "[FAIL]",
        }.get(r["status"], "[?]")
        rows = r.get("rows", 0)
        total_rows += rows
        logger.info(f" {status_icon} {r['symbol']}: {rows:,} rows")
    logger.info(f"\nTotal rows inserted: {total_rows:,}")
    logger.info("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())