trading-platform-ml-engine-v2/scripts/ingest_ohlcv_polygon.py
Adrian Flores Cortes dcfe83bb44 feat: Update data ingestion and add training reports
Scripts:
- Update ingest_ohlcv_polygon.py for improved data processing

Reports:
- Add attention model training reports (2x)
- Add standard training reports (2x)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 12:24:53 -06:00

480 lines
15 KiB
Python

#!/usr/bin/env python3
"""
OHLCV Data Ingestion from Polygon API → PostgreSQL
====================================================
Downloads historical 5-minute OHLCV data for all ML training symbols
and inserts into PostgreSQL market_data.ohlcv_5m table.
Symbols: XAUUSD, EURUSD, GBPUSD, BTCUSD, USDJPY, GBPJPY, AUDUSD
Usage:
# Set API key
export POLYGON_API_KEY="your_key_here"
# Download all symbols (default: 2020-01-01 to today)
python scripts/ingest_ohlcv_polygon.py
# Download specific symbols
python scripts/ingest_ohlcv_polygon.py --symbols XAUUSD EURUSD
# Custom date range
python scripts/ingest_ohlcv_polygon.py --start 2023-01-01 --end 2025-12-31
# Incremental mode (only fetch from last timestamp in DB)
python scripts/ingest_ohlcv_polygon.py --incremental
Author: ML-Specialist (NEXUS v4.0)
Created: 2026-01-27
"""
import argparse
import asyncio
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
import aiohttp
import psycopg2
import psycopg2.extras
from loguru import logger
# Configure logging: replace loguru's default sink with a compact stdout format.
logger.remove()
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
# ============================================================================
# CONFIGURATION
# ============================================================================
POLYGON_BASE_URL = "https://api.polygon.io"
# Symbol → Polygon ticker prefix mapping.
# Polygon prefixes: 'C:' = forex aggregates feed, 'X:' = crypto aggregates feed.
SYMBOL_CONFIG = {
    "XAUUSD": {"polygon": "C:XAUUSD", "type": "forex"},
    "EURUSD": {"polygon": "C:EURUSD", "type": "forex"},
    "GBPUSD": {"polygon": "C:GBPUSD", "type": "forex"},
    "BTCUSD": {"polygon": "X:BTCUSD", "type": "crypto"},
    "USDJPY": {"polygon": "C:USDJPY", "type": "forex"},
    "GBPJPY": {"polygon": "C:GBPJPY", "type": "forex"},
    "AUDUSD": {"polygon": "C:AUDUSD", "type": "forex"},
}
# PostgreSQL connection (matches trading_platform config).
# Note: sslmode=disable for local WSL connections to avoid SSL timeout issues.
# NOTE(review): the hard-coded password is a dev-only fallback; set DB_PASSWORD
# in any shared or production environment.
PG_CONFIG = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", "5432")),
    "dbname": os.getenv("DB_NAME", "trading_platform"),
    "user": os.getenv("DB_USER", "trading_user"),
    "password": os.getenv("DB_PASSWORD", "trading_dev_2026"),
    "sslmode": os.getenv("DB_SSLMODE", "disable"),
    "connect_timeout": 30,  # seconds; keeps a dead host from hanging the script
}
# ============================================================================
# POLYGON API FETCHER
# ============================================================================
async def fetch_polygon_bars(
    api_key: str,
    polygon_symbol: str,
    start_date: datetime,
    end_date: datetime,
    multiplier: int = 5,
    timespan: str = "minute",
) -> list:
    """
    Fetch OHLCV bars from Polygon API with pagination and rate limiting.

    The requested window is walked in ~30-day chunks so each request stays
    under Polygon's 50k-result cap. A 429 response retries the same chunk
    after the server-suggested delay; any other failure skips to the next
    chunk so one bad window cannot stall the whole run.

    Args:
        api_key: Polygon API key
        polygon_symbol: Full polygon symbol (e.g., 'C:XAUUSD')
        start_date: Start date (inclusive)
        end_date: End date (upper bound for chunking)
        multiplier: Timeframe multiplier (5 for 5-min)
        timespan: Timeframe span ('minute', 'hour', 'day')

    Returns:
        List of raw Polygon bar dicts (keys 't', 'o', 'h', 'l', 'c', ...),
        accumulated across all chunks in ascending time order.
    """
    all_bars: list = []
    current_start = start_date
    # Chunk by month to respect Polygon's 50k result limit
    async with aiohttp.ClientSession() as session:
        while current_start < end_date:
            chunk_end = min(current_start + timedelta(days=30), end_date)
            start_str = current_start.strftime("%Y-%m-%d")
            end_str = chunk_end.strftime("%Y-%m-%d")
            endpoint = (
                f"{POLYGON_BASE_URL}/v2/aggs/ticker/{polygon_symbol}"
                f"/range/{multiplier}/{timespan}/{start_str}/{end_str}"
            )
            params = {
                "apiKey": api_key,
                "adjusted": "true",
                "sort": "asc",
                "limit": 50000,
            }
            try:
                async with session.get(endpoint, params=params) as response:
                    if response.status == 429:
                        # Retry-After may be missing or non-numeric; fall back
                        # to 60s instead of letting int() raise and kill the run.
                        try:
                            retry_after = int(response.headers.get("Retry-After", 60))
                        except (TypeError, ValueError):
                            retry_after = 60
                        logger.warning(f"Rate limited, waiting {retry_after}s...")
                        await asyncio.sleep(retry_after)
                        continue  # Retry same chunk (do not advance)
                    if response.status == 403:
                        # Auth failures won't recover mid-run; stop fetching.
                        logger.error("API key unauthorized (403). Check POLYGON_API_KEY.")
                        break
                    if response.status != 200:
                        text = await response.text()
                        logger.error(f"API error {response.status}: {text[:200]}")
                    else:
                        data = await response.json()
                        results = data.get("results", [])
                        if results:
                            all_bars.extend(results)
                            logger.info(
                                f" {polygon_symbol} {start_str} → {end_str}: "
                                f"{len(results)} bars (total: {len(all_bars)})"
                            )
                        else:
                            logger.debug(f" {polygon_symbol} {start_str} → {end_str}: no data")
            except aiohttp.ClientError as e:
                logger.error(f"Request failed for {polygon_symbol}: {e}")
            # Advance exactly once per attempted chunk; the 429 path retried
            # the same chunk via `continue` above, every other path moves on.
            current_start = chunk_end
            await asyncio.sleep(0.5)  # Respect rate limits (~2 req/s)
    return all_bars
# ============================================================================
# POSTGRESQL INSERTER
# ============================================================================
def get_ticker_id(conn, symbol: str) -> int:
    """
    Look up a symbol's primary key in market_data.tickers.

    Args:
        conn: Open psycopg2 connection
        symbol: Symbol name (e.g., 'XAUUSD'); matched case-insensitively

    Returns:
        The ticker's integer id

    Raises:
        ValueError if ticker not found
    """
    query = "SELECT id FROM market_data.tickers WHERE UPPER(symbol) = UPPER(%s)"
    with conn.cursor() as cursor:
        cursor.execute(query, (symbol,))
        match = cursor.fetchone()
    if match is None:
        raise ValueError(f"Ticker '{symbol}' not found in market_data.tickers")
    return match[0]
def get_last_timestamp(conn, ticker_id: int) -> datetime:
    """Return the newest ingested bar timestamp for a ticker, or None if empty.

    Used by incremental sync to decide where fetching should resume.
    """
    sql = "SELECT MAX(timestamp) FROM market_data.ohlcv_5m WHERE ticker_id = %s"
    with conn.cursor() as cursor:
        cursor.execute(sql, (ticker_id,))
        result = cursor.fetchone()
    if result and result[0]:
        return result[0]
    return None
def insert_bars_to_postgres(ticker_id: int, bars: list) -> int:
    """
    Insert OHLCV bars into PostgreSQL market_data.ohlcv_5m.

    Uses ON CONFLICT for upsert behavior. Manages its own connection
    (opened here, always closed in the finally block) and retries each
    batch up to 3 times on OperationalError, reconnecting between attempts.

    Args:
        ticker_id: Ticker ID from market_data.tickers
        bars: List of Polygon API bar dicts; each needs 't' (epoch ms),
            'o', 'h', 'l', 'c', and optionally 'v' (volume) and 'vw' (vwap)

    Returns:
        Number of rows inserted/updated
    """
    if not bars:
        return 0
    # Upsert: re-running over the same range overwrites price fields instead
    # of failing on the (ticker_id, timestamp) unique constraint. ts_epoch is
    # deliberately absent from the UPDATE list — it derives from the conflict
    # key timestamp, so it cannot change on an upsert.
    insert_sql = """
        INSERT INTO market_data.ohlcv_5m
        (ticker_id, timestamp, open, high, low, close, volume, vwap, ts_epoch)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (ticker_id, timestamp) DO UPDATE SET
            open = EXCLUDED.open,
            high = EXCLUDED.high,
            low = EXCLUDED.low,
            close = EXCLUDED.close,
            volume = EXCLUDED.volume,
            vwap = EXCLUDED.vwap
    """
    # Convert bars to tuples
    rows = []
    for bar in bars:
        ts_ms = bar["t"]  # milliseconds epoch
        # NOTE(review): fromtimestamp() without tz= converts to *local* time,
        # while Polygon's 't' is a UTC epoch. Confirm the ohlcv_5m schema
        # expects local timestamps before changing this.
        timestamp = datetime.fromtimestamp(ts_ms / 1000)
        rows.append((
            ticker_id,
            timestamp,
            bar["o"],  # open
            bar["h"],  # high
            bar["l"],  # low
            bar["c"],  # close
            bar.get("v", 0),  # volume (defaults to 0 when absent)
            bar.get("vw") or None,  # vwap (nullable; falsy values coerced to NULL)
            ts_ms,  # ts_epoch in ms
        ))
    # Insert in smaller batches with fresh connection per batch group
    batch_size = 500  # Small batches for reliability
    total_inserted = 0
    conn = None
    try:
        conn = psycopg2.connect(**PG_CONFIG)
        for i in range(0, len(rows), batch_size):
            batch = rows[i : i + batch_size]
            retries = 3
            while retries > 0:
                try:
                    with conn.cursor() as cur:
                        psycopg2.extras.execute_batch(cur, insert_sql, batch, page_size=100)
                    conn.commit()
                    total_inserted += len(batch)
                    break  # batch committed; move to the next one
                except psycopg2.OperationalError as e:
                    retries -= 1
                    logger.warning(f" DB error ({3 - retries}/3): {str(e)[:50]}")
                    # The old connection may be unusable after an
                    # OperationalError; drop it and reconnect before retrying.
                    try:
                        conn.close()
                    except Exception:
                        pass
                    conn = psycopg2.connect(**PG_CONFIG)
                    if retries == 0:
                        raise  # all retries exhausted — propagate to caller
            # Log progress every 20 batches (~10k rows)
            batch_num = i // batch_size + 1
            if batch_num % 20 == 0:
                logger.info(f" Progress: {total_inserted:,} / {len(rows):,} rows")
    finally:
        # Always release the connection, even if a batch exhausted its retries.
        if conn:
            try:
                conn.close()
            except Exception:
                pass
    return total_inserted
# ============================================================================
# MAIN
# ============================================================================
async def ingest_symbol(
    api_key: str,
    symbol: str,
    start_date: datetime,
    end_date: datetime,
    incremental: bool = False,
) -> dict:
    """
    Full ingestion pipeline for a single symbol.

    Uses fresh DB connection for each symbol to avoid stale connections.
    Steps: resolve ticker_id -> (optionally) find the resume point ->
    fetch bars from Polygon -> bulk insert.

    Args:
        api_key: Polygon API key
        symbol: Key into SYMBOL_CONFIG (e.g., 'XAUUSD')
        start_date: Start of the requested range
        end_date: End of the requested range
        incremental: If True, resume from the last ingested bar + 5 minutes

    Returns:
        Dict with results summary: always has 'symbol' and 'status'
        ('success' | 'no_data' | 'up_to_date' | 'error'); 'rows' and the
        bar-range keys are present where applicable.
    """
    config = SYMBOL_CONFIG.get(symbol)
    if not config:
        logger.error(f"Unknown symbol: {symbol}")
        return {"symbol": symbol, "status": "error", "error": "Unknown symbol"}
    polygon_symbol = config["polygon"]
    # Fresh connection for this symbol
    try:
        conn = psycopg2.connect(**PG_CONFIG)
    except Exception as e:
        logger.error(f"PostgreSQL connection failed: {e}")
        return {"symbol": symbol, "status": "error", "error": str(e)}
    try:
        # Get ticker ID from DB
        try:
            ticker_id = get_ticker_id(conn, symbol)
        except ValueError as e:
            logger.error(str(e))
            return {"symbol": symbol, "status": "error", "error": str(e)}
        # In incremental mode, start from last timestamp + 5 min (one bar
        # past the newest row, so the last bar is not re-fetched).
        actual_start = start_date
        if incremental:
            last_ts = get_last_timestamp(conn, ticker_id)
            if last_ts:
                actual_start = last_ts + timedelta(minutes=5)
                logger.info(f" Incremental: continuing from {actual_start}")
        if actual_start >= end_date:
            logger.info(f" {symbol}: already up to date")
            return {"symbol": symbol, "status": "up_to_date", "rows": 0}
        # Fetch from Polygon (network-bound; can take a while for long ranges)
        logger.info(f"Fetching {symbol} ({polygon_symbol}) from {actual_start.date()} to {end_date.date()}...")
        bars = await fetch_polygon_bars(
            api_key=api_key,
            polygon_symbol=polygon_symbol,
            start_date=actual_start,
            end_date=end_date,
        )
        if not bars:
            logger.warning(f" {symbol}: no data received from API")
            return {"symbol": symbol, "status": "no_data", "rows": 0}
        # Show range. NOTE(review): fromtimestamp() without tz= is local time;
        # this matches the conversion in insert_bars_to_postgres, so the log
        # agrees with what gets stored.
        first_ts = datetime.fromtimestamp(bars[0]["t"] / 1000)
        last_ts = datetime.fromtimestamp(bars[-1]["t"] / 1000)
        logger.info(f" {symbol}: {len(bars)} bars from {first_ts} to {last_ts}")
        # Insert into PostgreSQL (uses its own connection)
        conn.close()  # Close read connection before bulk insert
        logger.info(f" Inserting {len(bars)} bars into market_data.ohlcv_5m...")
        inserted = insert_bars_to_postgres(ticker_id, bars)
        logger.info(f" {symbol}: {inserted} rows inserted/updated")
        return {
            "symbol": symbol,
            "status": "success",
            "rows": inserted,
            "first_bar": str(first_ts),
            "last_bar": str(last_ts),
        }
    finally:
        # Safe double-close: conn was already closed on the success path; this
        # covers early returns and exceptions without raising on re-close.
        try:
            conn.close()
        except Exception:
            pass
async def main():
    """CLI entry point: parse arguments, verify connectivity, ingest, summarize.

    Symbols are processed sequentially; each opens its own DB connection via
    ingest_symbol(). Exits non-zero when the API key is missing or the
    database is unreachable.
    """
    parser = argparse.ArgumentParser(description="Ingest OHLCV data from Polygon → PostgreSQL")
    parser.add_argument(
        "--symbols",
        nargs="+",
        default=list(SYMBOL_CONFIG.keys()),
        help=f"Symbols to ingest (default: all {len(SYMBOL_CONFIG)})",
    )
    parser.add_argument(
        "--start",
        type=str,
        default="2020-01-01",
        help="Start date YYYY-MM-DD (default: 2020-01-01)",
    )
    parser.add_argument(
        "--end",
        type=str,
        default=datetime.now().strftime("%Y-%m-%d"),
        help="End date YYYY-MM-DD (default: today)",
    )
    parser.add_argument(
        "--incremental",
        action="store_true",
        help="Only fetch data after last timestamp in DB",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=os.getenv("POLYGON_API_KEY", ""),
        help="Polygon API key (or set POLYGON_API_KEY env var)",
    )
    args = parser.parse_args()
    # Validate API key before doing any work
    api_key = args.api_key
    if not api_key:
        logger.error("POLYGON_API_KEY is required. Set env var or use --api-key")
        sys.exit(1)
    start_date = datetime.strptime(args.start, "%Y-%m-%d")
    end_date = datetime.strptime(args.end, "%Y-%m-%d")
    logger.info("=" * 60)
    logger.info("OHLCV Data Ingestion: Polygon → PostgreSQL")
    logger.info("=" * 60)
    logger.info(f"Symbols: {args.symbols}")
    # Fix: the two dates previously ran together with no separator.
    logger.info(f"Date range: {start_date.date()} → {end_date.date()}")
    logger.info(f"Incremental: {args.incremental}")
    logger.info(f"Database: {PG_CONFIG['host']}:{PG_CONFIG['port']}/{PG_CONFIG['dbname']}")
    # Test PostgreSQL connectivity up front so a bad DB config fails fast
    # instead of after the first (potentially long) Polygon download.
    try:
        test_conn = psycopg2.connect(**PG_CONFIG)
        test_conn.close()
        logger.info("PostgreSQL connectivity verified")
    except Exception as e:
        logger.error(f"PostgreSQL connection failed: {e}")
        sys.exit(1)
    # Process each symbol (each gets its own connection)
    results = []
    for symbol in args.symbols:
        logger.info(f"\n{'='*40}")
        logger.info(f"Processing {symbol}")
        logger.info(f"{'='*40}")
        result = await ingest_symbol(
            api_key=api_key,
            symbol=symbol,
            start_date=start_date,
            end_date=end_date,
            incremental=args.incremental,
        )
        results.append(result)
    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("INGESTION SUMMARY")
    logger.info("=" * 60)
    total_rows = 0
    for r in results:
        # Map each per-symbol status to a stable ASCII tag for the summary.
        status_icon = {
            "success": "[OK]",
            "no_data": "[EMPTY]",
            "up_to_date": "[SKIP]",
            "error": "[FAIL]",
        }.get(r["status"], "[?]")
        rows = r.get("rows", 0)
        total_rows += rows
        logger.info(f" {status_icon} {r['symbol']}: {rows:,} rows")
    logger.info(f"\nTotal rows inserted: {total_rows:,}")
    logger.info("=" * 60)
if __name__ == "__main__":
    # Script entry point — drive the full async ingestion pipeline to completion.
    asyncio.run(main())