Scripts: - Update ingest_ohlcv_polygon.py for improved data processing Reports: - Add attention model training reports (2x) - Add standard training reports (2x) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
480 lines
15 KiB
Python
480 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
OHLCV Data Ingestion from Polygon API → PostgreSQL
|
|
====================================================
|
|
Downloads historical 5-minute OHLCV data for all ML training symbols
|
|
and inserts into PostgreSQL market_data.ohlcv_5m table.
|
|
|
|
Symbols: XAUUSD, EURUSD, GBPUSD, BTCUSD, USDJPY, GBPJPY, AUDUSD
|
|
|
|
Usage:
|
|
# Set API key
|
|
export POLYGON_API_KEY="your_key_here"
|
|
|
|
# Download all symbols (default: 2020-01-01 to today)
|
|
python scripts/ingest_ohlcv_polygon.py
|
|
|
|
# Download specific symbols
|
|
python scripts/ingest_ohlcv_polygon.py --symbols XAUUSD EURUSD
|
|
|
|
# Custom date range
|
|
python scripts/ingest_ohlcv_polygon.py --start 2023-01-01 --end 2025-12-31
|
|
|
|
# Incremental mode (only fetch from last timestamp in DB)
|
|
python scripts/ingest_ohlcv_polygon.py --incremental
|
|
|
|
Author: ML-Specialist (NEXUS v4.0)
|
|
Created: 2026-01-27
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
import aiohttp
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
from loguru import logger
|
|
|
|
# Configure logging: drop loguru's default handler and log compactly to stdout.
logger.remove()
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")

# ============================================================================
# CONFIGURATION
# ============================================================================

# Base URL for all Polygon.io REST requests.
POLYGON_BASE_URL = "https://api.polygon.io"

# Symbol → Polygon ticker prefix mapping
# Polygon prefixes forex tickers with "C:" and crypto tickers with "X:".
SYMBOL_CONFIG = {
    "XAUUSD": {"polygon": "C:XAUUSD", "type": "forex"},
    "EURUSD": {"polygon": "C:EURUSD", "type": "forex"},
    "GBPUSD": {"polygon": "C:GBPUSD", "type": "forex"},
    "BTCUSD": {"polygon": "X:BTCUSD", "type": "crypto"},
    "USDJPY": {"polygon": "C:USDJPY", "type": "forex"},
    "GBPJPY": {"polygon": "C:GBPJPY", "type": "forex"},
    "AUDUSD": {"polygon": "C:AUDUSD", "type": "forex"},
}

# PostgreSQL connection (matches trading_platform config)
# Note: sslmode=disable for local WSL connections to avoid SSL timeout issues
# NOTE(review): the hardcoded password fallback is a security smell — prefer
# failing fast when DB_PASSWORD is unset instead of shipping a default credential.
PG_CONFIG = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", "5432")),
    "dbname": os.getenv("DB_NAME", "trading_platform"),
    "user": os.getenv("DB_USER", "trading_user"),
    "password": os.getenv("DB_PASSWORD", "trading_dev_2026"),
    "sslmode": os.getenv("DB_SSLMODE", "disable"),
    "connect_timeout": 30,  # seconds
}
|
|
|
|
|
|
# ============================================================================
|
|
# POLYGON API FETCHER
|
|
# ============================================================================
|
|
|
|
async def fetch_polygon_bars(
    api_key: str,
    polygon_symbol: str,
    start_date: datetime,
    end_date: datetime,
    multiplier: int = 5,
    timespan: str = "minute",
) -> list:
    """
    Fetch OHLCV bars from Polygon API with pagination and rate limiting.

    Requests are chunked into ~30-day windows to stay under Polygon's 50k
    result limit. Polygon's aggregate range is inclusive of BOTH end dates,
    and consecutive chunks share a boundary date, so bars are de-duplicated
    by their epoch-ms timestamp to avoid double-counting the boundary day.

    Args:
        api_key: Polygon API key
        polygon_symbol: Full polygon symbol (e.g., 'C:XAUUSD')
        start_date: Start date
        end_date: End date
        multiplier: Timeframe multiplier (5 for 5-min)
        timespan: Timeframe span ('minute', 'hour', 'day')

    Returns:
        List of OHLCV bar dicts (Polygon "results" entries)
    """
    all_bars = []
    seen_ts = set()  # epoch-ms timestamps already collected (chunk edges overlap)
    current_start = start_date

    # Chunk by month to respect Polygon's 50k result limit
    async with aiohttp.ClientSession() as session:
        while current_start < end_date:
            chunk_end = min(current_start + timedelta(days=30), end_date)

            start_str = current_start.strftime("%Y-%m-%d")
            end_str = chunk_end.strftime("%Y-%m-%d")

            endpoint = (
                f"{POLYGON_BASE_URL}/v2/aggs/ticker/{polygon_symbol}"
                f"/range/{multiplier}/{timespan}/{start_str}/{end_str}"
            )

            params = {
                "apiKey": api_key,
                "adjusted": "true",
                "sort": "asc",
                "limit": 50000,
            }

            try:
                async with session.get(endpoint, params=params) as response:
                    if response.status == 429:
                        # Honor the server's backoff hint, then retry this chunk.
                        retry_after = int(response.headers.get("Retry-After", 60))
                        logger.warning(f"Rate limited, waiting {retry_after}s...")
                        await asyncio.sleep(retry_after)
                        continue  # Retry same chunk

                    if response.status == 403:
                        # Bad key: no point continuing — return what we have.
                        logger.error("API key unauthorized (403). Check POLYGON_API_KEY.")
                        break

                    if response.status != 200:
                        # Non-fatal API error: log and skip to the next chunk.
                        text = await response.text()
                        logger.error(f"API error {response.status}: {text[:200]}")
                        current_start = chunk_end
                        continue

                    data = await response.json()
                    results = data.get("results", [])

                    # Drop bars already collected: the boundary day of this
                    # chunk is fetched again by the next chunk (inclusive range).
                    new_bars = [bar for bar in results if bar["t"] not in seen_ts]

                    if new_bars:
                        seen_ts.update(bar["t"] for bar in new_bars)
                        all_bars.extend(new_bars)
                        logger.info(
                            f" {polygon_symbol} {start_str}→{end_str}: "
                            f"{len(new_bars)} bars (total: {len(all_bars)})"
                        )
                    else:
                        logger.debug(f" {polygon_symbol} {start_str}→{end_str}: no data")

            except aiohttp.ClientError as e:
                # Network failure: log, then advance past this chunk below
                # (best-effort — the chunk's data is skipped, not retried).
                logger.error(f"Request failed for {polygon_symbol}: {e}")

            current_start = chunk_end
            await asyncio.sleep(0.5)  # Respect rate limits (~2 req/s)

    return all_bars
|
|
|
|
|
|
# ============================================================================
|
|
# POSTGRESQL INSERTER
|
|
# ============================================================================
|
|
|
|
def get_ticker_id(conn, symbol: str) -> int:
    """
    Resolve a symbol to its numeric id in market_data.tickers.

    The lookup is case-insensitive on both sides.

    Args:
        conn: psycopg2 connection
        symbol: Symbol name (e.g., 'XAUUSD')

    Returns:
        Ticker ID

    Raises:
        ValueError if ticker not found
    """
    query = "SELECT id FROM market_data.tickers WHERE UPPER(symbol) = UPPER(%s)"
    with conn.cursor() as cursor:
        cursor.execute(query, (symbol,))
        record = cursor.fetchone()
    if record:
        return record[0]
    raise ValueError(f"Ticker '{symbol}' not found in market_data.tickers")
|
|
|
|
|
|
def get_last_timestamp(conn, ticker_id: int) -> datetime:
    """Return the newest ingested bar timestamp for *ticker_id*, or None if
    the ticker has no rows yet (used to resume incremental syncs)."""
    sql = "SELECT MAX(timestamp) FROM market_data.ohlcv_5m WHERE ticker_id = %s"
    with conn.cursor() as cursor:
        cursor.execute(sql, (ticker_id,))
        result = cursor.fetchone()
    # MAX() over zero rows yields a NULL cell — treat both shapes as "none".
    if not result or not result[0]:
        return None
    return result[0]
|
|
|
|
|
|
def insert_bars_to_postgres(ticker_id: int, bars: list) -> int:
    """
    Insert OHLCV bars into PostgreSQL market_data.ohlcv_5m.
    Uses ON CONFLICT for upsert behavior. Manages its own connection.

    Args:
        ticker_id: Ticker ID from market_data.tickers
        bars: List of Polygon API bar dicts

    Returns:
        Number of rows inserted/updated
    """
    if not bars:
        return 0

    # Upsert on the (ticker_id, timestamp) key; re-ingesting a range refreshes
    # existing rows instead of failing.
    insert_sql = """
        INSERT INTO market_data.ohlcv_5m
        (ticker_id, timestamp, open, high, low, close, volume, vwap, ts_epoch)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (ticker_id, timestamp) DO UPDATE SET
            open = EXCLUDED.open,
            high = EXCLUDED.high,
            low = EXCLUDED.low,
            close = EXCLUDED.close,
            volume = EXCLUDED.volume,
            vwap = EXCLUDED.vwap
    """

    # Convert bars to tuples matching the VALUES placeholder order.
    rows = []
    for bar in bars:
        ts_ms = bar["t"]  # milliseconds epoch
        # NOTE(review): fromtimestamp() without tz converts to the *local*
        # timezone — stored timestamps depend on the server's TZ setting.
        # Confirm consumers expect local time (vs. UTC) before changing.
        timestamp = datetime.fromtimestamp(ts_ms / 1000)
        rows.append((
            ticker_id,
            timestamp,
            bar["o"],  # open
            bar["h"],  # high
            bar["l"],  # low
            bar["c"],  # close
            bar.get("v", 0),  # volume
            # NOTE(review): `or None` also maps a literal 0 vwap to NULL,
            # not just a missing key — presumably intentional; verify.
            bar.get("vw") or None,  # vwap (nullable)
            ts_ms,  # ts_epoch in ms
        ))

    # Insert in smaller batches with fresh connection per batch group
    batch_size = 500  # Small batches for reliability
    total_inserted = 0
    conn = None

    try:
        conn = psycopg2.connect(**PG_CONFIG)
        for i in range(0, len(rows), batch_size):
            batch = rows[i : i + batch_size]
            # Each batch gets up to 3 attempts; transient OperationalErrors
            # trigger a reconnect before retrying the same batch.
            retries = 3
            while retries > 0:
                try:
                    with conn.cursor() as cur:
                        psycopg2.extras.execute_batch(cur, insert_sql, batch, page_size=100)
                        conn.commit()
                        total_inserted += len(batch)
                    break
                except psycopg2.OperationalError as e:
                    retries -= 1
                    logger.warning(f" DB error ({3 - retries}/3): {str(e)[:50]}")
                    # Discard the (possibly broken) connection and open a
                    # fresh one before the next attempt.
                    try:
                        conn.close()
                    except Exception:
                        pass
                    conn = psycopg2.connect(**PG_CONFIG)
                    if retries == 0:
                        raise
            # Log progress every 20 batches (~10k rows at batch_size=500)
            batch_num = i // batch_size + 1
            if batch_num % 20 == 0:
                logger.info(f" Progress: {total_inserted:,} / {len(rows):,} rows")
    finally:
        # Best-effort close; the connection may already be broken/closed.
        if conn:
            try:
                conn.close()
            except Exception:
                pass

    return total_inserted
|
|
|
|
|
|
# ============================================================================
|
|
# MAIN
|
|
# ============================================================================
|
|
|
|
async def ingest_symbol(
    api_key: str,
    symbol: str,
    start_date: datetime,
    end_date: datetime,
    incremental: bool = False,
) -> dict:
    """
    Run the complete ingestion pipeline for one symbol: resolve the ticker
    id, fetch bars from Polygon, and bulk-insert them into PostgreSQL.

    A fresh DB connection is opened per symbol to avoid stale connections.

    Returns:
        Dict with results summary ('symbol', 'status', and per-status extras)
    """
    cfg = SYMBOL_CONFIG.get(symbol)
    if not cfg:
        logger.error(f"Unknown symbol: {symbol}")
        return {"symbol": symbol, "status": "error", "error": "Unknown symbol"}

    polygon_symbol = cfg["polygon"]

    # Fresh connection for this symbol.
    try:
        db_conn = psycopg2.connect(**PG_CONFIG)
    except Exception as e:
        logger.error(f"PostgreSQL connection failed: {e}")
        return {"symbol": symbol, "status": "error", "error": str(e)}

    try:
        # A missing ticker row is a configuration problem, not a data problem.
        try:
            ticker_id = get_ticker_id(db_conn, symbol)
        except ValueError as e:
            logger.error(str(e))
            return {"symbol": symbol, "status": "error", "error": str(e)}

        # Incremental mode resumes 5 minutes past the newest stored bar.
        actual_start = start_date
        if incremental:
            newest = get_last_timestamp(db_conn, ticker_id)
            if newest:
                actual_start = newest + timedelta(minutes=5)
                logger.info(f" Incremental: continuing from {actual_start}")

        if actual_start >= end_date:
            logger.info(f" {symbol}: already up to date")
            return {"symbol": symbol, "status": "up_to_date", "rows": 0}

        # Pull bars from Polygon for the effective window.
        logger.info(f"Fetching {symbol} ({polygon_symbol}) from {actual_start.date()} to {end_date.date()}...")
        bars = await fetch_polygon_bars(
            api_key=api_key,
            polygon_symbol=polygon_symbol,
            start_date=actual_start,
            end_date=end_date,
        )

        if not bars:
            logger.warning(f" {symbol}: no data received from API")
            return {"symbol": symbol, "status": "no_data", "rows": 0}

        # Report the actual covered range before inserting.
        first_bar_ts = datetime.fromtimestamp(bars[0]["t"] / 1000)
        last_bar_ts = datetime.fromtimestamp(bars[-1]["t"] / 1000)
        logger.info(f" {symbol}: {len(bars)} bars from {first_bar_ts} to {last_bar_ts}")

        # The inserter manages its own connection — release ours first.
        db_conn.close()
        logger.info(f" Inserting {len(bars)} bars into market_data.ohlcv_5m...")
        inserted = insert_bars_to_postgres(ticker_id, bars)
        logger.info(f" {symbol}: {inserted} rows inserted/updated")

        return {
            "symbol": symbol,
            "status": "success",
            "rows": inserted,
            "first_bar": str(first_bar_ts),
            "last_bar": str(last_bar_ts),
        }
    finally:
        # Idempotent cleanup — the connection may already be closed above.
        try:
            db_conn.close()
        except Exception:
            pass
|
|
|
|
|
|
async def main():
    """CLI entry point: parse arguments, verify DB connectivity, ingest each
    requested symbol sequentially, then print a per-symbol summary."""
    cli = argparse.ArgumentParser(description="Ingest OHLCV data from Polygon → PostgreSQL")
    cli.add_argument(
        "--symbols",
        nargs="+",
        default=list(SYMBOL_CONFIG.keys()),
        help=f"Symbols to ingest (default: all {len(SYMBOL_CONFIG)})",
    )
    cli.add_argument(
        "--start",
        type=str,
        default="2020-01-01",
        help="Start date YYYY-MM-DD (default: 2020-01-01)",
    )
    cli.add_argument(
        "--end",
        type=str,
        default=datetime.now().strftime("%Y-%m-%d"),
        help="End date YYYY-MM-DD (default: today)",
    )
    cli.add_argument(
        "--incremental",
        action="store_true",
        help="Only fetch data after last timestamp in DB",
    )
    cli.add_argument(
        "--api-key",
        type=str,
        default=os.getenv("POLYGON_API_KEY", ""),
        help="Polygon API key (or set POLYGON_API_KEY env var)",
    )

    opts = cli.parse_args()

    # An API key is mandatory — bail out early with a clear message.
    api_key = opts.api_key
    if not api_key:
        logger.error("POLYGON_API_KEY is required. Set env var or use --api-key")
        sys.exit(1)

    start_date = datetime.strptime(opts.start, "%Y-%m-%d")
    end_date = datetime.strptime(opts.end, "%Y-%m-%d")

    wide_rule = "=" * 60
    logger.info(wide_rule)
    logger.info("OHLCV Data Ingestion: Polygon → PostgreSQL")
    logger.info(wide_rule)
    logger.info(f"Symbols: {opts.symbols}")
    logger.info(f"Date range: {start_date.date()} → {end_date.date()}")
    logger.info(f"Incremental: {opts.incremental}")
    logger.info(f"Database: {PG_CONFIG['host']}:{PG_CONFIG['port']}/{PG_CONFIG['dbname']}")

    # Fail fast if PostgreSQL is unreachable before touching the API.
    try:
        probe = psycopg2.connect(**PG_CONFIG)
        probe.close()
        logger.info("PostgreSQL connectivity verified")
    except Exception as e:
        logger.error(f"PostgreSQL connection failed: {e}")
        sys.exit(1)

    # Process each symbol (each gets its own connection).
    narrow_rule = "=" * 40
    results = []
    for sym in opts.symbols:
        logger.info("\n" + narrow_rule)
        logger.info(f"Processing {sym}")
        logger.info(narrow_rule)

        outcome = await ingest_symbol(
            api_key=api_key,
            symbol=sym,
            start_date=start_date,
            end_date=end_date,
            incremental=opts.incremental,
        )
        results.append(outcome)

    # Summary table.
    logger.info("\n" + wide_rule)
    logger.info("INGESTION SUMMARY")
    logger.info(wide_rule)

    icons = {
        "success": "[OK]",
        "no_data": "[EMPTY]",
        "up_to_date": "[SKIP]",
        "error": "[FAIL]",
    }

    total_rows = 0
    for outcome in results:
        tag = icons.get(outcome["status"], "[?]")
        rows = outcome.get("rows", 0)
        total_rows += rows
        logger.info(f" {tag} {outcome['symbol']}: {rows:,} rows")

    logger.info(f"\nTotal rows inserted: {total_rows:,}")
    logger.info(wide_rule)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: drive the async ingestion pipeline to completion.
    asyncio.run(main())
|