#!/usr/bin/env python3
"""
OHLCV Data Ingestion from Polygon API → PostgreSQL
====================================================

Downloads historical 5-minute OHLCV data for all ML training symbols
and inserts into PostgreSQL market_data.ohlcv_5m table.

Symbols: XAUUSD, EURUSD, GBPUSD, BTCUSD, USDJPY, GBPJPY, AUDUSD

Usage:
    # Set API key
    export POLYGON_API_KEY="your_key_here"

    # Download all symbols (default: 2020-01-01 to today)
    python scripts/ingest_ohlcv_polygon.py

    # Download specific symbols
    python scripts/ingest_ohlcv_polygon.py --symbols XAUUSD EURUSD

    # Custom date range
    python scripts/ingest_ohlcv_polygon.py --start 2023-01-01 --end 2025-12-31

    # Incremental mode (only fetch from last timestamp in DB)
    python scripts/ingest_ohlcv_polygon.py --incremental

Author: ML-Specialist (NEXUS v4.0)
Created: 2026-01-27
"""

import argparse
import asyncio
import os
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path

import aiohttp
import psycopg2
import psycopg2.extras
from loguru import logger

# Configure logging
logger.remove()
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")

# ============================================================================
# CONFIGURATION
# ============================================================================

POLYGON_BASE_URL = "https://api.polygon.io"

# Symbol → Polygon ticker prefix mapping
# (C: = forex pairs, X: = crypto pairs in Polygon's ticker namespace)
SYMBOL_CONFIG = {
    "XAUUSD": {"polygon": "C:XAUUSD", "type": "forex"},
    "EURUSD": {"polygon": "C:EURUSD", "type": "forex"},
    "GBPUSD": {"polygon": "C:GBPUSD", "type": "forex"},
    "BTCUSD": {"polygon": "X:BTCUSD", "type": "crypto"},
    "USDJPY": {"polygon": "C:USDJPY", "type": "forex"},
    "GBPJPY": {"polygon": "C:GBPJPY", "type": "forex"},
    "AUDUSD": {"polygon": "C:AUDUSD", "type": "forex"},
}

# PostgreSQL connection (matches trading_platform config)
# NOTE(review): the hard-coded fallback password is a dev-only default;
# production deployments must set DB_PASSWORD in the environment.
PG_CONFIG = {
    "host": os.getenv("DB_HOST", "localhost"),
    "port": int(os.getenv("DB_PORT", "5432")),
    "dbname": os.getenv("DB_NAME", "trading_platform"),
    "user": os.getenv("DB_USER", "trading_user"),
    "password": os.getenv("DB_PASSWORD", "trading_dev_2026"),
}


def _utc_naive(ts_ms: int) -> datetime:
    """Convert a Polygon epoch-milliseconds timestamp to a naive UTC datetime.

    FIX: the previous code used ``datetime.fromtimestamp(ts_ms / 1000)``,
    which interprets the epoch in the *host's local timezone* — the same bar
    would get different timestamps depending on where the script ran.
    Polygon ``t`` values are UTC epoch milliseconds, so we convert in UTC and
    strip the tzinfo to stay compatible with the naive datetimes used
    elsewhere (strptime() dates, DB comparisons in incremental mode).
    """
    return datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc).replace(tzinfo=None)


# ============================================================================
# POLYGON API FETCHER
# ============================================================================

async def fetch_polygon_bars(
    api_key: str,
    polygon_symbol: str,
    start_date: datetime,
    end_date: datetime,
    multiplier: int = 5,
    timespan: str = "minute",
) -> list:
    """
    Fetch OHLCV bars from Polygon API with pagination and rate limiting.

    The range is walked in ~30-day chunks to stay under Polygon's 50k
    result limit per request. 429 responses retry the same chunk after the
    server-advised delay; other errors skip to the next chunk.

    Args:
        api_key: Polygon API key
        polygon_symbol: Full polygon symbol (e.g., 'C:XAUUSD')
        start_date: Start date
        end_date: End date
        multiplier: Timeframe multiplier (5 for 5-min)
        timespan: Timeframe span ('minute', 'hour', 'day')

    Returns:
        List of OHLCV bar dicts (Polygon's raw result objects)
    """
    all_bars = []
    current_start = start_date

    # Chunk by month to respect Polygon's 50k result limit
    async with aiohttp.ClientSession() as session:
        while current_start < end_date:
            chunk_end = min(current_start + timedelta(days=30), end_date)
            start_str = current_start.strftime("%Y-%m-%d")
            end_str = chunk_end.strftime("%Y-%m-%d")

            endpoint = (
                f"{POLYGON_BASE_URL}/v2/aggs/ticker/{polygon_symbol}"
                f"/range/{multiplier}/{timespan}/{start_str}/{end_str}"
            )
            params = {
                "apiKey": api_key,
                "adjusted": "true",
                "sort": "asc",
                "limit": 50000,
            }

            try:
                async with session.get(endpoint, params=params) as response:
                    if response.status == 429:
                        # FIX: a malformed Retry-After header would crash the
                        # whole ingestion; fall back to 60s instead.
                        try:
                            retry_after = int(response.headers.get("Retry-After", 60))
                        except ValueError:
                            retry_after = 60
                        logger.warning(f"Rate limited, waiting {retry_after}s...")
                        await asyncio.sleep(retry_after)
                        continue  # Retry same chunk

                    if response.status == 403:
                        # Unrecoverable for this symbol — abort the loop.
                        logger.error("API key unauthorized (403). Check POLYGON_API_KEY.")
                        break

                    if response.status != 200:
                        text = await response.text()
                        logger.error(f"API error {response.status}: {text[:200]}")
                        # FIX: previously this path `continue`d past the
                        # rate-limit sleep below, hammering the API on
                        # repeated errors. Fall through to advance + sleep.
                    else:
                        data = await response.json()
                        results = data.get("results", [])

                        if results:
                            all_bars.extend(results)
                            logger.info(
                                f"  {polygon_symbol} {start_str}→{end_str}: "
                                f"{len(results)} bars (total: {len(all_bars)})"
                            )
                        else:
                            logger.debug(f"  {polygon_symbol} {start_str}→{end_str}: no data")

            except aiohttp.ClientError as e:
                # Network-level failure: log and move on to the next chunk.
                logger.error(f"Request failed for {polygon_symbol}: {e}")

            current_start = chunk_end
            await asyncio.sleep(0.5)  # Respect rate limits (~2 req/s)

    return all_bars


# ============================================================================
# POSTGRESQL INSERTER
# ============================================================================

def get_ticker_id(conn, symbol: str) -> int:
    """
    Get ticker_id from market_data.tickers table.

    Args:
        conn: psycopg2 connection
        symbol: Symbol name (e.g., 'XAUUSD')

    Returns:
        Ticker ID

    Raises:
        ValueError: if ticker not found
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT id FROM market_data.tickers WHERE UPPER(symbol) = UPPER(%s)",
            (symbol,),
        )
        row = cur.fetchone()
        if not row:
            raise ValueError(f"Ticker '{symbol}' not found in market_data.tickers")
        return row[0]


def get_last_timestamp(conn, ticker_id: int) -> datetime | None:
    """Get the last ingested timestamp for a ticker (for incremental sync).

    Returns None when the ticker has no rows yet (MAX over an empty set).
    """
    with conn.cursor() as cur:
        cur.execute(
            "SELECT MAX(timestamp) FROM market_data.ohlcv_5m WHERE ticker_id = %s",
            (ticker_id,),
        )
        row = cur.fetchone()
        return row[0] if row and row[0] else None


def insert_bars_to_postgres(conn, ticker_id: int, bars: list) -> int:
    """
    Insert OHLCV bars into PostgreSQL market_data.ohlcv_5m.

    Uses ON CONFLICT for upsert behavior, so re-ingesting overlapping chunk
    boundaries (the fetcher re-requests the boundary day) is harmless.

    Args:
        conn: psycopg2 connection
        ticker_id: Ticker ID from market_data.tickers
        bars: List of Polygon API bar dicts

    Returns:
        Number of rows inserted/updated
    """
    if not bars:
        return 0

    insert_sql = """
        INSERT INTO market_data.ohlcv_5m
        (ticker_id, timestamp, open, high, low, close, volume, vwap, ts_epoch)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (ticker_id, timestamp) DO UPDATE SET
            open = EXCLUDED.open,
            high = EXCLUDED.high,
            low = EXCLUDED.low,
            close = EXCLUDED.close,
            volume = EXCLUDED.volume,
            vwap = EXCLUDED.vwap
    """

    # Convert bars to tuples
    rows = []
    for bar in bars:
        ts_ms = bar["t"]  # milliseconds epoch (UTC)
        # FIX: convert in UTC rather than the host's local timezone.
        timestamp = _utc_naive(ts_ms)
        rows.append((
            ticker_id,
            timestamp,
            bar["o"],            # open
            bar["h"],            # high
            bar["l"],            # low
            bar["c"],            # close
            bar.get("v", 0),     # volume
            bar.get("vw") or None,  # vwap (nullable; 0 treated as missing)
            ts_ms,               # ts_epoch in ms
        ))

    # Insert in batches, committing per batch so a mid-run failure keeps
    # earlier batches (the upsert makes a re-run safe).
    batch_size = 5000
    total_inserted = 0

    with conn.cursor() as cur:
        for i in range(0, len(rows), batch_size):
            batch = rows[i : i + batch_size]
            psycopg2.extras.execute_batch(cur, insert_sql, batch, page_size=1000)
            conn.commit()
            total_inserted += len(batch)

    return total_inserted


# ============================================================================
# MAIN
# ============================================================================

async def ingest_symbol(
    api_key: str,
    conn,
    symbol: str,
    start_date: datetime,
    end_date: datetime,
    incremental: bool = False,
) -> dict:
    """
    Full ingestion pipeline for a single symbol: resolve ticker_id,
    fetch bars from Polygon, upsert them into PostgreSQL.

    Returns:
        Dict with results summary (keys: symbol, status, rows, ...)
    """
    config = SYMBOL_CONFIG.get(symbol)
    if not config:
        logger.error(f"Unknown symbol: {symbol}")
        return {"symbol": symbol, "status": "error", "error": "Unknown symbol"}

    polygon_symbol = config["polygon"]

    # Get ticker ID from DB
    try:
        ticker_id = get_ticker_id(conn, symbol)
    except ValueError as e:
        logger.error(str(e))
        return {"symbol": symbol, "status": "error", "error": str(e)}

    # In incremental mode, start from last timestamp + 5 min
    actual_start = start_date
    if incremental:
        last_ts = get_last_timestamp(conn, ticker_id)
        if last_ts:
            actual_start = last_ts + timedelta(minutes=5)
            logger.info(f"  Incremental: continuing from {actual_start}")

    if actual_start >= end_date:
        logger.info(f"  {symbol}: already up to date")
        return {"symbol": symbol, "status": "up_to_date", "rows": 0}

    # Fetch from Polygon
    logger.info(f"Fetching {symbol} ({polygon_symbol}) from {actual_start.date()} to {end_date.date()}...")
    bars = await fetch_polygon_bars(
        api_key=api_key,
        polygon_symbol=polygon_symbol,
        start_date=actual_start,
        end_date=end_date,
    )

    if not bars:
        logger.warning(f"  {symbol}: no data received from API")
        return {"symbol": symbol, "status": "no_data", "rows": 0}

    # Show range (FIX: converted in UTC, matching the stored timestamps)
    first_ts = _utc_naive(bars[0]["t"])
    last_ts = _utc_naive(bars[-1]["t"])
    logger.info(f"  {symbol}: {len(bars)} bars from {first_ts} to {last_ts}")

    # Insert into PostgreSQL
    logger.info(f"  Inserting {len(bars)} bars into market_data.ohlcv_5m...")
    inserted = insert_bars_to_postgres(conn, ticker_id, bars)
    logger.info(f"  {symbol}: {inserted} rows inserted/updated")

    return {
        "symbol": symbol,
        "status": "success",
        "rows": inserted,
        "first_bar": str(first_ts),
        "last_bar": str(last_ts),
    }


async def main():
    """CLI entry point: parse args, connect to PostgreSQL, ingest each symbol."""
    parser = argparse.ArgumentParser(description="Ingest OHLCV data from Polygon → PostgreSQL")
    parser.add_argument(
        "--symbols",
        nargs="+",
        default=list(SYMBOL_CONFIG.keys()),
        help=f"Symbols to ingest (default: all {len(SYMBOL_CONFIG)})",
    )
    parser.add_argument(
        "--start",
        type=str,
        default="2020-01-01",
        help="Start date YYYY-MM-DD (default: 2020-01-01)",
    )
    parser.add_argument(
        "--end",
        type=str,
        default=datetime.now().strftime("%Y-%m-%d"),
        help="End date YYYY-MM-DD (default: today)",
    )
    parser.add_argument(
        "--incremental",
        action="store_true",
        help="Only fetch data after last timestamp in DB",
    )
    parser.add_argument(
        "--api-key",
        type=str,
        default=os.getenv("POLYGON_API_KEY", ""),
        help="Polygon API key (or set POLYGON_API_KEY env var)",
    )
    args = parser.parse_args()

    # Validate API key
    api_key = args.api_key
    if not api_key:
        logger.error("POLYGON_API_KEY is required. Set env var or use --api-key")
        sys.exit(1)

    start_date = datetime.strptime(args.start, "%Y-%m-%d")
    end_date = datetime.strptime(args.end, "%Y-%m-%d")

    logger.info("=" * 60)
    logger.info("OHLCV Data Ingestion: Polygon → PostgreSQL")
    logger.info("=" * 60)
    logger.info(f"Symbols: {args.symbols}")
    logger.info(f"Date range: {start_date.date()} → {end_date.date()}")
    logger.info(f"Incremental: {args.incremental}")
    logger.info(f"Database: {PG_CONFIG['host']}:{PG_CONFIG['port']}/{PG_CONFIG['dbname']}")

    # Connect to PostgreSQL
    try:
        conn = psycopg2.connect(**PG_CONFIG)
        logger.info("PostgreSQL connected")
    except Exception as e:
        logger.error(f"PostgreSQL connection failed: {e}")
        sys.exit(1)

    # Process each symbol.
    # FIX: close the connection in a finally block so an unexpected error in
    # one symbol's ingestion doesn't leak the connection.
    results = []
    try:
        for symbol in args.symbols:
            logger.info(f"\n{'='*40}")
            logger.info(f"Processing {symbol}")
            logger.info(f"{'='*40}")

            result = await ingest_symbol(
                api_key=api_key,
                conn=conn,
                symbol=symbol,
                start_date=start_date,
                end_date=end_date,
                incremental=args.incremental,
            )
            results.append(result)
    finally:
        conn.close()

    # Print summary
    logger.info("\n" + "=" * 60)
    logger.info("INGESTION SUMMARY")
    logger.info("=" * 60)

    total_rows = 0
    for r in results:
        status_icon = {
            "success": "[OK]",
            "no_data": "[EMPTY]",
            "up_to_date": "[SKIP]",
            "error": "[FAIL]",
        }.get(r["status"], "[?]")
        rows = r.get("rows", 0)
        total_rows += rows
        logger.info(f"  {status_icon} {r['symbol']}: {rows:,} rows")

    logger.info(f"\nTotal rows inserted: {total_rows:,}")
    logger.info("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())