feat: Add Polygon data fetch script
- scripts/fetch_polygon_data.py: Async script to fetch OHLCV data from Polygon API - Supports backfilling historical data (default 365 days) - Rate limiting for free tier (5 req/min) - Batch inserts with ON CONFLICT handling - Configurable via .env file Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
62a9f3e1d9
commit
0e20c7c75c
349
scripts/fetch_polygon_data.py
Normal file
349
scripts/fetch_polygon_data.py
Normal file
@ -0,0 +1,349 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Polygon Data Fetcher - Trading Platform
|
||||
Fetches historical OHLCV data from Polygon.io and stores in PostgreSQL.
|
||||
|
||||
Usage:
|
||||
~/venvs/data-service/bin/python scripts/fetch_polygon_data.py --days 30
|
||||
|
||||
Environment variables:
|
||||
POLYGON_API_KEY - Polygon.io API key
|
||||
DB_HOST - PostgreSQL host (default: localhost)
|
||||
DB_PORT - PostgreSQL port (default: 5432)
|
||||
DB_NAME - Database name (default: trading_platform)
|
||||
DB_USER - Database user (default: trading_user)
|
||||
DB_PASSWORD - Database password
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import argparse
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
import asyncpg
|
||||
import aiohttp
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

# Load environment variables from the project-root .env file
load_dotenv(Path(__file__).parent.parent / ".env")

# Polygon API configuration
POLYGON_API_KEY = os.getenv("POLYGON_API_KEY")  # required; checked in main()
POLYGON_BASE_URL = os.getenv("POLYGON_BASE_URL", "https://api.polygon.io")
# Requests per minute allowed before the client self-throttles (free tier: 5)
RATE_LIMIT = int(os.getenv("POLYGON_RATE_LIMIT", "5"))

# PostgreSQL connection settings
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = int(os.getenv("DB_PORT", "5432"))
DB_NAME = os.getenv("DB_NAME", "trading_platform")
DB_USER = os.getenv("DB_USER", "trading_user")
# NOTE(review): hard-coded fallback password is a dev convenience but a
# security smell -- ensure DB_PASSWORD is always set in real deployments.
DB_PASSWORD = os.getenv("DB_PASSWORD", "trading_dev_2026")
|
||||
|
||||
|
||||
class PolygonFetcher:
    """Thin Polygon.io REST client with a per-minute request budget."""

    def __init__(self, api_key: str):
        # Credentials and endpoint come from module-level configuration.
        self.api_key = api_key
        self.base_url = POLYGON_BASE_URL
        # Rolling rate-limit window state (count + window start time).
        self.request_count = 0
        self.last_request_time = datetime.min

    async def fetch_aggregates(
        self,
        ticker: str,
        multiplier: int,
        timespan: str,
        start_date: datetime,
        end_date: datetime,
        session: aiohttp.ClientSession
    ) -> list:
        """Fetch aggregate bars for *ticker*, following pagination links."""
        collected = []

        span_from = start_date.strftime("%Y-%m-%d")
        span_to = end_date.strftime("%Y-%m-%d")

        url = f"{self.base_url}/v2/aggs/ticker/{ticker}/range/{multiplier}/{timespan}/{span_from}/{span_to}"
        params = {
            "adjusted": "true",
            "sort": "asc",
            "limit": 50000,
            "apiKey": self.api_key
        }

        while url:
            # Stay under the per-minute budget before every request.
            await self._rate_limit()

            # Once a next_url is followed, the key is embedded in the URL
            # itself, so no separate query parameters are sent.
            query = None if "apiKey" in url else params
            async with session.get(url, params=query) as resp:
                if resp.status == 429:
                    # Server-side throttle: back off a full minute, retry same URL.
                    logger.warning("Rate limited, waiting 60s...")
                    await asyncio.sleep(60)
                    continue

                if resp.status != 200:
                    body = await resp.text()
                    logger.error(f"API error {resp.status}: {body}")
                    break

                payload = await resp.json()
                page = payload.get("results", [])
                collected.extend(page)

                if page:
                    logger.info(f" Fetched {len(page)} bars (total: {len(collected)})")

                # Follow pagination if the API handed us a continuation link.
                cursor = payload.get("next_url")
                if not cursor:
                    break
                url = cursor + f"&apiKey={self.api_key}"
                params = None

        return collected

    async def _rate_limit(self):
        """Throttle so at most RATE_LIMIT requests happen per 60s window."""
        current = datetime.now()
        elapsed = (current - self.last_request_time).total_seconds()

        # A full minute has passed: open a fresh counting window.
        if elapsed >= 60:
            self.request_count = 0
            self.last_request_time = current

        # Window budget exhausted: sleep out the remainder, then reset.
        if self.request_count >= RATE_LIMIT:
            wait_time = 60 - (current - self.last_request_time).total_seconds()
            if wait_time > 0:
                logger.info(f"Rate limit reached, waiting {wait_time:.1f}s...")
                await asyncio.sleep(wait_time)
            self.request_count = 0
            self.last_request_time = datetime.now()

        self.request_count += 1
|
||||
|
||||
|
||||
async def get_tickers(pool: asyncpg.Pool) -> list:
    """Return every active ticker row as a plain dict."""
    query = """
        SELECT id, symbol, polygon_ticker, asset_type
        FROM market_data.tickers
        WHERE is_active = true
        ORDER BY id
    """
    async with pool.acquire() as conn:
        records = await conn.fetch(query)
        return list(map(dict, records))
|
||||
|
||||
|
||||
async def get_last_timestamp(pool: asyncpg.Pool, ticker_id: int, table: str) -> datetime:
    """Return the newest bar timestamp stored for *ticker_id*, or None.

    NOTE(review): *table* is interpolated directly into the SQL; only call
    this with trusted, hard-coded table names (currently "ohlcv_5m").
    """
    query = f"""
        SELECT MAX(timestamp) as last_ts
        FROM market_data.{table}
        WHERE ticker_id = $1
    """
    async with pool.acquire() as conn:
        record = await conn.fetchrow(query, ticker_id)

    latest = record["last_ts"] if record and record["last_ts"] else None
    # Strip tzinfo so later comparisons with naive datetime.now() don't raise.
    if latest is not None and latest.tzinfo is not None:
        latest = latest.replace(tzinfo=None)
    return latest
|
||||
|
||||
|
||||
async def insert_bars(pool: asyncpg.Pool, ticker_id: int, bars: list, table: str) -> int:
    """Upsert Polygon OHLCV bars into market_data.*table*.

    Args:
        pool: asyncpg connection pool.
        ticker_id: Database id of the ticker the bars belong to.
        bars: Polygon aggregate dicts with keys "t", "o", "h", "l", "c"
            and optionally "v" and "vw".
        table: Target table name (trusted, hard-coded by callers).

    Returns:
        Number of rows written (0 for an empty *bars* list).
    """
    if not bars:
        return 0

    # Convert API dicts into positional tuples for executemany.
    rows = []
    for bar in bars:
        # Polygon "t" is epoch milliseconds.
        # NOTE(review): fromtimestamp() yields a *local-time* naive datetime;
        # the rest of the script also works in naive local time, so this is
        # internally consistent -- confirm it matches the intended storage
        # convention (UTC vs local) for the ohlcv tables.
        ts = datetime.fromtimestamp(bar["t"] / 1000)
        vw = bar.get("vw")
        rows.append((
            ticker_id,
            ts,
            float(bar["o"]),
            float(bar["h"]),
            float(bar["l"]),
            float(bar["c"]),
            float(bar.get("v", 0)),
            # Bug fix: the previous truthiness test mapped a legitimate vwap
            # of 0 to NULL; only genuinely missing values become NULL now.
            float(vw) if vw is not None else None,
            int(ts.timestamp())
        ))

    async with pool.acquire() as conn:
        await conn.executemany(
            f"""
            INSERT INTO market_data.{table}
                (ticker_id, timestamp, open, high, low, close, volume, vwap, ts_epoch)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
            ON CONFLICT (ticker_id, timestamp) DO UPDATE SET
                open = EXCLUDED.open,
                high = EXCLUDED.high,
                low = EXCLUDED.low,
                close = EXCLUDED.close,
                volume = EXCLUDED.volume,
                vwap = EXCLUDED.vwap
            """,
            rows
        )

    return len(rows)
|
||||
|
||||
|
||||
async def sync_ticker(
    fetcher: PolygonFetcher,
    pool: asyncpg.Pool,
    ticker: dict,
    backfill_days: int,
    session: aiohttp.ClientSession
) -> dict:
    """Fetch new 5-minute bars for one ticker and store them.

    Returns a summary dict with "symbol", "status" and "rows" keys
    (plus "error" when the sync failed).
    """
    symbol = ticker["symbol"]
    polygon_ticker = ticker["polygon_ticker"]
    ticker_id = ticker["id"]

    banner = "=" * 50
    logger.info(f"\n{banner}")
    logger.info(f"Syncing {symbol} ({polygon_ticker})")
    logger.info(f"{banner}")

    # Resume 5 minutes after the newest stored bar, or backfill from scratch.
    last_ts = await get_last_timestamp(pool, ticker_id, "ohlcv_5m")
    if last_ts is None:
        start_date = datetime.now() - timedelta(days=backfill_days)
        logger.info(f"Backfilling {backfill_days} days from: {start_date}")
    else:
        start_date = last_ts + timedelta(minutes=5)
        logger.info(f"Continuing from: {start_date}")

    end_date = datetime.now()
    if start_date >= end_date:
        logger.info("Already up to date")
        return {"symbol": symbol, "status": "up_to_date", "rows": 0}

    # Any failure (API or DB) is reported in the result dict, never raised,
    # so one bad ticker cannot abort the whole run.
    try:
        bars = await fetcher.fetch_aggregates(
            ticker=polygon_ticker,
            multiplier=5,
            timespan="minute",
            start_date=start_date,
            end_date=end_date,
            session=session
        )

        if not bars:
            logger.info("No new data available")
            return {"symbol": symbol, "status": "no_data", "rows": 0}

        inserted = await insert_bars(pool, ticker_id, bars, "ohlcv_5m")
        logger.info(f"Inserted {inserted} rows for {symbol}")
        return {"symbol": symbol, "status": "success", "rows": inserted}

    except Exception as exc:
        logger.error(f"Error syncing {symbol}: {exc}")
        return {"symbol": symbol, "status": "error", "error": str(exc), "rows": 0}
|
||||
|
||||
|
||||
async def main(backfill_days: int = 30, symbols: list = None):
    """Fetch and store OHLCV data for every active ticker.

    Args:
        backfill_days: Days of history to request for tickers with no
            stored data yet.
        symbols: Optional list of ticker symbols; when given, only those
            tickers are synced.
    """
    logger.info("="*60)
    logger.info("Polygon Data Fetcher - Trading Platform")
    logger.info("="*60)

    # Fail fast: nothing else can work without an API key.
    if not POLYGON_API_KEY:
        logger.error("POLYGON_API_KEY is required!")
        return

    logger.info(f"Polygon API Key: {POLYGON_API_KEY[:10]}...")
    logger.info(f"Database: {DB_USER}@{DB_HOST}:{DB_PORT}/{DB_NAME}")
    logger.info(f"Backfill days: {backfill_days}")

    # Connect to database
    logger.info("\nConnecting to database...")
    pool = await asyncpg.create_pool(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
        min_size=2,
        max_size=5
    )

    # Bug fix: close the pool even when an error escapes below; previously an
    # exception (e.g. from get_tickers) left the pool open.
    try:
        # Get tickers
        tickers = await get_tickers(pool)
        logger.info(f"Found {len(tickers)} active tickers")

        if symbols:
            tickers = [t for t in tickers if t["symbol"] in symbols]
            logger.info(f"Filtered to {len(tickers)} tickers: {[t['symbol'] for t in tickers]}")

        # Create fetcher
        fetcher = PolygonFetcher(POLYGON_API_KEY)

        # Sync tickers sequentially; concurrent fetches would blow the
        # free-tier rate limit.
        results = []
        async with aiohttp.ClientSession() as session:
            for ticker in tickers:
                result = await sync_ticker(
                    fetcher=fetcher,
                    pool=pool,
                    ticker=ticker,
                    backfill_days=backfill_days,
                    session=session
                )
                results.append(result)

                # Small delay between tickers
                await asyncio.sleep(1)

        # Print summary
        logger.info("\n" + "="*60)
        logger.info("SYNC SUMMARY")
        logger.info("="*60)

        total_rows = 0
        for r in results:
            status_icon = "✓" if r["status"] == "success" else "○" if r["status"] == "up_to_date" else "✗"
            logger.info(f" {status_icon} {r['symbol']}: {r['status']} ({r['rows']} rows)")
            total_rows += r["rows"]

        logger.info(f"\nTotal rows inserted: {total_rows}")
    finally:
        # Always release DB connections, success or failure.
        await pool.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Parse CLI flags, then hand off to the async entry point.
    cli = argparse.ArgumentParser(description="Fetch Polygon data")
    cli.add_argument("--days", type=int, default=30, help="Days to backfill")
    cli.add_argument("--symbols", nargs="+", help="Specific symbols to sync")
    options = cli.parse_args()

    asyncio.run(main(backfill_days=options.days, symbols=options.symbols))
|
||||
Loading…
Reference in New Issue
Block a user