trading-platform-ml-engine-v2/scripts/download_btcusd_polygon.py
rckrdmrd 75c4d07690 feat: Initial commit - ML Engine codebase
Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)

Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations

Note: Trained models (*.joblib, *.pt) are gitignored.
      Regenerate with training scripts.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 04:27:40 -06:00
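
The commit message above sketches a three-level stacking design. The fragment below only illustrates that idea and is not the project's code: every class, variable, and parameter name in it is hypothetical, since src/pipelines/hierarchical_pipeline.py and the model classes under src/models/ are not shown here, and the level-2 neural gating component is omitted for brevity.

# Hypothetical three-level stacking sketch (illustrative names only).
import numpy as np
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 8))                                # toy bar-level feature matrix
y = (X[:, 0] + 0.5 * rng.normal(size=1000) > 0).astype(int)   # toy direction label

# Level 0: volatility/flow regime classifier (stand-in for the attention models)
level0 = XGBClassifier(n_estimators=50, max_depth=3).fit(X, (X[:, 1] > 0).astype(int))
regime_proba = level0.predict_proba(X)[:, [1]]

# Level 1: base model per symbol/timeframe (a single one shown here)
level1 = XGBClassifier(n_estimators=200, max_depth=4).fit(X, y)
base_proba = level1.predict_proba(X)[:, [1]]

# Level 2: stacking metamodel over the lower-level outputs
# (a real pipeline would use out-of-fold predictions to avoid leakage)
meta_X = np.hstack([regime_proba, base_proba])
level2 = XGBClassifier(n_estimators=50, max_depth=2).fit(meta_X, y)
final_proba = level2.predict_proba(meta_X)[:, 1]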


#!/usr/bin/env python3
"""
Download BTCUSD data from Polygon API and insert into MySQL database.
Updates outdated BTCUSD data (2015-2017) with current data (2020-2025).
"""
import asyncio
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
import pandas as pd
import pymysql
from loguru import logger
# Configure logging
logger.remove()
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")
# Polygon API configuration
POLYGON_API_KEY = "f09bA2V7OG7bHn4HxIT6Xs45ujg_pRXk"
POLYGON_BASE_URL = "https://api.polygon.io"
# MySQL configuration (from config/database.yaml)
MYSQL_CONFIG = {
    "host": "72.60.226.4",
    "port": 3306,
    "user": "root",
    "password": "AfcItz2391,.",
    "database": "db_trading_meta"
}


async def fetch_polygon_data(
    symbol: str,
    start_date: datetime,
    end_date: datetime,
    timeframe_multiplier: int = 5,
    timeframe_span: str = "minute"
) -> list:
    """
    Fetch OHLCV data from Polygon API.

    Args:
        symbol: Symbol with prefix (e.g., 'X:BTCUSD')
        start_date: Start date
        end_date: End date
        timeframe_multiplier: Timeframe multiplier (5 for 5-minute)
        timeframe_span: Timeframe span ('minute', 'hour', 'day')

    Returns:
        List of OHLCV bars
    """
    import aiohttp

    all_bars = []
    current_start = start_date

    # Polygon limits to ~50k results per request, so we chunk by month
    while current_start < end_date:
        chunk_end = min(current_start + timedelta(days=30), end_date)
        start_str = current_start.strftime("%Y-%m-%d")
        end_str = chunk_end.strftime("%Y-%m-%d")

        endpoint = f"{POLYGON_BASE_URL}/v2/aggs/ticker/{symbol}/range/{timeframe_multiplier}/{timeframe_span}/{start_str}/{end_str}"
        params = {
            "apiKey": POLYGON_API_KEY,
            "adjusted": "true",
            "sort": "asc",
            "limit": 50000
        }

        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(endpoint, params=params) as response:
                    if response.status == 429:
                        logger.warning("Rate limited, waiting 60s...")
                        await asyncio.sleep(60)
                        continue
                    if response.status != 200:
                        text = await response.text()
                        logger.error(f"API error {response.status}: {text}")
                        current_start = chunk_end
                        continue
                    data = await response.json()
                    results = data.get("results", [])
                    if results:
                        all_bars.extend(results)
                        logger.info(f" Fetched {len(results)} bars for {start_str} to {end_str}")
                    else:
                        logger.warning(f" No data for {start_str} to {end_str}")
            except Exception as e:
                logger.error(f"Request failed: {e}")

        current_start = chunk_end
        await asyncio.sleep(0.5)  # Rate limit: ~2 requests per second

    return all_bars
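
# For reference: each element returned above is one Polygon aggregate bar. The
# exact payload is not documented in this repo; the shape below is inferred
# from the keys this script reads (field values are made up):
#
#   {
#       "t": 1577836800000,   # bar start time, Unix epoch milliseconds
#       "o": 7195.2,          # open
#       "h": 7196.0,          # high
#       "l": 7178.0,          # low
#       "c": 7180.7,          # close
#       "v": 512.3,           # volume (may be missing, hence bar.get("v", 0))
#       "vw": 7186.1,         # volume-weighted average price (may be missing)
#   }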


def insert_to_mysql(bars: list, ticker: str):
    """
    Insert OHLCV bars into MySQL database.

    Args:
        bars: List of Polygon API bar objects
        ticker: Ticker symbol (e.g., 'X:BTCUSD')

    Returns:
        Number of rows inserted/updated.
    """
    if not bars:
        logger.warning("No bars to insert")
        return 0

    conn = pymysql.connect(**MYSQL_CONFIG)
    cursor = conn.cursor()
    try:
        # Prepare data
        insert_query = """
            INSERT INTO tickers_agg_data (ticker, date_agg, open, high, low, close, volume, vwap, ts, periodint)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                open = VALUES(open),
                high = VALUES(high),
                low = VALUES(low),
                close = VALUES(close),
                volume = VALUES(volume),
                vwap = VALUES(vwap),
                ts = VALUES(ts)
        """

        # Convert bars to tuples
        rows = []
        for bar in bars:
            timestamp = datetime.fromtimestamp(bar["t"] / 1000)
            ts_epoch = bar["t"]  # milliseconds
            rows.append((
                ticker,
                timestamp,
                bar["o"],            # open
                bar["h"],            # high
                bar["l"],            # low
                bar["c"],            # close
                bar.get("v", 0),     # volume
                bar.get("vw") or 0,  # vwap (can't be NULL)
                ts_epoch,            # timestamp in milliseconds
                5,                   # periodint (5-minute bars)
            ))

        # Insert in batches
        batch_size = 5000
        total_inserted = 0
        for i in range(0, len(rows), batch_size):
            batch = rows[i:i+batch_size]
            cursor.executemany(insert_query, batch)
            conn.commit()
            total_inserted += len(batch)
            logger.info(f" Inserted batch {i//batch_size + 1}: {len(batch)} rows (total: {total_inserted})")

        return total_inserted
    except Exception as e:
        logger.error(f"Insert failed: {e}")
        conn.rollback()
        raise
    finally:
        cursor.close()
        conn.close()
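
# The tickers_agg_data table itself is not defined in this repo. A schema along
# these lines is assumed from the INSERT above (column types are illustrative):
#
#   CREATE TABLE tickers_agg_data (
#       ticker    VARCHAR(32) NOT NULL,
#       date_agg  DATETIME    NOT NULL,
#       open      DOUBLE,
#       high      DOUBLE,
#       low       DOUBLE,
#       close     DOUBLE,
#       volume    DOUBLE,
#       vwap      DOUBLE,
#       ts        BIGINT,      -- bar timestamp in epoch milliseconds
#       periodint INT,         -- bar size in minutes (5 for this script)
#       UNIQUE KEY uq_bar (ticker, date_agg, periodint)
#   );
#
# ON DUPLICATE KEY UPDATE only deduplicates if such a unique key (or an
# equivalent primary key) exists; that is what makes re-runs idempotent.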


def get_existing_data_range(ticker: str) -> tuple:
    """
    Get existing data range for ticker.

    Returns:
        Tuple of (min_date, max_date, count)
    """
    conn = pymysql.connect(**MYSQL_CONFIG)
    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT MIN(date_agg), MAX(date_agg), COUNT(*)
            FROM tickers_agg_data
            WHERE ticker = %s
        """, (ticker,))
        row = cursor.fetchone()
        return row
    finally:
        cursor.close()
        conn.close()


async def main():
    """Main function to download and insert BTCUSD data."""
    ticker = "X:BTCUSD"

    logger.info("=" * 60)
    logger.info("BTCUSD Data Download from Polygon API")
    logger.info("=" * 60)

    # Check existing data
    min_date, max_date, count = get_existing_data_range(ticker)
    logger.info(f"\nExisting data for {ticker}:")
    logger.info(f" Range: {min_date} to {max_date}")
    logger.info(f" Records: {count:,}")

    # Define download range (2020-2025)
    start_date = datetime(2020, 1, 1)
    end_date = datetime(2025, 12, 31)
    logger.info(f"\nDownloading new data:")
    logger.info(f" Range: {start_date.date()} to {end_date.date()}")
    logger.info(f" Timeframe: 5-minute bars")

    # Fetch data
    logger.info("\n[1/2] Fetching data from Polygon API...")
    bars = await fetch_polygon_data(
        symbol=ticker,
        start_date=start_date,
        end_date=end_date,
        timeframe_multiplier=5,
        timeframe_span="minute"
    )
    logger.info(f"\nTotal bars fetched: {len(bars):,}")

    if not bars:
        logger.error("No data fetched. Check API key and permissions.")
        return

    # Show sample
    if bars:
        first_bar = bars[0]
        last_bar = bars[-1]
        first_ts = datetime.fromtimestamp(first_bar["t"] / 1000)
        last_ts = datetime.fromtimestamp(last_bar["t"] / 1000)
        logger.info(f" First bar: {first_ts}")
        logger.info(f" Last bar: {last_ts}")

    # Insert to MySQL
    logger.info("\n[2/2] Inserting data into MySQL...")
    inserted = insert_to_mysql(bars, ticker)
    logger.info(f"\nTotal rows inserted/updated: {inserted:,}")

    # Verify new data range
    min_date, max_date, count = get_existing_data_range(ticker)
    logger.info(f"\nUpdated data for {ticker}:")
    logger.info(f" Range: {min_date} to {max_date}")
    logger.info(f" Records: {count:,}")

    logger.info("\n" + "=" * 60)
    logger.info("Download complete!")
    logger.info("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())
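
Assuming the dependencies it imports (aiohttp, pymysql, pandas, loguru) are installed and the hardcoded Polygon key and MySQL credentials are still valid, the script would be run directly from the repository root, e.g. python scripts/download_btcusd_polygon.py. Because all inserts go through ON DUPLICATE KEY UPDATE, re-running it over the same date range updates existing bars in place rather than duplicating them, provided the table carries the unique key discussed above.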