Hierarchical ML Pipeline for trading predictions:
- Level 0: Attention Models (volatility/flow classification)
- Level 1: Base Models (XGBoost per symbol/timeframe)
- Level 2: Metamodels (XGBoost Stacking + Neural Gating)
Key components:
- src/pipelines/hierarchical_pipeline.py - Main prediction pipeline
- src/models/ - All ML model classes
- src/training/ - Training utilities
- src/api/ - FastAPI endpoints
- scripts/ - Training and evaluation scripts
- config/ - YAML configurations
Note: Trained models (*.joblib, *.pt) are gitignored.
Regenerate with training scripts.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
273 lines · 7.7 KiB · Python
#!/usr/bin/env python3
"""
Download BTCUSD data from Polygon API and insert into MySQL database.

Updates outdated BTCUSD data (2015-2017) with current data (2020-2025).
"""

import asyncio
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

import pandas as pd
import pymysql
from loguru import logger

# Configure logging
logger.remove()
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")


# Polygon API configuration.
# SECURITY: the API key and DB password below were previously hardcoded in
# source. They are now read from the environment first; the hardcoded values
# remain only as backward-compatible fallbacks and should be removed — and
# the exposed secrets rotated — as soon as deployment env vars are in place.
POLYGON_API_KEY = os.environ.get("POLYGON_API_KEY", "f09bA2V7OG7bHn4HxIT6Xs45ujg_pRXk")
POLYGON_BASE_URL = "https://api.polygon.io"

# MySQL configuration (from config/database.yaml)
MYSQL_CONFIG = {
    "host": os.environ.get("MYSQL_HOST", "72.60.226.4"),
    "port": int(os.environ.get("MYSQL_PORT", "3306")),
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "AfcItz2391,."),
    "database": os.environ.get("MYSQL_DATABASE", "db_trading_meta"),
}
|
async def fetch_polygon_data(
    symbol: str,
    start_date: datetime,
    end_date: datetime,
    timeframe_multiplier: int = 5,
    timeframe_span: str = "minute",
) -> list:
    """
    Fetch OHLCV data from Polygon API.

    Requests are chunked into ~30-day windows because Polygon caps each
    response at ~50k results. A single ClientSession is now reused across
    all chunks so TCP connections are pooled instead of being re-created
    per request (the original opened a fresh session for every chunk).

    Args:
        symbol: Symbol with prefix (e.g., 'X:BTCUSD')
        start_date: Start date
        end_date: End date
        timeframe_multiplier: Timeframe multiplier (5 for 5-minute)
        timeframe_span: Timeframe span ('minute', 'hour', 'day')

    Returns:
        List of OHLCV bars (raw Polygon result dicts with keys like
        't', 'o', 'h', 'l', 'c', 'v', 'vw').
    """
    import aiohttp

    all_bars: list = []
    current_start = start_date

    # Polygon limits to ~50k results per request, so we chunk by month.
    async with aiohttp.ClientSession() as session:
        while current_start < end_date:
            chunk_end = min(current_start + timedelta(days=30), end_date)

            start_str = current_start.strftime("%Y-%m-%d")
            end_str = chunk_end.strftime("%Y-%m-%d")

            endpoint = (
                f"{POLYGON_BASE_URL}/v2/aggs/ticker/{symbol}/range/"
                f"{timeframe_multiplier}/{timeframe_span}/{start_str}/{end_str}"
            )
            params = {
                "apiKey": POLYGON_API_KEY,
                "adjusted": "true",
                "sort": "asc",
                "limit": 50000,
            }

            try:
                async with session.get(endpoint, params=params) as response:
                    if response.status == 429:
                        # Rate limited: wait, then retry the SAME chunk.
                        logger.warning("Rate limited, waiting 60s...")
                        await asyncio.sleep(60)
                        continue

                    if response.status != 200:
                        # Non-retryable API error: log it and skip this chunk.
                        text = await response.text()
                        logger.error(f"API error {response.status}: {text}")
                        current_start = chunk_end
                        continue

                    data = await response.json()
                    results = data.get("results", [])

                    if results:
                        all_bars.extend(results)
                        logger.info(f" Fetched {len(results)} bars for {start_str} to {end_str}")
                    else:
                        logger.warning(f" No data for {start_str} to {end_str}")

            except Exception as e:
                # Network/parse failure: best-effort — log and fall through
                # to advance past this chunk (its data is not retried).
                logger.error(f"Request failed: {e}")

            current_start = chunk_end
            await asyncio.sleep(0.5)  # Rate limit: ~2 requests per second

    return all_bars
|
|
|
|
|
|
def insert_to_mysql(bars: list, ticker: str, periodint: int = 5) -> int:
    """
    Insert OHLCV bars into MySQL database (upsert on duplicate key).

    Args:
        bars: List of Polygon API bar objects
        ticker: Ticker symbol (e.g., 'X:BTCUSD')
        periodint: Value stored in the `periodint` column (default 5 for
            5-minute bars; previously hardcoded, now parameterized so the
            loader can store other timeframes)

    Returns:
        Number of rows sent to the database (inserts plus updates),
        or 0 when `bars` is empty.
    """
    if not bars:
        logger.warning("No bars to insert")
        return 0

    conn = pymysql.connect(**MYSQL_CONFIG)
    cursor = conn.cursor()

    try:
        # Upsert so re-running the script refreshes existing rows.
        insert_query = """
            INSERT INTO tickers_agg_data (ticker, date_agg, open, high, low, close, volume, vwap, ts, periodint)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                open = VALUES(open),
                high = VALUES(high),
                low = VALUES(low),
                close = VALUES(close),
                volume = VALUES(volume),
                vwap = VALUES(vwap),
                ts = VALUES(ts)
        """

        # Convert Polygon bar dicts to row tuples.
        rows = []
        for bar in bars:
            # NOTE(review): fromtimestamp() converts to the server's LOCAL
            # timezone while Polygon's "t" is epoch ms UTC — confirm that
            # date_agg is intended to be local time before changing.
            timestamp = datetime.fromtimestamp(bar["t"] / 1000)
            ts_epoch = bar["t"]  # milliseconds since epoch
            rows.append((
                ticker,
                timestamp,
                bar["o"],            # open
                bar["h"],            # high
                bar["l"],            # low
                bar["c"],            # close
                bar.get("v", 0),     # volume
                bar.get("vw") or 0,  # vwap (column can't be NULL)
                ts_epoch,            # timestamp in milliseconds
                periodint,           # bar period in minutes
            ))

        # Insert in batches to bound per-round-trip packet size.
        batch_size = 5000
        total_inserted = 0

        for i in range(0, len(rows), batch_size):
            batch = rows[i:i + batch_size]
            cursor.executemany(insert_query, batch)
            conn.commit()
            total_inserted += len(batch)
            logger.info(f" Inserted batch {i//batch_size + 1}: {len(batch)} rows (total: {total_inserted})")

        return total_inserted

    except Exception as e:
        logger.error(f"Insert failed: {e}")
        conn.rollback()
        raise
    finally:
        cursor.close()
        conn.close()
|
|
|
|
|
|
def get_existing_data_range(ticker: str) -> tuple:
    """
    Look up the stored data coverage for a ticker.

    Args:
        ticker: Ticker symbol (e.g., 'X:BTCUSD')

    Returns:
        Tuple of (min_date, max_date, count) from tickers_agg_data;
        the dates are None and count is 0 when no rows match.
    """
    query = """
        SELECT MIN(date_agg), MAX(date_agg), COUNT(*)
        FROM tickers_agg_data
        WHERE ticker = %s
    """

    connection = pymysql.connect(**MYSQL_CONFIG)
    cur = connection.cursor()
    try:
        cur.execute(query, (ticker,))
        return cur.fetchone()
    finally:
        cur.close()
        connection.close()
|
|
|
|
|
|
async def main():
    """Download BTCUSD 5-minute bars from Polygon and upsert them into MySQL.

    Steps: report existing coverage, fetch the 2020-2025 range, insert the
    bars, then report the updated coverage.
    """
    ticker = "X:BTCUSD"

    logger.info("=" * 60)
    logger.info("BTCUSD Data Download from Polygon API")
    logger.info("=" * 60)

    # Report existing coverage before the download.
    min_date, max_date, count = get_existing_data_range(ticker)
    logger.info(f"\nExisting data for {ticker}:")
    logger.info(f" Range: {min_date} to {max_date}")
    logger.info(f" Records: {count:,}")

    # Define download range (2020-2025)
    start_date = datetime(2020, 1, 1)
    end_date = datetime(2025, 12, 31)

    logger.info(f"\nDownloading new data:")
    logger.info(f" Range: {start_date.date()} to {end_date.date()}")
    logger.info(f" Timeframe: 5-minute bars")

    # Fetch data
    logger.info("\n[1/2] Fetching data from Polygon API...")
    bars = await fetch_polygon_data(
        symbol=ticker,
        start_date=start_date,
        end_date=end_date,
        timeframe_multiplier=5,
        timeframe_span="minute",
    )

    logger.info(f"\nTotal bars fetched: {len(bars):,}")

    if not bars:
        logger.error("No data fetched. Check API key and permissions.")
        return

    # Sanity check: show first/last bar timestamps. (The original re-tested
    # `if bars:` here, which is always true after the early return above;
    # the dead guard was removed.)
    first_ts = datetime.fromtimestamp(bars[0]["t"] / 1000)
    last_ts = datetime.fromtimestamp(bars[-1]["t"] / 1000)
    logger.info(f" First bar: {first_ts}")
    logger.info(f" Last bar: {last_ts}")

    # Insert to MySQL
    logger.info("\n[2/2] Inserting data into MySQL...")
    inserted = insert_to_mysql(bars, ticker)

    logger.info(f"\nTotal rows inserted/updated: {inserted:,}")

    # Verify new data range
    min_date, max_date, count = get_existing_data_range(ticker)
    logger.info(f"\nUpdated data for {ticker}:")
    logger.info(f" Range: {min_date} to {max_date}")
    logger.info(f" Records: {count:,}")

    logger.info("\n" + "=" * 60)
    logger.info("Download complete!")
    logger.info("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())
|