#!/usr/bin/env python3
"""
Download BTCUSD data from Polygon API and insert into MySQL database.

Updates outdated BTCUSD data (2015-2017) with current data (2020-2025).
"""

import asyncio
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

import pandas as pd
import pymysql
from loguru import logger

# Configure logging
logger.remove()
logger.add(sys.stdout, level="INFO", format="{time:HH:mm:ss} | {level} | {message}")

# Polygon API configuration.
# SECURITY: credentials were hard-coded in this file. They now default to the
# original values (so existing deployments keep working) but should be
# supplied via environment variables and rotated.
POLYGON_API_KEY = os.getenv("POLYGON_API_KEY", "f09bA2V7OG7bHn4HxIT6Xs45ujg_pRXk")
POLYGON_BASE_URL = "https://api.polygon.io"

# MySQL configuration (from config/database.yaml).
# SECURITY: same as above — prefer env vars / a secrets store over these defaults.
MYSQL_CONFIG = {
    "host": os.getenv("MYSQL_HOST", "72.60.226.4"),
    "port": int(os.getenv("MYSQL_PORT", "3306")),
    "user": os.getenv("MYSQL_USER", "root"),
    "password": os.getenv("MYSQL_PASSWORD", "AfcItz2391,."),
    "database": os.getenv("MYSQL_DATABASE", "db_trading_meta"),
}


async def fetch_polygon_data(
    symbol: str,
    start_date: datetime,
    end_date: datetime,
    timeframe_multiplier: int = 5,
    timeframe_span: str = "minute",
) -> list:
    """
    Fetch OHLCV data from Polygon API.

    Args:
        symbol: Symbol with prefix (e.g., 'X:BTCUSD')
        start_date: Start date
        end_date: End date
        timeframe_multiplier: Timeframe multiplier (5 for 5-minute)
        timeframe_span: Timeframe span ('minute', 'hour', 'day')

    Returns:
        List of raw Polygon bar dicts (keys: t, o, h, l, c, v, vw, ...).
    """
    import aiohttp

    all_bars = []
    current_start = start_date

    # One session for the whole download: the previous version opened a new
    # ClientSession (new TCP/TLS connection) for every monthly chunk.
    async with aiohttp.ClientSession() as session:
        # Polygon limits to ~50k results per request, so we chunk by month.
        while current_start < end_date:
            chunk_end = min(current_start + timedelta(days=30), end_date)
            start_str = current_start.strftime("%Y-%m-%d")
            end_str = chunk_end.strftime("%Y-%m-%d")

            endpoint = (
                f"{POLYGON_BASE_URL}/v2/aggs/ticker/{symbol}/range/"
                f"{timeframe_multiplier}/{timeframe_span}/{start_str}/{end_str}"
            )
            params = {
                "apiKey": POLYGON_API_KEY,
                "adjusted": "true",
                "sort": "asc",
                "limit": 50000,
            }

            try:
                async with session.get(endpoint, params=params) as response:
                    if response.status == 429:
                        # Rate limited: wait and retry the SAME chunk
                        # (current_start is deliberately not advanced).
                        logger.warning("Rate limited, waiting 60s...")
                        await asyncio.sleep(60)
                        continue
                    if response.status != 200:
                        text = await response.text()
                        logger.error(f"API error {response.status}: {text}")
                        # Skip this chunk and move on rather than abort the run.
                        current_start = chunk_end
                        continue

                    data = await response.json()
                    results = data.get("results", [])
                    if results:
                        all_bars.extend(results)
                        logger.info(f" Fetched {len(results)} bars for {start_str} to {end_str}")
                    else:
                        logger.warning(f" No data for {start_str} to {end_str}")
            except Exception as e:
                # Best-effort download: log and advance past the failed chunk.
                logger.error(f"Request failed: {e}")

            # NOTE: boundary days appear in two consecutive chunks (Polygon's
            # date range is inclusive); the resulting duplicate bars are
            # absorbed by ON DUPLICATE KEY UPDATE in insert_to_mysql().
            current_start = chunk_end
            await asyncio.sleep(0.5)  # Rate limit: ~2 requests per second

    return all_bars


def insert_to_mysql(bars: list, ticker: str) -> int:
    """
    Insert OHLCV bars into MySQL database.

    Args:
        bars: List of Polygon API bar objects
        ticker: Ticker symbol (e.g., 'X:BTCUSD')

    Returns:
        Number of rows sent to the database (0 when *bars* is empty).

    Raises:
        Exception: re-raises any database error after rolling back.
    """
    if not bars:
        logger.warning("No bars to insert")
        return 0

    conn = pymysql.connect(**MYSQL_CONFIG)
    cursor = conn.cursor()
    try:
        # Upsert so re-running the script (or overlapping chunk boundaries)
        # updates existing rows instead of failing on the unique key.
        insert_query = """
            INSERT INTO tickers_agg_data
                (ticker, date_agg, open, high, low, close, volume, vwap, ts, periodint)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                open = VALUES(open),
                high = VALUES(high),
                low = VALUES(low),
                close = VALUES(close),
                volume = VALUES(volume),
                vwap = VALUES(vwap),
                ts = VALUES(ts)
        """

        # Convert Polygon bar dicts to parameter tuples.
        rows = []
        for bar in bars:
            # NOTE(review): fromtimestamp() renders the UTC epoch in SERVER-LOCAL
            # time; Polygon "t" is UTC milliseconds. Confirm date_agg is meant to
            # be local before changing this — existing rows were stored this way.
            timestamp = datetime.fromtimestamp(bar["t"] / 1000)
            rows.append((
                ticker,
                timestamp,
                bar["o"],            # open
                bar["h"],            # high
                bar["l"],            # low
                bar["c"],            # close
                bar.get("v", 0),     # volume
                bar.get("vw") or 0,  # vwap (column can't be NULL)
                bar["t"],            # timestamp in milliseconds
                5,                   # periodint (5-minute bars)
            ))

        # Insert in batches to bound memory/packet size; commit per batch.
        batch_size = 5000
        total_inserted = 0
        for i in range(0, len(rows), batch_size):
            batch = rows[i:i + batch_size]
            cursor.executemany(insert_query, batch)
            conn.commit()
            total_inserted += len(batch)
            logger.info(f" Inserted batch {i//batch_size + 1}: {len(batch)} rows (total: {total_inserted})")

        return total_inserted
    except Exception as e:
        logger.error(f"Insert failed: {e}")
        conn.rollback()
        raise
    finally:
        cursor.close()
        conn.close()


def get_existing_data_range(ticker: str) -> tuple:
    """
    Get existing data range for ticker.

    Args:
        ticker: Ticker symbol (e.g., 'X:BTCUSD')

    Returns:
        Tuple of (min_date, max_date, count); (None, None, 0) when no rows exist.
    """
    conn = pymysql.connect(**MYSQL_CONFIG)
    cursor = conn.cursor()
    try:
        cursor.execute(
            """
            SELECT MIN(date_agg), MAX(date_agg), COUNT(*)
            FROM tickers_agg_data
            WHERE ticker = %s
            """,
            (ticker,),
        )
        return cursor.fetchone()
    finally:
        cursor.close()
        conn.close()


async def main():
    """Main function to download and insert BTCUSD data."""
    ticker = "X:BTCUSD"

    logger.info("=" * 60)
    logger.info("BTCUSD Data Download from Polygon API")
    logger.info("=" * 60)

    # Check existing data
    min_date, max_date, count = get_existing_data_range(ticker)
    logger.info(f"\nExisting data for {ticker}:")
    logger.info(f" Range: {min_date} to {max_date}")
    logger.info(f" Records: {count:,}")

    # Define download range (2020-2025)
    start_date = datetime(2020, 1, 1)
    end_date = datetime(2025, 12, 31)

    logger.info("\nDownloading new data:")
    logger.info(f" Range: {start_date.date()} to {end_date.date()}")
    logger.info(" Timeframe: 5-minute bars")

    # Fetch data
    logger.info("\n[1/2] Fetching data from Polygon API...")
    bars = await fetch_polygon_data(
        symbol=ticker,
        start_date=start_date,
        end_date=end_date,
        timeframe_multiplier=5,
        timeframe_span="minute",
    )

    logger.info(f"\nTotal bars fetched: {len(bars):,}")
    if not bars:
        # BUG FIX: this message was a string literal broken across a mangled
        # line break in the original file.
        logger.error("No data fetched. Check API key and permissions.")
        return

    # Show sample (bars is guaranteed non-empty here — redundant guard removed).
    first_ts = datetime.fromtimestamp(bars[0]["t"] / 1000)
    last_ts = datetime.fromtimestamp(bars[-1]["t"] / 1000)
    logger.info(f" First bar: {first_ts}")
    logger.info(f" Last bar: {last_ts}")

    # Insert to MySQL
    logger.info("\n[2/2] Inserting data into MySQL...")
    inserted = insert_to_mysql(bars, ticker)
    logger.info(f"\nTotal rows inserted/updated: {inserted:,}")

    # Verify new data range
    min_date, max_date, count = get_existing_data_range(ticker)
    logger.info(f"\nUpdated data for {ticker}:")
    logger.info(f" Range: {min_date} to {max_date}")
    logger.info(f" Records: {count:,}")

    logger.info("\n" + "=" * 60)
    logger.info("Download complete!")
    logger.info("=" * 60)


if __name__ == "__main__":
    asyncio.run(main())