pairs_trading/src/tools/data_loader.py
2025-05-29 02:47:38 -04:00

126 lines
3.7 KiB
Python

import sys
import sqlite3
from typing import Dict, Tuple
import pandas as pd
from tools.trading_pair import TradingPair
def load_sqlite_to_dataframe(db_path, query):
    """Run a SQL query against a SQLite file and return the rows as a DataFrame.

    Args:
        db_path: Path to the SQLite database file (passed to ``sqlite3.connect``).
        query: SQL text to execute.

    Returns:
        A ``pandas.DataFrame`` holding the query result.

    Raises:
        sqlite3.Error: Raised by the SQLite driver; logged and re-raised.
        Exception: Any other failure (for example the error pandas raises
            when the SQL cannot be executed); logged and re-raised unchanged.
    """
    # Sentinel so the cleanup below is safe even when connect() itself fails.
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        return pd.read_sql_query(query, conn)
    except sqlite3.Error as excpt:
        print(f"SQLite error: {excpt}")
        raise
    except Exception as excpt:
        # The name bound here must match the one printed; otherwise the
        # original error is masked by an UnboundLocalError.
        print(f"Error: {excpt}")
        raise
    finally:
        if conn is not None:
            conn.close()
def convert_time_to_UTC(value: str, timezone: str):
    """Re-express a local wall-clock timestamp string in UTC.

    Args:
        value: Timestamp formatted as ``YYYY-MM-DD HH:MM:SS`` with no zone info.
        timezone: IANA zone name that ``value`` is expressed in.

    Returns:
        The same instant in UTC, formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    from datetime import datetime
    from zoneinfo import ZoneInfo

    layout = "%Y-%m-%d %H:%M:%S"
    # Parse as a naive datetime, then attach the source zone without shifting
    # the clock reading.
    naive = datetime.strptime(value, layout)
    in_utc = naive.replace(tzinfo=ZoneInfo(timezone)).astimezone(ZoneInfo("UTC"))
    return in_utc.strftime(layout)
def _sql_str_literal(value) -> str:
    """Render *value* as a single-quoted SQL string literal, doubling any quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def load_market_data(datafile: str, config: Dict) -> pd.DataFrame:
    """Load bar data for the configured instruments, limited to the trading session.

    Args:
        datafile: Path to the SQLite database file to read.
        config: Settings dict. Keys read here: ``instrument_id_pfx``,
            ``instruments``, ``security_type``, ``exchange_id``,
            ``db_table_name`` and ``trading_hours`` (itself holding
            ``begin_session``, ``end_session`` and ``timezone``).

    Returns:
        A DataFrame with columns ``tstamp`` (converted with ``pd.to_datetime``),
        ``time_ns``, ``symbol``, ``open``, ``high``, ``low``, ``close``,
        ``volume``, ``num_trades`` and ``vwap``, restricted to rows whose
        timestamp lies inside the session window. If the query matches no
        rows, the empty frame is returned.
    """
    id_prefix = config["instrument_id_pfx"]
    # Single-quoted literals: double quotes denote identifiers in SQL and are
    # only accepted as strings by SQLite through a legacy compatibility quirk.
    instrument_ids = [
        _sql_str_literal(id_prefix + instrument)
        for instrument in config["instruments"]
    ]

    if config["security_type"] == "CRYPTO":
        # Here tstamp is divided by 1e9 and read as a unix epoch, i.e. it is
        # treated as nanoseconds since the epoch.
        time_columns = [
            "strftime('%Y-%m-%d %H:%M:%S', tstamp/1000000000, 'unixepoch') as tstamp",
            "tstamp as time_ns",
        ]
    else:
        time_columns = ["tstamp", "tstamp_ns as time_ns"]

    columns = time_columns + [
        # Strip the instrument-id prefix to obtain the bare symbol (substr is 1-based).
        f"substr(instrument_id, {len(id_prefix) + 1}) as symbol",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "num_trades",
        "vwap",
    ]
    # NOTE(review): the table name is interpolated as-is from config; it is
    # presumably trusted configuration rather than user input — confirm.
    query = (
        f"select {', '.join(columns)}"
        f" from {config['db_table_name']}"
        f" where exchange_id = {_sql_str_literal(config['exchange_id'])}"
        f" and instrument_id in ({','.join(instrument_ids)})"
    )

    df = load_sqlite_to_dataframe(db_path=datafile, query=query)

    if df.empty:
        # Nothing to derive a session date from; return the empty frame
        # instead of failing on the first-row lookup below.
        df["tstamp"] = pd.to_datetime(df["tstamp"])
        return df

    # Trading hours. The session date comes from the first returned row.
    # NOTE(review): the query has no ORDER BY, so this presumably relies on
    # the file holding a single trading day — confirm.
    date_str = df["tstamp"].iloc[0][0:10]
    trading_hours = config["trading_hours"]
    start_time = convert_time_to_UTC(
        f"{date_str} {trading_hours['begin_session']}", trading_hours["timezone"]
    )
    end_time = convert_time_to_UTC(
        f"{date_str} {trading_hours['end_session']}", trading_hours["timezone"]
    )

    # String comparison works because both sides use "YYYY-MM-DD HH:MM:SS".
    in_session = (df["tstamp"] >= start_time) & (df["tstamp"] <= end_time)
    # Copy so the column assignment below targets an independent frame, not a
    # slice of the unfiltered data.
    df = df[in_session].copy()
    df["tstamp"] = pd.to_datetime(df["tstamp"])
    return df
def transform_dataframe(df: pd.DataFrame, price_column: str):
    """Pivot long per-symbol rows into one row per timestamp.

    Args:
        df: Frame containing at least ``tstamp``, ``symbol`` and
            ``price_column`` columns.
        price_column: Name of the price column to spread across symbols.

    Returns:
        A frame with a ``tstamp`` column of the distinct timestamps (in order
        of first appearance) plus one ``{price_column}_{symbol}`` column per
        symbol. Timestamps for which a symbol has no row are left as NaN.
    """
    narrow = df[["tstamp", "symbol", price_column]]

    # Seed the output with the distinct timestamps only.
    wide = narrow[["tstamp"]].drop_duplicates().reset_index(drop=True)

    for sym in narrow["symbol"].unique():
        # Rows of this symbol, with the price column renamed to e.g. "close_COIN".
        per_symbol = (
            narrow.loc[narrow["symbol"] == sym, ["tstamp", price_column]]
            .rename(columns={price_column: f"{price_column}_{sym}"})
            .reset_index(drop=True)
        )
        # Left join keeps every timestamp even when this symbol has no bar there.
        wide = wide.merge(per_symbol, on="tstamp", how="left")

    # Deliberately no dropna(): gaps in an irrelevant symbol must not remove rows.
    return wide.reset_index(drop=True)
# Example usage:
# if __name__ == "__main__":
#     df1 = load_sqlite_to_dataframe(sys.argv[1], "select * from md_1min_bars")
#     print(df1)