informer_model/notebooks/btcusdt_dataset.ipynb
2025-04-13 09:47:33 +02:00

1048 lines
38 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import plotly.graph_objs as go\n",
"from plotly.subplots import make_subplots\n",
"import logging\n",
"import numpy as np\n",
"import talib\n",
"\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and preprocess BTC/USDT data\n",
"Using kline data downloaded from binance with 5, 15 and 30 min interval."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def fill_missing_values(data, expected_interval):\n",
"\n",
" LOW_TIME = pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)\n",
"\n",
" data = data.loc[data['close_time'] >= LOW_TIME]\n",
"\n",
" \n",
" num_new_intervals, num_fixed_intervals = 0, 0\n",
" expected_internal_interval = expected_interval - pd.Timedelta(milliseconds=1)\n",
" prev_row = data.iloc[0]\n",
" preprocessed_data = [prev_row]\n",
" for _, row in data.iloc[1:].iterrows():\n",
" interval = row['open_time'] - prev_row['open_time']\n",
" internal_interval = row['close_time'] - row['open_time']\n",
"\n",
" if internal_interval < expected_internal_interval:\n",
" row['close_time'] = row['open_time'] + expected_internal_interval\n",
" num_fixed_intervals += 1\n",
"\n",
" while interval != expected_interval:\n",
" prev_row = prev_row.copy()\n",
" prev_row['open_time'] = prev_row['open_time'] + expected_interval\n",
" prev_row['close_time'] = prev_row['close_time'] + expected_interval\n",
"\n",
" preprocessed_data.append(prev_row)\n",
" num_new_intervals += 1\n",
" interval = row['open_time'] - prev_row['open_time']\n",
"\n",
" prev_row = row\n",
" preprocessed_data.append(row)\n",
"\n",
" logging.info(\"Inserted %d new intervals.\", num_new_intervals)\n",
" logging.info(\"Fixed %d intervals.\", num_fixed_intervals)\n",
"\n",
" result = pd.DataFrame(preprocessed_data)\n",
" result = result.reset_index(drop=True)\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def load_btc_usdt_dataset(path, interval):\n",
" logging.info(\"=\"*80)\n",
" logging.info(\"Loading dataset '%s' with interval %s\", path, interval)\n",
" btc_usdt_data = pd.read_csv(path,\n",
" usecols=['Open time', 'Open price', 'High price', 'Low price', 'Close price', 'Volume', 'Close time'])\n",
" btc_usdt_data.rename(columns=lambda x: x.lower().replace(\" \", \"_\"), inplace=True)\n",
" btc_usdt_data['open_time'] = pd.to_datetime(btc_usdt_data['open_time'], unit='ms')\n",
" btc_usdt_data['close_time'] = pd.to_datetime(btc_usdt_data['close_time'], unit='ms')\n",
" btc_usdt_data = fill_missing_values(btc_usdt_data, interval)\n",
"\n",
" # Sanity check assertions\n",
" unique_intervals_open_time = (btc_usdt_data.shift(-1).dropna()['open_time'] - btc_usdt_data.iloc[:-1]['open_time']).unique()\n",
" unique_intervals_close_time = (btc_usdt_data.shift(-1).dropna()['close_time'] - btc_usdt_data.iloc[:-1]['close_time']).unique()\n",
" assert len(unique_intervals_open_time) == 1\n",
" assert len(unique_intervals_close_time) == 1\n",
" logging.info(\"There is the same interval between all open times: %s\", unique_intervals_open_time[0])\n",
" logging.info(\"There is the same interval between all close times: %s\", unique_intervals_close_time[0])\n",
"\n",
" # Print statistics\n",
" logging.info(\"First data point: %s\", btc_usdt_data['close_time'].iloc[0])\n",
" logging.info(\"Last data point: %s\", btc_usdt_data['close_time'].iloc[-1])\n",
" logging.info(\"Number of data points: %d\", len(btc_usdt_data))\n",
"\n",
" # Plot data\n",
" go.Figure(go.Scatter(y=btc_usdt_data['close_price'].iloc[::10], x=btc_usdt_data['close_time'].iloc[::10])).show()\n",
"\n",
" return btc_usdt_data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"INTERVAL_1_MIN = pd.Timedelta(minutes=1)\n",
"INTERVAL_5_MIN = pd.Timedelta(minutes=5)\n",
"INTERVAL_15_MIN = pd.Timedelta(minutes=15)\n",
"INTERVAL_30_MIN = pd.Timedelta(minutes=30)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# btc_usdt_1_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-1m.csv', INTERVAL_1_MIN)\n",
"# btc_usdt_1_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# btc_usdt_5_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-5m.csv', INTERVAL_5_MIN)\n",
"# btc_usdt_5_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# btc_usdt_15_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-15m.csv', INTERVAL_15_MIN)\n",
"# btc_usdt_15_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# btc_usdt_30_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-30m.csv', INTERVAL_30_MIN)\n",
"# btc_usdt_30_min_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and preprocess VIX data"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"VIX_COL_NAME = 'vix_close_price'\n",
"\n",
"def preprocess_vix_data(data, vix_data):\n",
" idx = 0\n",
" result = []\n",
" for _, row in data.iterrows():\n",
" while idx + 1 < len(vix_data):\n",
" # Loop until a day from data point, take value from previous day\n",
" if vix_data['date'].iloc[idx + 1].date() >= row['close_time'].date():\n",
" break\n",
" idx +=1\n",
" result.append(vix_data.iloc[idx])\n",
" \n",
" return pd.DataFrame(result).reset_index(drop=True)\n",
"\n",
"\n",
"# VIX daily data https://www.cboe.com/tradable_products/vix/vix_historical_data/\n",
"vix_data = pd.read_csv(\"../data/raw_data/VIX_History.csv\", usecols=[\"DATE\", \"CLOSE\"])\n",
"vix_data.rename(columns=lambda x: x.lower().replace(\" \", \"_\"), inplace=True)\n",
"vix_data['date'] = pd.to_datetime(vix_data['date'])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# vix_1_min_data = preprocess_vix_data(btc_usdt_1_min_data, vix_data)\n",
"# btc_usdt_1_min_data[VIX_COL_NAME] = vix_1_min_data['close']\n",
"# go.Figure(go.Scatter(y=btc_usdt_1_min_data['vix_close_price'].iloc[::10], x=btc_usdt_1_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_1_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# vix_5_min_data = preprocess_vix_data(btc_usdt_5_min_data, vix_data)\n",
"# btc_usdt_5_min_data[VIX_COL_NAME] = vix_5_min_data['close']\n",
"# go.Figure(go.Scatter(y=btc_usdt_5_min_data['vix_close_price'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_5_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# vix_15_min_data = preprocess_vix_data(btc_usdt_15_min_data, vix_data)\n",
"# btc_usdt_15_min_data[VIX_COL_NAME] = vix_15_min_data['close']\n",
"# go.Figure(go.Scatter(y=btc_usdt_15_min_data['vix_close_price'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_15_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# vix_30_min_data = preprocess_vix_data(btc_usdt_30_min_data, vix_data)\n",
"# btc_usdt_30_min_data[VIX_COL_NAME] = vix_30_min_data['close']\n",
"# go.Figure(go.Scatter(y=btc_usdt_30_min_data['vix_close_price'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_30_min_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and preprocess FED data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"FED_COL_NAME='effective_rates'\n",
"\n",
"def preprocess_fed_data(data, fed_data):\n",
" idx = 0\n",
" result = []\n",
" for _, row in data.iterrows():\n",
" while idx + 1 < len(fed_data):\n",
" # Loop until a day from data point, take value from previous day\n",
" if fed_data['date'].iloc[idx + 1].date() >= row['close_time'].date():\n",
" break\n",
" idx +=1\n",
" result.append(fed_data.iloc[idx])\n",
" \n",
" return pd.DataFrame(result).reset_index(drop=True)\n",
"\n",
"\n",
"# FED effective rates https://fred.stlouisfed.org/series/FEDFUNDS\n",
"fed_data = pd.read_csv('../data/raw_data/FEDFUNDS.csv', usecols=[\"DATE\", \"FEDFUNDS\"])\n",
"fed_data.rename(columns=lambda x: x.lower().replace(\" \", \"_\"), inplace=True)\n",
"fed_data['date'] = pd.to_datetime(fed_data['date'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# fed_1_min_data = preprocess_fed_data(btc_usdt_1_min_data, fed_data)\n",
"# btc_usdt_1_min_data[FED_COL_NAME] = fed_1_min_data['fedfunds']\n",
"# go.Figure(go.Scatter(y=btc_usdt_1_min_data['effective_rates'].iloc[::10], x=btc_usdt_1_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_1_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# fed_5_min_data = preprocess_fed_data(btc_usdt_5_min_data, fed_data)\n",
"# btc_usdt_5_min_data[FED_COL_NAME] = fed_5_min_data['fedfunds']\n",
"# go.Figure(go.Scatter(y=btc_usdt_5_min_data['effective_rates'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_5_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# fed_15_min_data = preprocess_fed_data(btc_usdt_15_min_data, fed_data)\n",
"# btc_usdt_15_min_data[FED_COL_NAME] = fed_15_min_data['fedfunds']\n",
"# go.Figure(go.Scatter(y=btc_usdt_15_min_data['effective_rates'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_15_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# fed_30_min_data = preprocess_fed_data(btc_usdt_30_min_data, fed_data)\n",
"# btc_usdt_30_min_data[FED_COL_NAME] = fed_30_min_data['fedfunds']\n",
"# go.Figure(go.Scatter(y=btc_usdt_30_min_data['effective_rates'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_30_min_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and preprocess Crypto fear and greed index"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"FEAR_GREED_COL_NAME = 'fear_greed_index'\n",
"\n",
"def preprocess_feargreed_data(data, feargreed_data):\n",
" idx = 0\n",
" result = []\n",
" for _, row in data.iterrows():\n",
" while idx + 1 < len(feargreed_data):\n",
" # Loop until a day from data point, take value from previous day\n",
" if feargreed_data['timestamp'].iloc[idx + 1].date() >= row['close_time'].date():\n",
" break\n",
" idx +=1\n",
" result.append(feargreed_data.iloc[idx])\n",
" \n",
" return pd.DataFrame(result).reset_index(drop=True)\n",
"\n",
"# Fear/Greed index from https://alternative.me/crypto/fear-and-greed-index/\n",
"# fear_greed_data = pd.DataFrame(requests.get(\"https://api.alternative.me/fng/?limit=0\").json()['data'])\n",
"# fear_greed_data.to_csv('../data/btcusdt_5m/fear_greed.csv', index=False)\n",
"feargreed_data = pd.read_csv(\"../data/raw_data/fear_greed.csv\", usecols=[\"timestamp\", \"value\"])\n",
"feargreed_data['timestamp'] = pd.to_datetime(feargreed_data['timestamp'], unit='s')\n",
"feargreed_data = feargreed_data.sort_values('timestamp').reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# feargreed_1_min_data = preprocess_feargreed_data(btc_usdt_1_min_data, feargreed_data)\n",
"# btc_usdt_1_min_data[FEAR_GREED_COL_NAME] = feargreed_1_min_data['value']\n",
"# go.Figure(go.Scatter(y=btc_usdt_1_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_1_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_1_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# feargreed_5_min_data = preprocess_feargreed_data(btc_usdt_5_min_data, feargreed_data)\n",
"# btc_usdt_5_min_data[FEAR_GREED_COL_NAME] = feargreed_5_min_data['value']\n",
"# go.Figure(go.Scatter(y=btc_usdt_5_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_5_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# feargreed_15_min_data = preprocess_feargreed_data(btc_usdt_15_min_data, feargreed_data)\n",
"# btc_usdt_15_min_data[FEAR_GREED_COL_NAME] = feargreed_15_min_data['value']\n",
"# go.Figure(go.Scatter(y=btc_usdt_15_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_15_min_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# feargreed_30_min_data = preprocess_feargreed_data(btc_usdt_30_min_data, feargreed_data)\n",
"# btc_usdt_30_min_data[FEAR_GREED_COL_NAME] = feargreed_30_min_data['value']\n",
"# go.Figure(go.Scatter(y=btc_usdt_30_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()\n",
"# btc_usdt_30_min_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Augment dataset\n",
"Augment dataset with technical indicators and other additional data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_augment_data(data, interval):\n",
" assert 'close_price' in data.columns\n",
" assert 'close_time' in data.columns\n",
"\n",
" result = data.copy()\n",
"\n",
" # Required for pytorch forecasting framework\n",
" result['time_index'] = result.index\n",
" result['group_id'] = 'BTCUSDT'\n",
"\n",
" # Derrived variables\n",
" result['hour'] = result['close_time'].apply(lambda x: x.hour).astype('str')\n",
" result['weekday'] = result['close_time'].apply(lambda x: x.weekday()).astype('str')\n",
" result['open_to_close_price'] = result['open_price'] / result['close_price']\n",
" result['high_to_close_price'] = result['high_price'] / result['close_price']\n",
" result['low_to_close_price'] = result['low_price'] / result['close_price']\n",
" result['high_to_low_price'] = result['high_price'] / result['low_price']\n",
"\n",
" # Resturns\n",
" ret = result['close_price'].to_numpy()\n",
" ret = (ret[1:] / ret[:-1]) - 1\n",
" result['returns'] = np.pad(ret, (1, 0), 'constant', constant_values=(0, 0))\n",
"\n",
" # Returns binary\n",
" result['returns_binary'] = (result['returns'] > 0).astype(np.int32)\n",
"\n",
" # Log returns\n",
" log_ret = result['close_price'].to_numpy()\n",
" log_ret = np.log(log_ret[1:]) - np.log(log_ret[:-1])\n",
" result['log_returns'] = np.pad(log_ret, (1, 0), 'constant', constant_values=(0, 0))\n",
"\n",
" # ::Realized volatility\n",
" cum_sum = result['log_returns'].to_numpy()\n",
" cum_sum *= cum_sum\n",
" cum_sum = np.cumsum(cum_sum)\n",
" \n",
" close_price = result['close_price'].to_numpy()\n",
" # 1h window \n",
" obs_1h = 1 * (60 // interval)\n",
" result['vol_1h'] = np.sqrt(cum_sum - np.pad(cum_sum[:-obs_1h], (obs_1h, 0), 'constant', constant_values=(0, 0)))\n",
" result['sma_1h_to_close_price'] = (talib.SMA(close_price, obs_1h) / result['close_price']).fillna(0)\n",
" result['ema_1h_to_close_price'] = (talib.EMA(close_price, obs_1h) / result['close_price']).fillna(0)\n",
" # 1d window\n",
" obs_1d = 24 * (60 // interval)\n",
" result['vol_1d'] = np.sqrt(cum_sum - np.pad(cum_sum[:-obs_1d], (obs_1d, 0), 'constant', constant_values=(0, 0)))\n",
" result['sma_1d_to_close_price'] = (talib.SMA(close_price, obs_1d) / result['close_price']).fillna(0)\n",
" result['ema_1d_to_close_price'] = (talib.EMA(close_price, obs_1d) / result['close_price']).fillna(0)\n",
" # 7d window\n",
" obs_7d = 7 * 24 * (60 // interval)\n",
" result['vol_7d'] = np.sqrt(cum_sum - np.pad(cum_sum[:-obs_7d], (obs_7d, 0), 'constant', constant_values=(0, 0)))\n",
" result['sma_7d_to_close_price'] = (talib.SMA(close_price, obs_7d) / result['close_price']).fillna(0)\n",
"\n",
" macd, signal, _ = talib.MACD(close_price, fastperiod=12, slowperiod=26, signalperiod=9)\n",
" result['macd'] = macd\n",
" result['macd'] = result['macd'].fillna(0)\n",
" result['macd_signal'] = signal\n",
" result['macd_signal'] = result['macd_signal'].fillna(0)\n",
"\n",
" rsi = talib.RSI(close_price, timeperiod=14)\n",
" result['rsi'] = rsi\n",
" result['rsi'] = result['rsi'].fillna(0)\n",
"\n",
" upper, middle, lower = talib.BBANDS(close_price, 20, 2.0, 2.0)\n",
" result['low_bband_to_close_price'] = (lower / result['close_price']).fillna(0)\n",
" result['up_bband_to_close_price'] = (upper / result['close_price']).fillna(0)\n",
" result['mid_bband_to_close_price'] = (middle / result['close_price']).fillna(0)\n",
"\n",
" assert len(result.dropna()) == len(result) \n",
" assert len(result['time_index'].unique()) == len(result)\n",
"\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"btc_usdt_1_min_data = preprocess_augment_data(btc_usdt_1_min_data, 1)\n",
"# btc_usdt_5_min_data = preprocess_augment_data(btc_usdt_5_min_data, 5)\n",
"# btc_usdt_15_min_data = preprocess_augment_data(btc_usdt_15_min_data, 15)\n",
"# btc_usdt_30_min_data = preprocess_augment_data(btc_usdt_30_min_data, 30)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save preprocessed data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"btc_usdt_1_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-1m.csv', index=False)\n",
"# btc_usdt_5_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-5m.csv', index=False)\n",
"# btc_usdt_15_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-15m.csv', index=False)\n",
"# btc_usdt_30_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-30m.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset statistics"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def load_dataset(path):\n",
" dataset = pd.read_csv(path)\n",
" dataset['close_time'] = pd.to_datetime(dataset['close_time'])\n",
" dataset['open_time'] = pd.to_datetime(dataset['open_time'])\n",
" return dataset\n",
"\n",
"def print_dataset_stats(dataset, interval):\n",
" time_span_days = (dataset['close_time'].\n",
" iloc[-1] - dataset['close_time'].iloc[0]).days\n",
" print(f\"---- DATASET BTC-USDT {interval} m ----\")\n",
" print(f\"Num observations: {len(dataset)}\")\n",
" print(f\"First observation: {dataset['close_time'].iloc[0]}\")\n",
" print(f\"Last observation: {dataset['close_time'].iloc[-1]}\")\n",
" print(f\"Time span: {time_span_days} days ({time_span_days / 365:.2} years)\")\n",
" print(f\"Variables: {dataset.columns}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---- DATASET BTC-USDT 1 m ----\n",
"Num observations: 2941560\n",
"First observation: 2019-07-21 00:00:59.999000\n",
"Last observation: 2025-02-21 17:59:59.999000\n",
"Time span: 2042 days (5.6 years)\n",
"Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n",
" 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n",
" 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n",
" 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n",
" 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n",
" 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n",
" 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n",
" 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n",
" 'low_bband_to_close_price', 'up_bband_to_close_price',\n",
" 'mid_bband_to_close_price'],\n",
" dtype='object')\n"
]
}
],
"source": [
"btc_usdt_1_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-1m.csv')\n",
"print_dataset_stats(btc_usdt_1_min_data, 1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---- DATASET BTC-USDT 5 m ----\n",
"Num observations: 730437\n",
"First observation: 2017-08-17 04:04:59.999000\n",
"Last observation: 2024-07-27 09:44:59.999000\n",
"Time span: 2536 days (6.9 years)\n",
"Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n",
" 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n",
" 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n",
" 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n",
" 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n",
" 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n",
" 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n",
" 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n",
" 'low_bband_to_close_price', 'up_bband_to_close_price',\n",
" 'mid_bband_to_close_price'],\n",
" dtype='object')\n"
]
}
],
"source": [
"btc_usdt_5_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-5m.csv')\n",
"print_dataset_stats(btc_usdt_5_min_data, 5)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---- DATASET BTC-USDT 15 m ----\n",
"Num observations: 248118\n",
"First observation: 2017-08-17 04:14:59.999000\n",
"Last observation: 2024-09-13 17:29:59.999000\n",
"Time span: 2584 days (7.1 years)\n",
"Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n",
" 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n",
" 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n",
" 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n",
" 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n",
" 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n",
" 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n",
" 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n",
" 'low_bband_to_close_price', 'up_bband_to_close_price',\n",
" 'mid_bband_to_close_price'],\n",
" dtype='object')\n"
]
}
],
"source": [
"btc_usdt_15_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-15m.csv')\n",
"print_dataset_stats(btc_usdt_15_min_data, 15)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---- DATASET BTC-USDT 30 m ----\n",
"Num observations: 124059\n",
"First observation: 2017-08-17 04:29:59.999000\n",
"Last observation: 2024-09-13 17:29:59.999000\n",
"Time span: 2584 days (7.1 years)\n",
"Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n",
" 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n",
" 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n",
" 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n",
" 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n",
" 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n",
" 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n",
" 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n",
" 'low_bband_to_close_price', 'up_bband_to_close_price',\n",
" 'mid_bband_to_close_price'],\n",
" dtype='object')\n"
]
}
],
"source": [
"btc_usdt_30_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-30m.csv')\n",
"print_dataset_stats(btc_usdt_30_min_data, 30)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Split dataset into moving window parts"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"NUM_MOVING_WINDOWS = 6\n",
"LIMIT_TIME = pd.Timestamp(year=2024, month=7, day=25, hour=23, minute=59)\n",
"LOW_TIME = pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)\n",
"\n",
"def trim_number_of_observations(data, limit_time):\n",
" return data.loc[data['close_time'] <= limit_time]\n",
"\n",
"def split_dataset_moving_window(\n",
" data, \n",
" num_parts,\n",
" in_sample_size,\n",
" out_of_sample_size):\n",
" result = []\n",
" dataset_len = len(data)\n",
" window_size = in_sample_size + out_of_sample_size\n",
"\n",
" print(f\"In sample size: {in_sample_size}\")\n",
" print(f\"Out of sample size: {out_of_sample_size}\")\n",
" \n",
" for i in range(num_parts):\n",
" part_len = dataset_len - i * out_of_sample_size\n",
" in_sample_part = data.iloc[part_len - window_size: part_len - out_of_sample_size].reset_index(drop=True)\n",
" out_of_sample_part = data.iloc[part_len - out_of_sample_size: part_len].reset_index(drop=True)\n",
" result.append((in_sample_part, out_of_sample_part))\n",
" \n",
" # Return windows in time order\n",
" return list(reversed(result))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def plot_moving_windows(data, windows):\n",
"\n",
" fig = make_subplots(\n",
" rows=NUM_MOVING_WINDOWS + 1,\n",
" cols=1,\n",
" row_heights=[0.05] * NUM_MOVING_WINDOWS + [1 - 0.05 * NUM_MOVING_WINDOWS],\n",
" vertical_spacing=0.05,\n",
" shared_xaxes=True)\n",
"\n",
" for i, (in_sample, out_sample) in enumerate(windows):\n",
" fig.add_trace(go.Bar(\n",
" y = [1, 1],\n",
" x = [in_sample['close_time'].iloc[0], out_sample['close_time'].iloc[0]],\n",
" marker_color=['lightgrey', 'darkgrey'],\n",
" width = [\n",
" (in_sample['close_time'].iloc[-1] - in_sample['close_time'].iloc[0]).total_seconds()*1000,\n",
" (out_sample['close_time'].iloc[-1] - out_sample['close_time'].iloc[0]).total_seconds()*1000\n",
" ],\n",
" offset=0\n",
" ), row=NUM_MOVING_WINDOWS - i, col=1)\n",
"\n",
" fig.add_trace(go.Scatter(\n",
" y=data['close_price'].iloc[::100],\n",
" x=data['close_time'].iloc[::100],\n",
" marker_color='black',\n",
" ), row=NUM_MOVING_WINDOWS+1, col=1)\n",
" fig.update_yaxes(showticklabels=False)\n",
" fig.update_yaxes(showticklabels=True, row=NUM_MOVING_WINDOWS+1, col=1)\n",
" fig.update_xaxes(range=[\n",
" windows[0][0]['close_time'].iloc[0],\n",
" windows[-1][-1]['close_time'].iloc[-1]])\n",
" fig.update_layout(\n",
" title=dict(\n",
" text=\"Rolling window BTC-USDT\",\n",
" x=0.5,\n",
" xanchor='center'\n",
" ),\n",
" showlegend=False,\n",
" plot_bgcolor=\"rgb(250,250,250)\",\n",
" )\n",
" fig.show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# windows_1m_data = split_dataset_moving_window(\n",
"# trim_number_of_observations(btc_usdt_1_min_data, limit_time=LIMIT_TIME),\n",
"# NUM_MOVING_WINDOWS,\n",
"# in_sample_size=(24 * 30 * 24 * 60), # 24 months\n",
"# out_of_sample_size=(6 * 30 * 24 * 60) # 6 months\n",
"# )\n",
"# plot_moving_windows(btc_usdt_1_min_data, windows_1m_data)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# windows_5m_data = split_dataset_moving_window(\n",
"# trim_number_of_observations(btc_usdt_5_min_data, limit_time=LIMIT_TIME),\n",
"# NUM_MOVING_WINDOWS,\n",
"# in_sample_size=(24 * 30 * 24 * (60 // 5)), # 24 months\n",
"# out_of_sample_size=(6 * 30 * 24 * (60 // 5)) # 6 months\n",
"# )\n",
"# plot_moving_windows(btc_usdt_5_min_data, windows_5m_data)\n",
"# for window in windows_5m_data:\n",
"# print(window[0]['open_time'][0], window[0]['open_time'].iloc[-1])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"windows_15m_data = split_dataset_moving_window(\n",
" trim_number_of_observations(btc_usdt_15_min_data, limit_time=LIMIT_TIME),\n",
" NUM_MOVING_WINDOWS,\n",
" in_sample_size=(24 * 30 * 24 * (60 // 15)), # 24 months\n",
" out_of_sample_size=(6 * 30 * 24 * (60 // 15)) # 6 months\n",
")\n",
"\n",
"plot_moving_windows(btc_usdt_15_min_data, windows_15m_data)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# windows_30m_data = split_dataset_moving_window(\n",
"# trim_number_of_observations(btc_usdt_30_min_data, limit_time=LIMIT_TIME),\n",
"# NUM_MOVING_WINDOWS,\n",
"# in_sample_size=(24 * 30 * 24 * (60 // 30)), # 24 months\n",
"# out_of_sample_size=(6 * 30 * 24 * (60 // 30)) # 6 months\n",
"# )\n",
"\n",
"# plot_moving_windows(btc_usdt_30_min_data, windows_30m_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload splitted dataset to wandb"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import wandb\n",
"import os\n",
"from tempfile import TemporaryDirectory"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def upload_dataset_to_wandb(data_windows, dataset_name, project='wne-masters-thesis-testing'):\n",
" with TemporaryDirectory() as tempdir:\n",
" for i, (in_sample, out_sample) in enumerate(data_windows):\n",
" in_sample.to_csv(os.path.join(tempdir, dataset_name + f'-in-sample-{i}.csv'), index=False)\n",
" out_sample.to_csv(os.path.join(tempdir, dataset_name + f'-out-of-sample-{i}.csv'), index=False)\n",
" \n",
" wandb.init(\n",
" project=project,\n",
" job_type='upload_dataset')\n",
" artifact = wandb.Artifact(dataset_name, type=\"dataset\", metadata={\n",
" 'name': dataset_name,\n",
" 'num_windows': NUM_MOVING_WINDOWS\n",
" })\n",
" artifact.add_dir(tempdir)\n",
" wandb.log_artifact(artifact)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# WANDB_PROJECT = 'wne-masters-thesis-testing'\n",
"\n",
"# upload_dataset_to_wandb(windows_1m_data, 'btc-usdt-1m', project=WANDB_PROJECT)\n",
"# upload_dataset_to_wandb(windows_5m_data, 'btc-usdt-5m', project=WANDB_PROJECT)\n",
"# upload_dataset_to_wandb(windows_15m_data, 'btc-usdt-15m', project=WANDB_PROJECT)\n",
"# upload_dataset_to_wandb(windows_30m_data, 'btc-usdt-30m', project=WANDB_PROJECT)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test if time series dataset can be correctly constructed"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:root:Validating if time series dataset can be constructed from 5m data.\n",
"INFO:root:Validating if time series dataset can be constructed from 15m data.\n",
"INFO:root:Validating if time series dataset can be constructed from 30m data.\n"
]
}
],
"source": [
"from ml.data import build_time_series_dataset\n",
"config = {\n",
" 'data': {\n",
" 'fields': {\n",
" 'time_index': 'time_index',\n",
" 'target': 'close_price',\n",
" 'group_ids': ['group_id'],\n",
" 'static_real': [],\n",
" 'static_cat': [],\n",
" 'dynamic_cat': [],\n",
" 'dynamic_known_real': [],\n",
" 'dynamic_known_cat': [],\n",
" 'dynamic_unknown_real': ['returns'],\n",
" 'dynamic_unknown_cat': []\n",
" }\n",
" },\n",
" 'past_window': 2,\n",
" 'future_window': 2,\n",
" }\n",
"\n",
"logging.info(\"Validating if time series dataset can be constructed from 5m data.\")\n",
"for in_sample, out_of_sample in windows_5m_data:\n",
" build_time_series_dataset(config, in_sample)\n",
" build_time_series_dataset(config, out_of_sample)\n",
"\n",
"logging.info(\"Validating if time series dataset can be constructed from 15m data.\")\n",
"for in_sample, out_of_sample in windows_15m_data:\n",
" build_time_series_dataset(config, in_sample)\n",
" build_time_series_dataset(config, out_of_sample)\n",
"\n",
"logging.info(\"Validating if time series dataset can be constructed from 30m data.\")\n",
"for in_sample, out_of_sample in windows_30m_data:\n",
" build_time_series_dataset(config, in_sample)\n",
" build_time_series_dataset(config, out_of_sample)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timedelta('1831 days 23:59:00')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.Timestamp(year=2024, month=7, day=25, hour=23, minute=59) - pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.019178082191781"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1832 / 365"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.002292576419213974"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"4.2 / 1832"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "wnemsc",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}