{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import plotly.graph_objs as go\n", "from plotly.subplots import make_subplots\n", "import logging\n", "import numpy as np\n", "import talib\n", "\n", "logging.basicConfig(level=logging.INFO)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and preprocess BTC/USDT data\n", "Using kline data downloaded from binance with 5, 15 and 30 min interval." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def fill_missing_values(data, expected_interval):\n", "\n", " LOW_TIME = pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)\n", "\n", " data = data.loc[data['close_time'] >= LOW_TIME]\n", "\n", " \n", " num_new_intervals, num_fixed_intervals = 0, 0\n", " expected_internal_interval = expected_interval - pd.Timedelta(milliseconds=1)\n", " prev_row = data.iloc[0]\n", " preprocessed_data = [prev_row]\n", " for _, row in data.iloc[1:].iterrows():\n", " interval = row['open_time'] - prev_row['open_time']\n", " internal_interval = row['close_time'] - row['open_time']\n", "\n", " if internal_interval < expected_internal_interval:\n", " row['close_time'] = row['open_time'] + expected_internal_interval\n", " num_fixed_intervals += 1\n", "\n", " while interval != expected_interval:\n", " prev_row = prev_row.copy()\n", " prev_row['open_time'] = prev_row['open_time'] + expected_interval\n", " prev_row['close_time'] = prev_row['close_time'] + expected_interval\n", "\n", " preprocessed_data.append(prev_row)\n", " num_new_intervals += 1\n", " interval = row['open_time'] - prev_row['open_time']\n", "\n", " prev_row = row\n", " preprocessed_data.append(row)\n", "\n", " logging.info(\"Inserted %d new intervals.\", num_new_intervals)\n", " logging.info(\"Fixed %d intervals.\", num_fixed_intervals)\n", "\n", " result = pd.DataFrame(preprocessed_data)\n", " result = result.reset_index(drop=True)\n", " return result" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def load_btc_usdt_dataset(path, interval):\n", " logging.info(\"=\"*80)\n", " logging.info(\"Loading dataset '%s' with interval %s\", path, interval)\n", " btc_usdt_data = pd.read_csv(path,\n", " usecols=['Open time', 'Open price', 'High price', 'Low price', 'Close price', 'Volume', 'Close time'])\n", " btc_usdt_data.rename(columns=lambda x: x.lower().replace(\" \", \"_\"), inplace=True)\n", " btc_usdt_data['open_time'] = pd.to_datetime(btc_usdt_data['open_time'], unit='ms')\n", " btc_usdt_data['close_time'] = pd.to_datetime(btc_usdt_data['close_time'], unit='ms')\n", " btc_usdt_data = fill_missing_values(btc_usdt_data, interval)\n", "\n", " # Sanity check assertions\n", " unique_intervals_open_time = (btc_usdt_data.shift(-1).dropna()['open_time'] - btc_usdt_data.iloc[:-1]['open_time']).unique()\n", " unique_intervals_close_time = (btc_usdt_data.shift(-1).dropna()['close_time'] - btc_usdt_data.iloc[:-1]['close_time']).unique()\n", " assert len(unique_intervals_open_time) == 1\n", " assert len(unique_intervals_close_time) == 1\n", " logging.info(\"There is the same interval between all open times: %s\", unique_intervals_open_time[0])\n", " logging.info(\"There is the same interval between all close times: %s\", unique_intervals_close_time[0])\n", "\n", " # Print statistics\n", " logging.info(\"First data point: %s\", btc_usdt_data['close_time'].iloc[0])\n", " logging.info(\"Last data point: %s\", btc_usdt_data['close_time'].iloc[-1])\n", " logging.info(\"Number of data points: %d\", len(btc_usdt_data))\n", "\n", " # Plot data\n", " go.Figure(go.Scatter(y=btc_usdt_data['close_price'].iloc[::10], x=btc_usdt_data['close_time'].iloc[::10])).show()\n", "\n", " return btc_usdt_data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "INTERVAL_1_MIN = pd.Timedelta(minutes=1)\n", "INTERVAL_5_MIN = pd.Timedelta(minutes=5)\n", "INTERVAL_15_MIN = pd.Timedelta(minutes=15)\n", "INTERVAL_30_MIN = pd.Timedelta(minutes=30)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# btc_usdt_1_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-1m.csv', INTERVAL_1_MIN)\n", "# btc_usdt_1_min_data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# btc_usdt_5_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-5m.csv', INTERVAL_5_MIN)\n", "# btc_usdt_5_min_data.head()" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# btc_usdt_15_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-15m.csv', INTERVAL_15_MIN)\n", "# btc_usdt_15_min_data.head()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# btc_usdt_30_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-30m.csv', INTERVAL_30_MIN)\n", "# btc_usdt_30_min_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and preprocess VIX data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "VIX_COL_NAME = 'vix_close_price'\n", "\n", "def preprocess_vix_data(data, vix_data):\n", " idx = 0\n", " result = []\n", " for _, row in data.iterrows():\n", " while idx + 1 < len(vix_data):\n", " # Loop until a day from data point, take value from previous day\n", " if vix_data['date'].iloc[idx + 1].date() >= row['close_time'].date():\n", " break\n", " idx +=1\n", " result.append(vix_data.iloc[idx])\n", " \n", " return pd.DataFrame(result).reset_index(drop=True)\n", "\n", "\n", "# VIX daily data https://www.cboe.com/tradable_products/vix/vix_historical_data/\n", "vix_data = pd.read_csv(\"../data/raw_data/VIX_History.csv\", usecols=[\"DATE\", \"CLOSE\"])\n", "vix_data.rename(columns=lambda x: x.lower().replace(\" \", \"_\"), inplace=True)\n", "vix_data['date'] = pd.to_datetime(vix_data['date'])" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# vix_1_min_data = preprocess_vix_data(btc_usdt_1_min_data, vix_data)\n", "# btc_usdt_1_min_data[VIX_COL_NAME] = vix_1_min_data['close']\n", "# go.Figure(go.Scatter(y=btc_usdt_1_min_data['vix_close_price'].iloc[::10], x=btc_usdt_1_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_1_min_data.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# vix_5_min_data = preprocess_vix_data(btc_usdt_5_min_data, vix_data)\n", "# btc_usdt_5_min_data[VIX_COL_NAME] = vix_5_min_data['close']\n", "# go.Figure(go.Scatter(y=btc_usdt_5_min_data['vix_close_price'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_5_min_data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# vix_15_min_data = preprocess_vix_data(btc_usdt_15_min_data, vix_data)\n", "# btc_usdt_15_min_data[VIX_COL_NAME] = vix_15_min_data['close']\n", "# go.Figure(go.Scatter(y=btc_usdt_15_min_data['vix_close_price'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_15_min_data.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# vix_30_min_data = preprocess_vix_data(btc_usdt_30_min_data, vix_data)\n", "# btc_usdt_30_min_data[VIX_COL_NAME] = vix_30_min_data['close']\n", "# go.Figure(go.Scatter(y=btc_usdt_30_min_data['vix_close_price'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_30_min_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and preprocess FED data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "FED_COL_NAME='effective_rates'\n", "\n", "def preprocess_fed_data(data, fed_data):\n", " idx = 0\n", " result = []\n", " for _, row in data.iterrows():\n", " while idx + 1 < len(fed_data):\n", " # Loop until a day from data point, take value from previous day\n", " if fed_data['date'].iloc[idx + 1].date() >= row['close_time'].date():\n", " break\n", " idx +=1\n", " result.append(fed_data.iloc[idx])\n", " \n", " return pd.DataFrame(result).reset_index(drop=True)\n", "\n", "\n", "# FED effective rates https://fred.stlouisfed.org/series/FEDFUNDS\n", "fed_data = pd.read_csv('../data/raw_data/FEDFUNDS.csv', usecols=[\"DATE\", \"FEDFUNDS\"])\n", "fed_data.rename(columns=lambda x: x.lower().replace(\" \", \"_\"), inplace=True)\n", "fed_data['date'] = pd.to_datetime(fed_data['date'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# fed_1_min_data = preprocess_fed_data(btc_usdt_1_min_data, fed_data)\n", "# btc_usdt_1_min_data[FED_COL_NAME] = fed_1_min_data['fedfunds']\n", "# go.Figure(go.Scatter(y=btc_usdt_1_min_data['effective_rates'].iloc[::10], x=btc_usdt_1_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_1_min_data.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# fed_5_min_data = preprocess_fed_data(btc_usdt_5_min_data, fed_data)\n", "# btc_usdt_5_min_data[FED_COL_NAME] = fed_5_min_data['fedfunds']\n", "# go.Figure(go.Scatter(y=btc_usdt_5_min_data['effective_rates'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_5_min_data.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# fed_15_min_data = preprocess_fed_data(btc_usdt_15_min_data, fed_data)\n", "# btc_usdt_15_min_data[FED_COL_NAME] = fed_15_min_data['fedfunds']\n", "# go.Figure(go.Scatter(y=btc_usdt_15_min_data['effective_rates'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_15_min_data.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# fed_30_min_data = preprocess_fed_data(btc_usdt_30_min_data, fed_data)\n", "# btc_usdt_30_min_data[FED_COL_NAME] = fed_30_min_data['fedfunds']\n", "# go.Figure(go.Scatter(y=btc_usdt_30_min_data['effective_rates'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_30_min_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and preprocess Crypto fear and greed index" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "FEAR_GREED_COL_NAME = 'fear_greed_index'\n", "\n", "def preprocess_feargreed_data(data, feargreed_data):\n", " idx = 0\n", " result = []\n", " for _, row in data.iterrows():\n", " while idx + 1 < len(feargreed_data):\n", " # Loop until a day from data point, take value from previous day\n", " if feargreed_data['timestamp'].iloc[idx + 1].date() >= row['close_time'].date():\n", " break\n", " idx +=1\n", " result.append(feargreed_data.iloc[idx])\n", " \n", " return pd.DataFrame(result).reset_index(drop=True)\n", "\n", "# Fear/Greed index from https://alternative.me/crypto/fear-and-greed-index/\n", "# fear_greed_data = pd.DataFrame(requests.get(\"https://api.alternative.me/fng/?limit=0\").json()['data'])\n", "# fear_greed_data.to_csv('../data/btcusdt_5m/fear_greed.csv', index=False)\n", "feargreed_data = pd.read_csv(\"../data/raw_data/fear_greed.csv\", usecols=[\"timestamp\", \"value\"])\n", "feargreed_data['timestamp'] = pd.to_datetime(feargreed_data['timestamp'], unit='s')\n", "feargreed_data = feargreed_data.sort_values('timestamp').reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# feargreed_1_min_data = preprocess_feargreed_data(btc_usdt_1_min_data, feargreed_data)\n", "# btc_usdt_1_min_data[FEAR_GREED_COL_NAME] = feargreed_1_min_data['value']\n", "# go.Figure(go.Scatter(y=btc_usdt_1_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_1_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_1_min_data.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# feargreed_5_min_data = preprocess_feargreed_data(btc_usdt_5_min_data, feargreed_data)\n", "# btc_usdt_5_min_data[FEAR_GREED_COL_NAME] = feargreed_5_min_data['value']\n", "# go.Figure(go.Scatter(y=btc_usdt_5_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_5_min_data.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# feargreed_15_min_data = preprocess_feargreed_data(btc_usdt_15_min_data, feargreed_data)\n", "# btc_usdt_15_min_data[FEAR_GREED_COL_NAME] = feargreed_15_min_data['value']\n", "# go.Figure(go.Scatter(y=btc_usdt_15_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_15_min_data.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# feargreed_30_min_data = preprocess_feargreed_data(btc_usdt_30_min_data, feargreed_data)\n", "# btc_usdt_30_min_data[FEAR_GREED_COL_NAME] = feargreed_30_min_data['value']\n", "# go.Figure(go.Scatter(y=btc_usdt_30_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()\n", "# btc_usdt_30_min_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Augment dataset\n", "Augment dataset with technical indicators and other additional data" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def preprocess_augment_data(data, interval):\n", " assert 'close_price' in data.columns\n", " assert 'close_time' in data.columns\n", "\n", " result = data.copy()\n", "\n", " # Required for pytorch forecasting framework\n", " result['time_index'] = result.index\n", " result['group_id'] = 'BTCUSDT'\n", "\n", " # Derrived variables\n", " result['hour'] = result['close_time'].apply(lambda x: x.hour).astype('str')\n", " result['weekday'] = result['close_time'].apply(lambda x: x.weekday()).astype('str')\n", " result['open_to_close_price'] = result['open_price'] / result['close_price']\n", " result['high_to_close_price'] = result['high_price'] / result['close_price']\n", " result['low_to_close_price'] = result['low_price'] / result['close_price']\n", " result['high_to_low_price'] = result['high_price'] / result['low_price']\n", "\n", " # Resturns\n", " ret = result['close_price'].to_numpy()\n", " ret = (ret[1:] / ret[:-1]) - 1\n", " result['returns'] = np.pad(ret, (1, 0), 'constant', constant_values=(0, 0))\n", "\n", " # Returns binary\n", " result['returns_binary'] = (result['returns'] > 0).astype(np.int32)\n", "\n", " # Log returns\n", " log_ret = result['close_price'].to_numpy()\n", " log_ret = np.log(log_ret[1:]) - np.log(log_ret[:-1])\n", " result['log_returns'] = np.pad(log_ret, (1, 0), 'constant', constant_values=(0, 0))\n", "\n", " # ::Realized volatility\n", " cum_sum = result['log_returns'].to_numpy()\n", " cum_sum *= cum_sum\n", " cum_sum = np.cumsum(cum_sum)\n", " \n", " close_price = result['close_price'].to_numpy()\n", " # 1h window \n", " obs_1h = 1 * (60 // interval)\n", " result['vol_1h'] = np.sqrt(cum_sum - np.pad(cum_sum[:-obs_1h], (obs_1h, 0), 'constant', constant_values=(0, 0)))\n", " result['sma_1h_to_close_price'] = (talib.SMA(close_price, obs_1h) / result['close_price']).fillna(0)\n", " result['ema_1h_to_close_price'] = (talib.EMA(close_price, obs_1h) / result['close_price']).fillna(0)\n", " # 1d window\n", " obs_1d = 24 * (60 // interval)\n", " result['vol_1d'] = np.sqrt(cum_sum - np.pad(cum_sum[:-obs_1d], (obs_1d, 0), 'constant', constant_values=(0, 0)))\n", " result['sma_1d_to_close_price'] = (talib.SMA(close_price, obs_1d) / result['close_price']).fillna(0)\n", " result['ema_1d_to_close_price'] = (talib.EMA(close_price, obs_1d) / result['close_price']).fillna(0)\n", " # 7d window\n", " obs_7d = 7 * 24 * (60 // interval)\n", " result['vol_7d'] = np.sqrt(cum_sum - np.pad(cum_sum[:-obs_7d], (obs_7d, 0), 'constant', constant_values=(0, 0)))\n", " result['sma_7d_to_close_price'] = (talib.SMA(close_price, obs_7d) / result['close_price']).fillna(0)\n", "\n", " macd, signal, _ = talib.MACD(close_price, fastperiod=12, slowperiod=26, signalperiod=9)\n", " result['macd'] = macd\n", " result['macd'] = result['macd'].fillna(0)\n", " result['macd_signal'] = signal\n", " result['macd_signal'] = result['macd_signal'].fillna(0)\n", "\n", " rsi = talib.RSI(close_price, timeperiod=14)\n", " result['rsi'] = rsi\n", " result['rsi'] = result['rsi'].fillna(0)\n", "\n", " upper, middle, lower = talib.BBANDS(close_price, 20, 2.0, 2.0)\n", " result['low_bband_to_close_price'] = (lower / result['close_price']).fillna(0)\n", " result['up_bband_to_close_price'] = (upper / result['close_price']).fillna(0)\n", " result['mid_bband_to_close_price'] = (middle / result['close_price']).fillna(0)\n", "\n", " assert len(result.dropna()) == len(result) \n", " assert len(result['time_index'].unique()) == len(result)\n", "\n", " return result" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "btc_usdt_1_min_data = preprocess_augment_data(btc_usdt_1_min_data, 1)\n", "# btc_usdt_5_min_data = preprocess_augment_data(btc_usdt_5_min_data, 5)\n", "# btc_usdt_15_min_data = preprocess_augment_data(btc_usdt_15_min_data, 15)\n", "# btc_usdt_30_min_data = preprocess_augment_data(btc_usdt_30_min_data, 30)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save preprocessed data" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "btc_usdt_1_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-1m.csv', index=False)\n", "# btc_usdt_5_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-5m.csv', index=False)\n", "# btc_usdt_15_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-15m.csv', index=False)\n", "# btc_usdt_30_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-30m.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset statistics" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def load_dataset(path):\n", " dataset = pd.read_csv(path)\n", " dataset['close_time'] = pd.to_datetime(dataset['close_time'])\n", " dataset['open_time'] = pd.to_datetime(dataset['open_time'])\n", " return dataset\n", "\n", "def print_dataset_stats(dataset, interval):\n", " time_span_days = (dataset['close_time'].\n", " iloc[-1] - dataset['close_time'].iloc[0]).days\n", " print(f\"---- DATASET BTC-USDT {interval} m ----\")\n", " print(f\"Num observations: {len(dataset)}\")\n", " print(f\"First observation: {dataset['close_time'].iloc[0]}\")\n", " print(f\"Last observation: {dataset['close_time'].iloc[-1]}\")\n", " print(f\"Time span: {time_span_days} days ({time_span_days / 365:.2} years)\")\n", " print(f\"Variables: {dataset.columns}\")\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "---- DATASET BTC-USDT 1 m ----\n", "Num observations: 2941560\n", "First observation: 2019-07-21 00:00:59.999000\n", "Last observation: 2025-02-21 17:59:59.999000\n", "Time span: 2042 days (5.6 years)\n", "Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n", " 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n", " 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n", " 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n", " 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n", " 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n", " 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n", " 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n", " 'low_bband_to_close_price', 'up_bband_to_close_price',\n", " 'mid_bband_to_close_price'],\n", " dtype='object')\n" ] } ], "source": [ "btc_usdt_1_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-1m.csv')\n", "print_dataset_stats(btc_usdt_1_min_data, 1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "---- DATASET BTC-USDT 5 m ----\n", "Num observations: 730437\n", "First observation: 2017-08-17 04:04:59.999000\n", "Last observation: 2024-07-27 09:44:59.999000\n", "Time span: 2536 days (6.9 years)\n", "Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n", " 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n", " 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n", " 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n", " 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n", " 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n", " 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n", " 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n", " 'low_bband_to_close_price', 'up_bband_to_close_price',\n", " 'mid_bband_to_close_price'],\n", " dtype='object')\n" ] } ], "source": [ "btc_usdt_5_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-5m.csv')\n", "print_dataset_stats(btc_usdt_5_min_data, 5)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "---- DATASET BTC-USDT 15 m ----\n", "Num observations: 248118\n", "First observation: 2017-08-17 04:14:59.999000\n", "Last observation: 2024-09-13 17:29:59.999000\n", "Time span: 2584 days (7.1 years)\n", "Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n", " 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n", " 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n", " 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n", " 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n", " 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n", " 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n", " 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n", " 'low_bband_to_close_price', 'up_bband_to_close_price',\n", " 'mid_bband_to_close_price'],\n", " dtype='object')\n" ] } ], "source": [ "btc_usdt_15_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-15m.csv')\n", "print_dataset_stats(btc_usdt_15_min_data, 15)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "---- DATASET BTC-USDT 30 m ----\n", "Num observations: 124059\n", "First observation: 2017-08-17 04:29:59.999000\n", "Last observation: 2024-09-13 17:29:59.999000\n", "Time span: 2584 days (7.1 years)\n", "Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',\n", " 'volume', 'close_time', 'vix_close_price', 'effective_rates',\n", " 'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',\n", " 'open_to_close_price', 'high_to_close_price', 'low_to_close_price',\n", " 'high_to_low_price', 'returns', 'returns_binary', 'log_returns',\n", " 'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',\n", " 'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',\n", " 'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',\n", " 'low_bband_to_close_price', 'up_bband_to_close_price',\n", " 'mid_bband_to_close_price'],\n", " dtype='object')\n" ] } ], "source": [ "btc_usdt_30_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-30m.csv')\n", "print_dataset_stats(btc_usdt_30_min_data, 30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Split dataset into moving window parts" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "NUM_MOVING_WINDOWS = 6\n", "LIMIT_TIME = pd.Timestamp(year=2024, month=7, day=25, hour=23, minute=59)\n", "LOW_TIME = pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)\n", "\n", "def trim_number_of_observations(data, limit_time):\n", " return data.loc[data['close_time'] <= limit_time]\n", "\n", "def split_dataset_moving_window(\n", " data, \n", " num_parts,\n", " in_sample_size,\n", " out_of_sample_size):\n", " result = []\n", " dataset_len = len(data)\n", " window_size = in_sample_size + out_of_sample_size\n", "\n", " print(f\"In sample size: {in_sample_size}\")\n", " print(f\"Out of sample size: {out_of_sample_size}\")\n", " \n", " for i in range(num_parts):\n", " part_len = dataset_len - i * out_of_sample_size\n", " in_sample_part = data.iloc[part_len - window_size: part_len - out_of_sample_size].reset_index(drop=True)\n", " out_of_sample_part = data.iloc[part_len - out_of_sample_size: part_len].reset_index(drop=True)\n", " result.append((in_sample_part, out_of_sample_part))\n", " \n", " # Return windows in time order\n", " return list(reversed(result))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def plot_moving_windows(data, windows):\n", "\n", " fig = make_subplots(\n", " rows=NUM_MOVING_WINDOWS + 1,\n", " cols=1,\n", " row_heights=[0.05] * NUM_MOVING_WINDOWS + [1 - 0.05 * NUM_MOVING_WINDOWS],\n", " vertical_spacing=0.05,\n", " shared_xaxes=True)\n", "\n", " for i, (in_sample, out_sample) in enumerate(windows):\n", " fig.add_trace(go.Bar(\n", " y = [1, 1],\n", " x = [in_sample['close_time'].iloc[0], out_sample['close_time'].iloc[0]],\n", " marker_color=['lightgrey', 'darkgrey'],\n", " width = [\n", " (in_sample['close_time'].iloc[-1] - in_sample['close_time'].iloc[0]).total_seconds()*1000,\n", " (out_sample['close_time'].iloc[-1] - out_sample['close_time'].iloc[0]).total_seconds()*1000\n", " ],\n", " offset=0\n", " ), row=NUM_MOVING_WINDOWS - i, col=1)\n", "\n", " fig.add_trace(go.Scatter(\n", " y=data['close_price'].iloc[::100],\n", " x=data['close_time'].iloc[::100],\n", " marker_color='black',\n", " ), row=NUM_MOVING_WINDOWS+1, col=1)\n", " fig.update_yaxes(showticklabels=False)\n", " fig.update_yaxes(showticklabels=True, row=NUM_MOVING_WINDOWS+1, col=1)\n", " fig.update_xaxes(range=[\n", " windows[0][0]['close_time'].iloc[0],\n", " windows[-1][-1]['close_time'].iloc[-1]])\n", " fig.update_layout(\n", " title=dict(\n", " text=\"Rolling window BTC-USDT\",\n", " x=0.5,\n", " xanchor='center'\n", " ),\n", " showlegend=False,\n", " plot_bgcolor=\"rgb(250,250,250)\",\n", " )\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# windows_1m_data = split_dataset_moving_window(\n", "# trim_number_of_observations(btc_usdt_1_min_data, limit_time=LIMIT_TIME),\n", "# NUM_MOVING_WINDOWS,\n", "# in_sample_size=(24 * 30 * 24 * 60), # 24 months\n", "# out_of_sample_size=(6 * 30 * 24 * 60) # 6 months\n", "# )\n", "# plot_moving_windows(btc_usdt_1_min_data, windows_1m_data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# windows_5m_data = split_dataset_moving_window(\n", "# trim_number_of_observations(btc_usdt_5_min_data, limit_time=LIMIT_TIME),\n", "# NUM_MOVING_WINDOWS,\n", "# in_sample_size=(24 * 30 * 24 * (60 // 5)), # 24 months\n", "# out_of_sample_size=(6 * 30 * 24 * (60 // 5)) # 6 months\n", "# )\n", "# plot_moving_windows(btc_usdt_5_min_data, windows_5m_data)\n", "# for window in windows_5m_data:\n", "# print(window[0]['open_time'][0], window[0]['open_time'].iloc[-1])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "windows_15m_data = split_dataset_moving_window(\n", " trim_number_of_observations(btc_usdt_15_min_data, limit_time=LIMIT_TIME),\n", " NUM_MOVING_WINDOWS,\n", " in_sample_size=(24 * 30 * 24 * (60 // 15)), # 24 months\n", " out_of_sample_size=(6 * 30 * 24 * (60 // 15)) # 6 months\n", ")\n", "\n", "plot_moving_windows(btc_usdt_15_min_data, windows_15m_data)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# windows_30m_data = split_dataset_moving_window(\n", "# trim_number_of_observations(btc_usdt_30_min_data, limit_time=LIMIT_TIME),\n", "# NUM_MOVING_WINDOWS,\n", "# in_sample_size=(24 * 30 * 24 * (60 // 30)), # 24 months\n", "# out_of_sample_size=(6 * 30 * 24 * (60 // 30)) # 6 months\n", "# )\n", "\n", "# plot_moving_windows(btc_usdt_30_min_data, windows_30m_data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Upload splitted dataset to wandb" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "import wandb\n", "import os\n", "from tempfile import TemporaryDirectory" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def upload_dataset_to_wandb(data_windows, dataset_name, project='wne-masters-thesis-testing'):\n", " with TemporaryDirectory() as tempdir:\n", " for i, (in_sample, out_sample) in enumerate(data_windows):\n", " in_sample.to_csv(os.path.join(tempdir, dataset_name + f'-in-sample-{i}.csv'), index=False)\n", " out_sample.to_csv(os.path.join(tempdir, dataset_name + f'-out-of-sample-{i}.csv'), index=False)\n", " \n", " wandb.init(\n", " project=project,\n", " job_type='upload_dataset')\n", " artifact = wandb.Artifact(dataset_name, type=\"dataset\", metadata={\n", " 'name': dataset_name,\n", " 'num_windows': NUM_MOVING_WINDOWS\n", " })\n", " artifact.add_dir(tempdir)\n", " wandb.log_artifact(artifact)\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# WANDB_PROJECT = 'wne-masters-thesis-testing'\n", "\n", "# upload_dataset_to_wandb(windows_1m_data, 'btc-usdt-1m', project=WANDB_PROJECT)\n", "# upload_dataset_to_wandb(windows_5m_data, 'btc-usdt-5m', project=WANDB_PROJECT)\n", "# upload_dataset_to_wandb(windows_15m_data, 'btc-usdt-15m', project=WANDB_PROJECT)\n", "# upload_dataset_to_wandb(windows_30m_data, 'btc-usdt-30m', project=WANDB_PROJECT)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test if time series dataset can be correctly constructed" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:root:Validating if time series dataset can be constructed from 5m data.\n", "INFO:root:Validating if time series dataset can be constructed from 15m data.\n", "INFO:root:Validating if time series dataset can be constructed from 30m data.\n" ] } ], "source": [ "from ml.data import build_time_series_dataset\n", "config = {\n", " 'data': {\n", " 'fields': {\n", " 'time_index': 'time_index',\n", " 'target': 'close_price',\n", " 'group_ids': ['group_id'],\n", " 'static_real': [],\n", " 'static_cat': [],\n", " 'dynamic_cat': [],\n", " 'dynamic_known_real': [],\n", " 'dynamic_known_cat': [],\n", " 'dynamic_unknown_real': ['returns'],\n", " 'dynamic_unknown_cat': []\n", " }\n", " },\n", " 'past_window': 2,\n", " 'future_window': 2,\n", " }\n", "\n", "logging.info(\"Validating if time series dataset can be constructed from 5m data.\")\n", "for in_sample, out_of_sample in windows_5m_data:\n", " build_time_series_dataset(config, in_sample)\n", " build_time_series_dataset(config, out_of_sample)\n", "\n", "logging.info(\"Validating if time series dataset can be constructed from 15m data.\")\n", "for in_sample, out_of_sample in windows_15m_data:\n", " build_time_series_dataset(config, in_sample)\n", " build_time_series_dataset(config, out_of_sample)\n", "\n", "logging.info(\"Validating if time series dataset can be constructed from 30m data.\")\n", "for in_sample, out_of_sample in windows_30m_data:\n", " build_time_series_dataset(config, in_sample)\n", " build_time_series_dataset(config, out_of_sample)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timedelta('1831 days 23:59:00')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Timestamp(year=2024, month=7, day=25, hour=23, minute=59) - pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5.019178082191781" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "1832 / 365" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.002292576419213974" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "4.2 / 1832" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "wnemsc", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 2 }