In [1]:
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import logging
import numpy as np
import talib

logging.basicConfig(level=logging.INFO)

## Load and preprocess BTC/USDT data
Uses kline (candlestick) data downloaded from Binance at a 5-minute interval.

In [2]:
# Distance between the open times of two consecutive candles.
EXPECTED_INTERVAL = pd.Timedelta(minutes=5)
# Distance between a candle's own open and close time (one millisecond short of 5 minutes).
EXPECTED_INTERNAL_INTERVAL = pd.Timedelta(minutes=4, seconds=59, milliseconds=999)

def fill_missing_values(data):
    """Return ``data`` with one row for every expected 5-minute candle.

    Rows are walked in their existing order. Two repairs are applied:

    * a row whose ``close_time - open_time`` is shorter than
      ``EXPECTED_INTERNAL_INTERVAL`` gets its ``close_time`` reset to
      ``open_time + EXPECTED_INTERNAL_INTERVAL``;
    * whenever two consecutive rows are more than ``EXPECTED_INTERVAL`` apart,
      copies of the earlier row (all columns) are inserted with ``open_time``
      and ``close_time`` advanced by ``EXPECTED_INTERVAL`` until the gap is closed.

    Args:
        data: DataFrame with ``open_time`` and ``close_time`` timestamp columns.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index. Inserted rows would
        otherwise repeat the label of the row they were copied from.

    Raises:
        ValueError: if consecutive open times are not a positive multiple of
            ``EXPECTED_INTERVAL`` apart (duplicates, out-of-order rows, or
            off-grid timestamps). Such input cannot be gap-filled.
    """
    if len(data) == 0:
        logging.info("Inserted %d new intervals.", 0)
        logging.info("Fixed %d intervals.", 0)
        return data.copy()

    num_new_intervals, num_fixed_intervals = 0, 0
    prev_row = data.iloc[0]
    preprocessed_data = [prev_row]
    for _, row in data.iloc[1:].iterrows():
        interval = row['open_time'] - prev_row['open_time']
        internal_interval = row['close_time'] - row['open_time']

        # Reject gaps that stepping by EXPECTED_INTERVAL can never close;
        # comparing with `!=` alone would loop forever on them.
        if not (interval >= EXPECTED_INTERVAL) or interval % EXPECTED_INTERVAL != pd.Timedelta(0):
            raise ValueError(
                f"open_time {row['open_time']} is not a positive multiple of "
                f"{EXPECTED_INTERVAL} after {prev_row['open_time']}"
            )

        if internal_interval < EXPECTED_INTERNAL_INTERVAL:
            row['close_time'] = row['open_time'] + EXPECTED_INTERNAL_INTERVAL
            num_fixed_intervals += 1

        while interval > EXPECTED_INTERVAL:
            # Forward-fill: repeat the previous candle one interval later.
            prev_row = prev_row.copy()
            prev_row['open_time'] = prev_row['open_time'] + EXPECTED_INTERVAL
            prev_row['close_time'] = prev_row['close_time'] + EXPECTED_INTERVAL

            preprocessed_data.append(prev_row)
            num_new_intervals += 1
            interval = row['open_time'] - prev_row['open_time']

        prev_row = row
        preprocessed_data.append(row)

    logging.info("Inserted %d new intervals.", num_new_intervals)
    logging.info("Fixed %d intervals.", num_fixed_intervals)

    # Each row Series carries its original label, so the rebuilt frame would
    # have duplicated/gappy labels; hand back a clean positional index instead.
    return pd.DataFrame(preprocessed_data).reset_index(drop=True)

In [3]:
# Read only the kline columns that are used downstream.
KLINE_COLUMNS = ['Open time', 'Open price', 'High price', 'Low price', 'Close price', 'Volume', 'Close time']
btc_usdt_data = pd.read_csv('../data/btc-usdt-5m.csv', usecols=KLINE_COLUMNS)

# Normalise headers to snake_case, e.g. "Open time" -> "open_time".
btc_usdt_data = btc_usdt_data.rename(columns=lambda name: name.lower().replace(" ", "_"))

# Both time columns are parsed as epoch milliseconds.
for time_column in ('open_time', 'close_time'):
    btc_usdt_data[time_column] = pd.to_datetime(btc_usdt_data[time_column], unit='ms')

btc_usdt_data = fill_missing_values(btc_usdt_data)

# Sanity check assertions: after gap filling, consecutive rows must be
# separated by one single, constant interval for both time columns.
next_rows = btc_usdt_data.shift(-1).dropna()
current_rows = btc_usdt_data.iloc[:-1]
unique_intervals_open_time = (next_rows['open_time'] - current_rows['open_time']).unique()
unique_intervals_close_time = (next_rows['close_time'] - current_rows['close_time']).unique()
assert len(unique_intervals_open_time) == 1
assert len(unique_intervals_close_time) == 1
logging.info("There is the same interval between all open times: %s", unique_intervals_open_time[0])
logging.info("There is the same interval between all close times: %s", unique_intervals_close_time[0])

# Print statistics
close_times = btc_usdt_data['close_time']
logging.info("First data point: %s", close_times.iloc[0])
logging.info("Last data point: %s", close_times.iloc[-1])
logging.info("Number of data points: %d", len(btc_usdt_data))

# Plot data (every 10th candle keeps the figure light)
go.Figure(go.Scatter(
    y=btc_usdt_data['close_price'].iloc[::10],
    x=btc_usdt_data['close_time'].iloc[::10],
)).show()

btc_usdt_data.head()

INFO:root:Inserted 1715 new intervals.
INFO:root:Fixed 17 intervals.
INFO:root:There is the same interval between all open times: 0 days 00:05:00
INFO:root:There is the same interval between all close times: 0 days 00:05:00
INFO:root:First data point: 2017-08-17 04:04:59.999000
INFO:root:Last data point: 2024-07-27 09:44:59.999000
INFO:root:Number of data points: 730437


Unnamed: 0,open_time,open_price,high_price,low_price,close_price,volume,close_time
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061,2017-08-17 04:04:59.999
1,2017-08-17 04:05:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:09:59.999
2,2017-08-17 04:10:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:14:59.999
3,2017-08-17 04:15:00,4261.48,4264.88,4261.48,4261.48,0.484666,2017-08-17 04:19:59.999
4,2017-08-17 04:20:00,4264.88,4266.29,4264.88,4266.29,2.32857,2017-08-17 04:24:59.999


## Load and preprocess VIX data

In [4]:
def preprocess_vix_data(data, vix_data):
    """Pick, for every row of ``data``, the VIX row from the previous available day.

    For each ``data['close_time']`` the latest ``vix_data`` row whose ``date``
    falls on a strictly earlier calendar day is selected. Rows of ``data`` that
    are not preceded by any VIX entry fall back to the earliest VIX row.

    The lookup is a vectorised binary search instead of a per-row Python loop,
    so it stays fast for hundreds of thousands of candles.

    Args:
        data: DataFrame with a ``close_time`` timestamp column.
        vix_data: DataFrame with a ``date`` timestamp column; it is sorted by
            ``date`` here, so the caller does not need to pre-sort it.

    Returns:
        DataFrame with the columns of ``vix_data``, one row per row of
        ``data`` (same order), indexed ``0..len(data)-1``.

    Raises:
        IndexError: if ``vix_data`` is empty while ``data`` is not.
    """
    # Binary search needs ascending dates; a stable sort keeps ties in input order.
    reference = vix_data.sort_values('date', kind='stable').reset_index(drop=True)

    # Compare calendar days only, ignoring the time of day.
    reference_days = pd.to_datetime(reference['date']).dt.normalize().to_numpy()
    row_days = pd.to_datetime(data['close_time']).dt.normalize().to_numpy()

    # side='left' counts the reference days strictly before each row's day;
    # one less is the position of the latest such entry (clamped to the first row).
    positions = np.maximum(np.searchsorted(reference_days, row_days, side='left') - 1, 0)

    return reference.iloc[positions].reset_index(drop=True)


# VIX daily data https://www.cboe.com/tradable_products/vix/vix_historical_data/
vix_data = pd.read_csv("../data/VIX_History.csv", usecols=["DATE", "CLOSE"])
vix_data = vix_data.rename(columns=lambda x: x.lower().replace(" ", "_"))
vix_data['date'] = pd.to_datetime(vix_data['date'])
vix_data = preprocess_vix_data(btc_usdt_data, vix_data)

# Augment data
# Assign by position: the aligned frame is indexed 0..n-1, while the labels of
# btc_usdt_data are not guaranteed to be a clean range, so label-based
# alignment could attach values to the wrong candles.
assert len(vix_data) == len(btc_usdt_data)
btc_usdt_data['vix_close_price'] = vix_data['close'].to_numpy()

# Plot data
go.Figure(go.Scatter(y=btc_usdt_data['vix_close_price'].iloc[::100], x=btc_usdt_data['close_time'].iloc[::100])).show()

btc_usdt_data.head()


Unnamed: 0,open_time,open_price,high_price,low_price,close_price,volume,close_time,vix_close_price
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061,2017-08-17 04:04:59.999,11.74
1,2017-08-17 04:05:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:09:59.999,11.74
2,2017-08-17 04:10:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:14:59.999,11.74
3,2017-08-17 04:15:00,4261.48,4264.88,4261.48,4261.48,0.484666,2017-08-17 04:19:59.999,11.74
4,2017-08-17 04:20:00,4264.88,4266.29,4264.88,4266.29,2.32857,2017-08-17 04:24:59.999,11.74


## Load and preprocess FED data

In [5]:
def preprocess_fed_data(data, fed_data):
    """Pick, for every row of ``data``, the latest FED row dated before that day.

    For each ``data['close_time']`` the latest ``fed_data`` row whose ``date``
    falls on a strictly earlier calendar day is selected. Rows of ``data`` that
    are not preceded by any FED entry fall back to the earliest FED row.

    The lookup is a vectorised binary search instead of a per-row Python loop,
    so it stays fast for hundreds of thousands of candles.

    Args:
        data: DataFrame with a ``close_time`` timestamp column.
        fed_data: DataFrame with a ``date`` timestamp column; it is sorted by
            ``date`` here, so the caller does not need to pre-sort it.

    Returns:
        DataFrame with the columns of ``fed_data``, one row per row of
        ``data`` (same order), indexed ``0..len(data)-1``.

    Raises:
        IndexError: if ``fed_data`` is empty while ``data`` is not.
    """
    # Binary search needs ascending dates; a stable sort keeps ties in input order.
    reference = fed_data.sort_values('date', kind='stable').reset_index(drop=True)

    # Compare calendar days only, ignoring the time of day.
    reference_days = pd.to_datetime(reference['date']).dt.normalize().to_numpy()
    row_days = pd.to_datetime(data['close_time']).dt.normalize().to_numpy()

    # side='left' counts the reference days strictly before each row's day;
    # one less is the position of the latest such entry (clamped to the first row).
    positions = np.maximum(np.searchsorted(reference_days, row_days, side='left') - 1, 0)

    return reference.iloc[positions].reset_index(drop=True)


# FED effective rates https://fred.stlouisfed.org/series/FEDFUNDS
fed_data = pd.read_csv('../data/FEDFUNDS.csv', usecols=["DATE", "FEDFUNDS"])
fed_data = fed_data.rename(columns=lambda x: x.lower().replace(" ", "_"))
fed_data['date'] = pd.to_datetime(fed_data['date'])
fed_data = preprocess_fed_data(btc_usdt_data, fed_data)

# Augment data
# Assign by position: the aligned frame is indexed 0..n-1, while the labels of
# btc_usdt_data are not guaranteed to be a clean range, so label-based
# alignment could attach values to the wrong candles.
assert len(fed_data) == len(btc_usdt_data)
btc_usdt_data['effective_rates'] = fed_data['fedfunds'].to_numpy()

# Plot data
go.Figure(go.Scatter(y=btc_usdt_data['effective_rates'].iloc[::100], x=btc_usdt_data['close_time'].iloc[::100])).show()

btc_usdt_data.head()

Unnamed: 0,open_time,open_price,high_price,low_price,close_price,volume,close_time,vix_close_price,effective_rates
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061,2017-08-17 04:04:59.999,11.74,1.16
1,2017-08-17 04:05:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:09:59.999,11.74,1.16
2,2017-08-17 04:10:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:14:59.999,11.74,1.16
3,2017-08-17 04:15:00,4261.48,4264.88,4261.48,4261.48,0.484666,2017-08-17 04:19:59.999,11.74,1.16
4,2017-08-17 04:20:00,4264.88,4266.29,4264.88,4266.29,2.32857,2017-08-17 04:24:59.999,11.74,1.16


## Load and preprocess Crypto fear and greed index

In [6]:
def preprocess_feargreed_data(data, feargreed_data):
    """Pick, for every row of ``data``, the fear/greed row from the previous available day.

    For each ``data['close_time']`` the latest ``feargreed_data`` row whose
    ``timestamp`` falls on a strictly earlier calendar day is selected. Rows of
    ``data`` that are not preceded by any entry fall back to the earliest row.

    The lookup is a vectorised binary search instead of a per-row Python loop,
    so it stays fast for hundreds of thousands of candles.

    Args:
        data: DataFrame with a ``close_time`` timestamp column.
        feargreed_data: DataFrame with a ``timestamp`` column; it is sorted by
            ``timestamp`` here, so the caller does not need to pre-sort it.

    Returns:
        DataFrame with the columns of ``feargreed_data``, one row per row of
        ``data`` (same order), indexed ``0..len(data)-1``.

    Raises:
        IndexError: if ``feargreed_data`` is empty while ``data`` is not.
    """
    # Binary search needs ascending dates; a stable sort keeps ties in input order.
    reference = feargreed_data.sort_values('timestamp', kind='stable').reset_index(drop=True)

    # Compare calendar days only, ignoring the time of day.
    reference_days = pd.to_datetime(reference['timestamp']).dt.normalize().to_numpy()
    row_days = pd.to_datetime(data['close_time']).dt.normalize().to_numpy()

    # side='left' counts the reference days strictly before each row's day;
    # one less is the position of the latest such entry (clamped to the first row).
    positions = np.maximum(np.searchsorted(reference_days, row_days, side='left') - 1, 0)

    return reference.iloc[positions].reset_index(drop=True)

# Fear/Greed index from https://alternative.me/crypto/fear-and-greed-index/
# One-off download that produced the CSV (kept for provenance):
# fear_greed_data = pd.DataFrame(requests.get("https://api.alternative.me/fng/?limit=0").json()['data'])
# fear_greed_data.to_csv('../data/btcusdt_5m/fear_greed.csv', index=False)
feargreed_data = pd.read_csv("../data/fear_greed.csv", usecols=["timestamp", "value"])
feargreed_data['timestamp'] = pd.to_datetime(feargreed_data['timestamp'], unit='s')
feargreed_data = feargreed_data.sort_values('timestamp').reset_index(drop=True)
feargreed_data = preprocess_feargreed_data(btc_usdt_data, feargreed_data)

# Augment data
# Assign by position: the aligned frame is indexed 0..n-1, while the labels of
# btc_usdt_data are not guaranteed to be a clean range, so label-based
# alignment could attach values to the wrong candles.
assert len(feargreed_data) == len(btc_usdt_data)
btc_usdt_data['fear_greed_index'] = feargreed_data['value'].to_numpy()

# Plot data
go.Figure(go.Scatter(y=btc_usdt_data['fear_greed_index'].iloc[::100], x=btc_usdt_data['close_time'].iloc[::100])).show()

btc_usdt_data.head()

Unnamed: 0,open_time,open_price,high_price,low_price,close_price,volume,close_time,vix_close_price,effective_rates,fear_greed_index
0,2017-08-17 04:00:00,4261.48,4280.56,4261.48,4261.48,2.189061,2017-08-17 04:04:59.999,11.74,1.16,30
1,2017-08-17 04:05:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:09:59.999,11.74,1.16,30
2,2017-08-17 04:10:00,4261.48,4261.48,4261.48,4261.48,0.0,2017-08-17 04:14:59.999,11.74,1.16,30
3,2017-08-17 04:15:00,4261.48,4264.88,4261.48,4261.48,0.484666,2017-08-17 04:19:59.999,11.74,1.16,30
4,2017-08-17 04:20:00,4264.88,4266.29,4264.88,4266.29,2.32857,2017-08-17 04:24:59.999,11.74,1.16,30


## Augment dataset
Augment dataset with technical indicators and other additional data

In [36]:
def _realized_volatility(squared_log_returns, window):
    """Square root of the trailing ``window``-row sum of squared log returns.

    Rows that have fewer than ``window`` predecessors use every row seen so far.
    """
    cumulative = np.cumsum(squared_log_returns)
    lagged = np.zeros_like(cumulative)
    if window < len(cumulative):
        lagged[window:] = cumulative[:-window]
    # Subtracting two cumulative sums can round to a tiny negative number;
    # clamp at zero so the square root never yields NaN.
    return np.sqrt(np.maximum(cumulative - lagged, 0.0))


def preprocess_augment_data(data):
    """Return a copy of ``data`` extended with derived features and technical indicators.

    Added columns: ``time_index``/``group_id`` bookkeeping, hour and weekday of
    the close time, OHLC ratios, simple/binary/log returns, realized volatility
    over 1h/1d/7d windows, MACD (+ signal), RSI, Bollinger bands, and SMA/EMA
    values relative to the close price. Indicator warm-up rows (NaN from talib)
    are filled with 0.

    Args:
        data: DataFrame of 5-minute candles with at least ``close_price``,
            ``close_time``, ``open_price``, ``high_price`` and ``low_price``.

    Returns:
        A new DataFrame indexed ``0..n-1``; ``data`` itself is not modified.
    """
    assert 'close_price' in data.columns
    assert 'close_time' in data.columns

    # Window lengths expressed in 5-minute candles.
    candles_1h = 12
    candles_1d = 12 * 24
    candles_7d = 12 * 24 * 7

    # reset_index returns a new frame, so its result has to be kept; otherwise
    # time_index below would inherit whatever labels the input carried.
    result = data.copy().reset_index(drop=True)

    # Required for pytorch forecasting framework
    result['time_index'] = result.index
    result['group_id'] = 'BTCUSDT'

    # Derived variables
    result['hour'] = result['close_time'].apply(lambda x: x.hour)
    result['weekday'] = result['close_time'].apply(lambda x: x.weekday())
    result['open_to_close_price'] = result['open_price'] / result['close_price']
    result['high_to_close_price'] = result['high_price'] / result['close_price']
    result['low_to_close_price'] = result['low_price'] / result['close_price']
    result['high_to_low_price'] = result['high_price'] / result['low_price']

    close_price = result['close_price'].to_numpy()

    # Returns (the first row has no predecessor and is set to 0)
    returns = (close_price[1:] / close_price[:-1]) - 1
    result['returns'] = np.pad(returns, (1, 0), 'constant', constant_values=(0, 0))

    # Returns binary
    result['returns_binary'] = (result['returns'] > 0).astype(np.int32)

    # Log returns (the first row has no predecessor and is set to 0)
    log_returns = np.pad(np.diff(np.log(close_price)), (1, 0), 'constant', constant_values=(0, 0))
    result['log_returns'] = log_returns

    # Realized volatility. Square into a NEW array: squaring the column's own
    # buffer in place would overwrite the log_returns values stored above.
    squared_log_returns = np.square(log_returns)
    result['vol_1h'] = _realized_volatility(squared_log_returns, candles_1h)
    result['vol_1d'] = _realized_volatility(squared_log_returns, candles_1d)
    result['vol_7d'] = _realized_volatility(squared_log_returns, candles_7d)

    macd, signal, _ = talib.MACD(close_price, fastperiod=12, slowperiod=26, signalperiod=9)
    result['macd'] = macd
    result['macd'] = result['macd'].fillna(0)
    result['macd_signal'] = signal
    result['macd_signal'] = result['macd_signal'].fillna(0)

    rsi = talib.RSI(close_price, timeperiod=14)
    result['rsi'] = rsi
    result['rsi'] = result['rsi'].fillna(0)

    upper, middle, lower = talib.BBANDS(close_price, 20, 2.0, 2.0)
    result['low_bband_to_close_price'] = (lower / result['close_price']).fillna(0)
    result['up_bband_to_close_price'] = (upper / result['close_price']).fillna(0)
    result['mid_bband_to_close_price'] = (middle / result['close_price']).fillna(0)

    # Each moving average uses the window its column name promises; without an
    # explicit timeperiod all of them would share talib's default and be identical.
    result['sma_1h_to_close_price'] = (talib.SMA(close_price, timeperiod=candles_1h) / result['close_price']).fillna(0)
    result['sma_1d_to_close_price'] = (talib.SMA(close_price, timeperiod=candles_1d) / result['close_price']).fillna(0)
    result['sma_7d_to_close_price'] = (talib.SMA(close_price, timeperiod=candles_7d) / result['close_price']).fillna(0)

    result['ema_1h_to_close_price'] = (talib.EMA(close_price, timeperiod=candles_1h) / result['close_price']).fillna(0)
    result['ema_1d_to_close_price'] = (talib.EMA(close_price, timeperiod=candles_1d) / result['close_price']).fillna(0)

    return result

# Extend the candle frame with the derived features and indicators.
btc_usdt_data = preprocess_augment_data(btc_usdt_data)

# No row may contain a missing value once augmentation is done.
complete_rows = btc_usdt_data.dropna()
assert len(complete_rows) == len(btc_usdt_data)

In [38]:
# Save processed dataset
# The augmented frame is written as CSV; index=False leaves the DataFrame
# index out of the file, so only the named columns are stored.
btc_usdt_data.to_csv('../data/processed-btc-usdt-5m.csv', index=False)

## Dataset statistics

In [39]:
# Reload the processed dataset from disk and summarise it.
btc_usdt_data = pd.read_csv('../data/processed-btc-usdt-5m.csv')
btc_usdt_data['close_time'] = pd.to_datetime(btc_usdt_data['close_time'])

# Whole days between the first and the last candle close.
first_close = btc_usdt_data['close_time'].iloc[0]
last_close = btc_usdt_data['close_time'].iloc[-1]
covered_period = last_close - first_close
time_span_days = covered_period.days

print("---- DATASET BTC-USDT ----")
print(f"Num observations: {len(btc_usdt_data)}")
print(f"First observation: {btc_usdt_data['close_time'].iloc[0]}")
print(f"Last observation: {btc_usdt_data['close_time'].iloc[-1]}")
print(f"Time span: {time_span_days} days ({time_span_days / 365:.2} years)")
print("::Variables::")
# One line per column name.
column_names = btc_usdt_data.columns
for col in column_names:
    print("-", col)

---- DATASET BTC-USDT ----
Num observations: 730437
First observation: 2017-08-17 04:04:59.999000
Last observation: 2024-07-27 09:44:59.999000
Time span: 2536 days (6.9 years)
::Variables::
- open_time
- open_price
- high_price
- low_price
- close_price
- volume
- close_time
- vix_close_price
- effective_rates
- fear_greed_index
- time_index
- group_id
- hour
- weekday
- open_to_close_price
- high_to_close_price
- low_to_close_price
- high_to_low_price
- returns
- returns_binary
- log_returns
- vol_1h
- vol_1d
- vol_7d
- macd
- macd_signal
- rsi
- low_bband_to_close_price
- up_bband_to_close_price
- mid_bband_to_close_price
- sma_1h_to_close_price
- sma_1d_to_close_price
- sma_7d_to_close_price
- ema_1h_to_close_price
- ema_1d_to_close_price


## Split dataset into moving window parts

In [40]:
def load_btcusdt_dataset(path):
    """Read the processed BTC-USDT CSV at ``path`` and parse its time columns.

    ``close_time`` and ``open_time`` are converted to pandas datetimes; all
    other columns are returned as ``pd.read_csv`` produced them.
    """
    dataset = pd.read_csv(path)
    for time_column in ('close_time', 'open_time'):
        dataset[time_column] = pd.to_datetime(dataset[time_column])
    return dataset

# Load the processed dataset written earlier in this notebook, with both time columns parsed.
btc_usdt_dataset = load_btcusdt_dataset('../data/processed-btc-usdt-5m.csv')

In [42]:
NUM_MOVING_WINDOWS = 5

def split_dataset_moving_window(
        data,
        num_parts,
        in_sample_size=40000,
        out_of_sample_size=10000):
    """Split ``data`` into ``num_parts`` rolling (in-sample, out-of-sample) pairs.

    Windows are anchored at the END of ``data``: the newest window's
    out-of-sample part is the last ``out_of_sample_size`` rows, and every older
    window is shifted back by ``out_of_sample_size`` rows. Each out-of-sample
    part directly follows its in-sample part.

    Args:
        data: DataFrame ordered oldest to newest.
        num_parts: number of windows to produce.
        in_sample_size: rows in each in-sample part.
        out_of_sample_size: rows in each out-of-sample part.

    Returns:
        List of ``(in_sample, out_of_sample)`` DataFrame tuples in time order
        (oldest window first); every part is re-indexed from 0.

    Raises:
        ValueError: if a size is negative, or if ``data`` is too short to hold
            all requested windows. A negative slice start would otherwise wrap
            around and silently return wrong or empty parts.
    """
    if in_sample_size < 0 or out_of_sample_size < 0:
        raise ValueError("in_sample_size and out_of_sample_size must be non-negative")

    dataset_len = len(data)
    window_size = in_sample_size + out_of_sample_size

    # The oldest window starts at dataset_len - required_len; it must not be negative.
    required_len = in_sample_size + num_parts * out_of_sample_size
    if num_parts > 0 and required_len > dataset_len:
        raise ValueError(
            f"Dataset has {dataset_len} rows but {num_parts} windows need at least {required_len}"
        )

    print(f"In sample size: {in_sample_size}")
    print(f"Out of sample size: {out_of_sample_size}")

    result = []
    # Walk from the oldest window to the newest so the result is already in time order.
    for i in reversed(range(num_parts)):
        part_len = dataset_len - i * out_of_sample_size
        in_sample_part = data.iloc[part_len - window_size: part_len - out_of_sample_size].reset_index(drop=True)
        out_of_sample_part = data.iloc[part_len - out_of_sample_size: part_len].reset_index(drop=True)
        result.append((in_sample_part, out_of_sample_part))

    return result

# Number of 5-minute candles in a 30-day "month".
CANDLES_PER_MONTH = (30 * 24 * 60) // 5

windows = split_dataset_moving_window(
    btc_usdt_dataset,
    NUM_MOVING_WINDOWS,
    in_sample_size=12 * CANDLES_PER_MONTH,  # 12 months
    out_of_sample_size=3 * CANDLES_PER_MONTH,  # 3 months
)

In sample size: 103680
Out of sample size: 25920


In [44]:
# Figure layout: one thin row per moving window, plus one tall bottom row that
# takes the remaining height and holds the close-price line. All rows share the x axis.
fig = make_subplots(
    rows=NUM_MOVING_WINDOWS + 1,
    cols=1,
    row_heights=[0.05] * NUM_MOVING_WINDOWS + [1 - 0.05 * NUM_MOVING_WINDOWS],
    vertical_spacing=0.05,
    shared_xaxes=True)

# Each window is drawn as two bars: in-sample (light grey) followed by
# out-of-sample (dark grey). `windows` is in time order, so the oldest window
# (i == 0) lands in row NUM_MOVING_WINDOWS and the newest one in row 1.
for i, (in_sample, out_sample) in enumerate(windows):
    fig.add_trace(go.Bar(
        y = [1, 1],
        # Bars are positioned at the first close time of each part ...
        x = [in_sample['close_time'].iloc[0], out_sample['close_time'].iloc[0]],
        marker_color=['lightgrey', 'darkgrey'],
        # ... and are as wide as the part's time span, converted from seconds
        # to milliseconds (presumably the unit plotly expects on a date axis - TODO confirm).
        width = [
           (in_sample['close_time'].iloc[-1] - in_sample['close_time'].iloc[0]).total_seconds()*1000,
           (out_sample['close_time'].iloc[-1] - out_sample['close_time'].iloc[0]).total_seconds()*1000
        ],
        offset=0
    ), row=NUM_MOVING_WINDOWS - i, col=1)

# Bottom row: close price, subsampled to every 100th candle.
fig.add_trace(go.Scatter(
    y=btc_usdt_dataset['close_price'].iloc[::100],
    x=btc_usdt_dataset['close_time'].iloc[::100],
    marker_color='black',
), row=NUM_MOVING_WINDOWS+1, col=1)
# Hide y tick labels everywhere first, then re-enable them for the price row only.
fig.update_yaxes(showticklabels=False)
fig.update_yaxes(showticklabels=True, row=NUM_MOVING_WINDOWS+1, col=1)
# Restrict the x range to the span from the start of the oldest in-sample part
# to the end of the newest out-of-sample part.
fig.update_xaxes(range=[
    windows[0][0]['close_time'].iloc[0],
    windows[-1][-1]['close_time'].iloc[-1]])
fig.update_layout(
    title=dict(
        text="Rolling window BTC-USDT",
        x=0.5,
        xanchor='center'
    ),
    showlegend=False,
    plot_bgcolor="rgb(250,250,250)",
)
fig.show()

## Upload the split dataset to wandb

In [45]:
import wandb
import os
from tempfile import TemporaryDirectory

In [None]:
WANDB_PROJECT = 'wne-masters-thesis-testing'
DATASET_NAME = 'btc-usdt-5m'

# Write every window to a temporary directory and upload the directory as a
# single wandb dataset artifact.
with TemporaryDirectory() as tempdir:
    for i, (in_sample, out_sample) in enumerate(windows):
        in_sample.to_csv(os.path.join(tempdir, DATASET_NAME + f'-in-sample-{i}.csv'), index=False)
        out_sample.to_csv(os.path.join(tempdir, DATASET_NAME + f'-out-of-sample-{i}.csv'), index=False)

    # Using the run as a context manager finishes it when the block exits, so
    # the run is not left open in the kernel and the upload is completed
    # before the temporary directory is removed.
    with wandb.init(project=WANDB_PROJECT) as run:
        artifact = wandb.Artifact(DATASET_NAME, type="dataset", metadata={
            'name': DATASET_NAME,
            'num_windows': NUM_MOVING_WINDOWS
        })
        artifact.add_dir(tempdir)
        run.log_artifact(artifact)