In [1]:
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import logging
import numpy as np
import talib

logging.basicConfig(level=logging.INFO)

## Load and preprocess BTC/USDT data
Uses kline (candlestick) data downloaded from Binance at 5-, 15- and 30-minute intervals.

In [2]:
def fill_missing_values(data, expected_interval, low_time=None):
    """Trim early klines and return an evenly spaced copy of the series.

    Rows whose ``close_time`` is before ``low_time`` are dropped. For the rows
    that remain:

    * a candle whose ``close_time - open_time`` is shorter than
      ``expected_interval - 1ms`` gets its ``close_time`` moved to
      ``open_time + expected_interval - 1ms``;
    * every gap between consecutive ``open_time`` values is filled with copies
      of the last row before the gap, shifted forward by ``expected_interval``
      each time (all other columns are carried over unchanged).

    The first retained row is taken as-is.

    Args:
        data: DataFrame with ``open_time`` and ``close_time`` timestamp
            columns, ordered by ``open_time``.
        expected_interval: ``pd.Timedelta`` between consecutive candles.
        low_time: Optional lower bound on ``close_time``. Defaults to
            2019-07-21 00:00, the cut-off this notebook has always used.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index; empty when no row
        survives the trim.

    Raises:
        ValueError: if two consecutive rows are not a positive whole number of
            ``expected_interval`` apart (duplicate, out-of-order or misaligned
            candles). The gap-filling loop used to spin forever on such input.
    """
    if low_time is None:
        low_time = pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)

    data = data.loc[data['close_time'] >= low_time]

    num_new_intervals, num_fixed_intervals = 0, 0
    preprocessed_data = []

    if not data.empty:
        zero = pd.Timedelta(0)
        # Candles close one millisecond before the next one opens.
        expected_internal_interval = expected_interval - pd.Timedelta(milliseconds=1)
        prev_row = data.iloc[0]
        preprocessed_data.append(prev_row)

        for _, row in data.iloc[1:].iterrows():
            interval = row['open_time'] - prev_row['open_time']
            if interval <= zero or interval % expected_interval != zero:
                raise ValueError(
                    "Cannot align klines: open_time %s follows %s by %s, which is not a "
                    "positive multiple of %s"
                    % (row['open_time'], prev_row['open_time'], interval, expected_interval)
                )

            internal_interval = row['close_time'] - row['open_time']
            if internal_interval < expected_internal_interval:
                # Work on a copy so the caller's frame is never written through.
                row = row.copy()
                row['close_time'] = row['open_time'] + expected_internal_interval
                num_fixed_intervals += 1

            # One synthetic candle per missing step, each cloned from its predecessor.
            for _gap in range(interval // expected_interval - 1):
                prev_row = prev_row.copy()
                prev_row['open_time'] = prev_row['open_time'] + expected_interval
                prev_row['close_time'] = prev_row['close_time'] + expected_interval
                preprocessed_data.append(prev_row)
                num_new_intervals += 1

            prev_row = row
            preprocessed_data.append(row)

    logging.info("Inserted %d new intervals.", num_new_intervals)
    logging.info("Fixed %d intervals.", num_fixed_intervals)

    if not preprocessed_data:
        # Nothing survived the trim: keep the schema, return zero rows.
        return data.reset_index(drop=True)

    result = pd.DataFrame(preprocessed_data)
    result = result.reset_index(drop=True)
    return result

In [3]:
def load_btc_usdt_dataset(path, interval):
    """Load a Binance kline CSV, regularise its spacing and plot the close price.

    Column names are lower-cased with spaces turned into underscores, the two
    millisecond-epoch time columns are parsed, and the frame is passed through
    ``fill_missing_values``. The function then asserts that consecutive open
    times (and close times) are all the same distance apart, logs a few
    statistics and shows a Plotly line chart of every 10th close price.

    Args:
        path: Location of the raw CSV file.
        interval: Expected ``pd.Timedelta`` between candles.

    Returns:
        The cleaned DataFrame.
    """
    logging.info("=" * 80)
    logging.info("Loading dataset '%s' with interval %s", path, interval)

    wanted_columns = ['Open time', 'Open price', 'High price', 'Low price', 'Close price', 'Volume', 'Close time']
    klines = pd.read_csv(path, usecols=wanted_columns)
    klines = klines.rename(columns=lambda name: name.lower().replace(" ", "_"))
    for time_column in ('open_time', 'close_time'):
        klines[time_column] = pd.to_datetime(klines[time_column], unit='ms')
    klines = fill_missing_values(klines, interval)

    def distinct_steps(column):
        # Difference between each row and its successor, index-aligned.
        following = klines.shift(-1).dropna()[column]
        return (following - klines.iloc[:-1][column]).unique()

    # Sanity check assertions
    unique_intervals_open_time = distinct_steps('open_time')
    unique_intervals_close_time = distinct_steps('close_time')
    assert len(unique_intervals_open_time) == 1
    assert len(unique_intervals_close_time) == 1
    logging.info("There is the same interval between all open times: %s", unique_intervals_open_time[0])
    logging.info("There is the same interval between all close times: %s", unique_intervals_close_time[0])

    # Print statistics
    close_times = klines['close_time']
    logging.info("First data point: %s", close_times.iloc[0])
    logging.info("Last data point: %s", close_times.iloc[-1])
    logging.info("Number of data points: %d", len(klines))

    # Plot data (every 10th point keeps the figure light)
    price_trace = go.Scatter(y=klines['close_price'].iloc[::10], x=close_times.iloc[::10])
    go.Figure(price_trace).show()

    return klines

In [4]:
# Candle widths of the three datasets, as Timedelta values
# (the form load_btc_usdt_dataset forwards to fill_missing_values).
INTERVAL_5_MIN = pd.Timedelta(5, unit='min')
INTERVAL_15_MIN = pd.Timedelta(15, unit='min')
INTERVAL_30_MIN = pd.Timedelta(30, unit='min')

In [3]:
# btc_usdt_5_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-5m.csv', INTERVAL_5_MIN)
# btc_usdt_5_min_data.head()

In [1]:
# btc_usdt_15_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-15m.csv', INTERVAL_15_MIN)
# btc_usdt_15_min_data.head()

In [2]:
# btc_usdt_30_min_data = load_btc_usdt_dataset('../data/raw_data/btc-usdt-30m.csv', INTERVAL_30_MIN)
# btc_usdt_30_min_data.head()

## Load and preprocess VIX data

In [66]:
VIX_COL_NAME = 'vix_close_price'

def preprocess_vix_data(data, vix_data):
    """Pick, for every kline, the latest VIX row dated strictly before its day.

    The comparison is made on calendar days, so a kline always receives the
    value of an earlier day, never the one published on its own date.

    Args:
        data: Kline DataFrame with a ``close_time`` timestamp column, in
            chronological order.
        vix_data: VIX DataFrame with a ``date`` column. It must be sorted
            ascending by ``date``; the lookup is a binary search.

    Returns:
        A DataFrame with one row of ``vix_data`` per row of ``data``, in the
        same order, indexed ``0..n-1``.

    Note:
        Klines dated on or before the first VIX date receive the first VIX row,
        which is a value from the same day or from the future.
    """
    # Compare whole days only, as the former per-row `.date()` check did.
    kline_days = pd.to_datetime(data['close_time']).dt.normalize().to_numpy()
    vix_days = pd.to_datetime(vix_data['date']).dt.normalize().to_numpy()

    # side='left' counts reference days strictly before each kline day;
    # subtracting one turns that count into the position of the latest such row.
    positions = np.searchsorted(vix_days, kline_days, side='left') - 1
    # No earlier day available -> fall back to the first reference row.
    positions = np.clip(positions, 0, None)
    # The sequential scan this replaces never moved backwards; keep that property.
    positions = np.maximum.accumulate(positions)

    return vix_data.iloc[positions].reset_index(drop=True)


# VIX daily data https://www.cboe.com/tradable_products/vix/vix_historical_data/
# Only DATE and CLOSE are read; headers are lower-cased and the date column parsed.
vix_data = (
    pd.read_csv("../data/raw_data/VIX_History.csv", usecols=["DATE", "CLOSE"])
    .rename(columns=lambda name: name.lower().replace(" ", "_"))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [4]:
# vix_5_min_data = preprocess_vix_data(btc_usdt_5_min_data, vix_data)
# btc_usdt_5_min_data[VIX_COL_NAME] = vix_5_min_data['close']
# go.Figure(go.Scatter(y=btc_usdt_5_min_data['vix_close_price'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()
# btc_usdt_5_min_data.head()

In [5]:
# vix_15_min_data = preprocess_vix_data(btc_usdt_15_min_data, vix_data)
# btc_usdt_15_min_data[VIX_COL_NAME] = vix_15_min_data['close']
# go.Figure(go.Scatter(y=btc_usdt_15_min_data['vix_close_price'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()
# btc_usdt_15_min_data.head()

In [6]:
# vix_30_min_data = preprocess_vix_data(btc_usdt_30_min_data, vix_data)
# btc_usdt_30_min_data[VIX_COL_NAME] = vix_30_min_data['close']
# go.Figure(go.Scatter(y=btc_usdt_30_min_data['vix_close_price'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()
# btc_usdt_30_min_data.head()

## Load and preprocess FED data

In [70]:
FED_COL_NAME='effective_rates'

def preprocess_fed_data(data, fed_data):
    """Pick, for every kline, the latest FED row dated strictly before its day.

    The comparison is made on calendar days, so a kline dated exactly on a
    reference date still receives the previous reference row.

    Args:
        data: Kline DataFrame with a ``close_time`` timestamp column, in
            chronological order.
        fed_data: Rates DataFrame with a ``date`` column. It must be sorted
            ascending by ``date``; the lookup is a binary search.

    Returns:
        A DataFrame with one row of ``fed_data`` per row of ``data``, in the
        same order, indexed ``0..n-1``.

    Note:
        Klines dated on or before the first reference date receive the first
        reference row, which is a value from the same day or from the future.
    """
    # Compare whole days only, as the former per-row `.date()` check did.
    kline_days = pd.to_datetime(data['close_time']).dt.normalize().to_numpy()
    fed_days = pd.to_datetime(fed_data['date']).dt.normalize().to_numpy()

    # side='left' counts reference days strictly before each kline day;
    # subtracting one turns that count into the position of the latest such row.
    positions = np.searchsorted(fed_days, kline_days, side='left') - 1
    # No earlier day available -> fall back to the first reference row.
    positions = np.clip(positions, 0, None)
    # The sequential scan this replaces never moved backwards; keep that property.
    positions = np.maximum.accumulate(positions)

    return fed_data.iloc[positions].reset_index(drop=True)


# FED effective rates https://fred.stlouisfed.org/series/FEDFUNDS
# Only DATE and FEDFUNDS are read; headers are lower-cased and the date column parsed.
fed_data = (
    pd.read_csv('../data/raw_data/FEDFUNDS.csv', usecols=["DATE", "FEDFUNDS"])
    .rename(columns=lambda name: name.lower().replace(" ", "_"))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [7]:
# fed_5_min_data = preprocess_fed_data(btc_usdt_5_min_data, fed_data)
# btc_usdt_5_min_data[FED_COL_NAME] = fed_5_min_data['fedfunds']
# go.Figure(go.Scatter(y=btc_usdt_5_min_data['effective_rates'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()
# btc_usdt_5_min_data.head()

In [8]:
# fed_15_min_data = preprocess_fed_data(btc_usdt_15_min_data, fed_data)
# btc_usdt_15_min_data[FED_COL_NAME] = fed_15_min_data['fedfunds']
# go.Figure(go.Scatter(y=btc_usdt_15_min_data['effective_rates'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()
# btc_usdt_15_min_data.head()

In [9]:
# fed_30_min_data = preprocess_fed_data(btc_usdt_30_min_data, fed_data)
# btc_usdt_30_min_data[FED_COL_NAME] = fed_30_min_data['fedfunds']
# go.Figure(go.Scatter(y=btc_usdt_30_min_data['effective_rates'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()
# btc_usdt_30_min_data.head()

## Load and preprocess Crypto fear and greed index

In [74]:
FEAR_GREED_COL_NAME = 'fear_greed_index'

def preprocess_feargreed_data(data, feargreed_data):
    """Pick, for every kline, the latest index row dated strictly before its day.

    The comparison is made on calendar days, so a kline always receives the
    reading of an earlier day, never the one stamped with its own date.

    Args:
        data: Kline DataFrame with a ``close_time`` timestamp column, in
            chronological order.
        feargreed_data: Index DataFrame with a ``timestamp`` column. It must be
            sorted ascending by ``timestamp``; the lookup is a binary search.

    Returns:
        A DataFrame with one row of ``feargreed_data`` per row of ``data``, in
        the same order, indexed ``0..n-1``.

    Note:
        Klines dated on or before the first reading receive the first reading,
        which is a value from the same day or from the future.
    """
    # Compare whole days only, as the former per-row `.date()` check did.
    kline_days = pd.to_datetime(data['close_time']).dt.normalize().to_numpy()
    reading_days = pd.to_datetime(feargreed_data['timestamp']).dt.normalize().to_numpy()

    # side='left' counts reference days strictly before each kline day;
    # subtracting one turns that count into the position of the latest such row.
    positions = np.searchsorted(reading_days, kline_days, side='left') - 1
    # No earlier day available -> fall back to the first reference row.
    positions = np.clip(positions, 0, None)
    # The sequential scan this replaces never moved backwards; keep that property.
    positions = np.maximum.accumulate(positions)

    return feargreed_data.iloc[positions].reset_index(drop=True)

# Fear/Greed index from https://alternative.me/crypto/fear-and-greed-index/
# fear_greed_data = pd.DataFrame(requests.get("https://api.alternative.me/fng/?limit=0").json()['data'])
# fear_greed_data.to_csv('../data/btcusdt_5m/fear_greed.csv', index=False)
# NOTE(review): the download snippet above writes to ../data/btcusdt_5m/ while the
# read below uses ../data/raw_data/ - confirm which location is current.
# Timestamps are epoch seconds; rows are put in chronological order after parsing.
feargreed_data = (
    pd.read_csv("../data/raw_data/fear_greed.csv", usecols=["timestamp", "value"])
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))
    .sort_values('timestamp')
    .reset_index(drop=True)
)

In [10]:
# feargreed_5_min_data = preprocess_feargreed_data(btc_usdt_5_min_data, feargreed_data)
# btc_usdt_5_min_data[FEAR_GREED_COL_NAME] = feargreed_5_min_data['value']
# go.Figure(go.Scatter(y=btc_usdt_5_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_5_min_data['close_time'].iloc[::10])).show()
# btc_usdt_5_min_data.head()

In [11]:
# feargreed_15_min_data = preprocess_feargreed_data(btc_usdt_15_min_data, feargreed_data)
# btc_usdt_15_min_data[FEAR_GREED_COL_NAME] = feargreed_15_min_data['value']
# go.Figure(go.Scatter(y=btc_usdt_15_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_15_min_data['close_time'].iloc[::10])).show()
# btc_usdt_15_min_data.head()

In [12]:
# feargreed_30_min_data = preprocess_feargreed_data(btc_usdt_30_min_data, feargreed_data)
# btc_usdt_30_min_data[FEAR_GREED_COL_NAME] = feargreed_30_min_data['value']
# go.Figure(go.Scatter(y=btc_usdt_30_min_data['fear_greed_index'].iloc[::10], x=btc_usdt_30_min_data['close_time'].iloc[::10])).show()
# btc_usdt_30_min_data.head()

## Augment dataset
Augment dataset with technical indicators and other additional data

In [91]:
def preprocess_augment_data(data, interval):
    """Return a copy of ``data`` extended with calendar, return and indicator columns.

    Added columns, in order: ``time_index``, ``group_id``, ``hour``,
    ``weekday``, the four price ratios, ``returns``, ``returns_binary``,
    ``log_returns``, then for the 1h / 1d / 7d windows the realized volatility
    and SMA (plus EMA for 1h and 1d) relative to the close price, and finally
    MACD, MACD signal, RSI and the three Bollinger bands relative to close.

    Indicator warm-up NaNs are replaced with 0, and the first ``returns`` /
    ``log_returns`` value is 0.

    Args:
        data: DataFrame with at least ``open_price``, ``high_price``,
            ``low_price``, ``close_price`` and ``close_time`` columns and a
            ``0..n-1`` index (the index becomes ``time_index``).
        interval: Candle width in minutes (e.g. ``5``). A ``pd.Timedelta`` is
            also accepted and converted to whole minutes.

    Returns:
        The augmented DataFrame; ``data`` itself is not modified.

    Raises:
        ValueError: if ``interval`` is not positive or is longer than 60
            minutes (the 1h window would then hold zero observations).
        AssertionError: if a required column is missing, a NaN is left in the
            result, or ``time_index`` is not unique.
    """
    assert 'close_price' in data.columns
    assert 'close_time' in data.columns

    if isinstance(interval, pd.Timedelta):
        interval = int(interval.total_seconds() // 60)
    if interval <= 0 or 60 // interval < 1:
        raise ValueError(f"interval must be between 1 and 60 minutes, got {interval!r}")
    obs_per_hour = 60 // interval

    result = data.copy()

    # Required for pytorch forecasting framework
    result['time_index'] = result.index
    result['group_id'] = 'BTCUSDT'

    # Derived variables
    close_time = pd.to_datetime(result['close_time'])
    result['hour'] = close_time.dt.hour.astype('str')
    result['weekday'] = close_time.dt.weekday.astype('str')
    result['open_to_close_price'] = result['open_price'] / result['close_price']
    result['high_to_close_price'] = result['high_price'] / result['close_price']
    result['low_to_close_price'] = result['low_price'] / result['close_price']
    result['high_to_low_price'] = result['high_price'] / result['low_price']

    # float64 copy: talib only accepts double arrays, and a private copy keeps
    # every computation below from writing into the frame's storage.
    close_price = result['close_price'].to_numpy(dtype=np.float64)

    # Returns
    ret = (close_price[1:] / close_price[:-1]) - 1
    result['returns'] = np.pad(ret, (1, 0), 'constant', constant_values=(0, 0))

    # Returns binary
    result['returns_binary'] = (result['returns'] > 0).astype(np.int32)

    # Log returns
    log_ret = np.diff(np.log(close_price))
    result['log_returns'] = np.pad(log_ret, (1, 0), 'constant', constant_values=(0, 0))

    # Realized volatility: sqrt of the rolling sum of squared log returns.
    # np.square allocates a new array. The previous in-place `cum_sum *= cum_sum`
    # squared the buffer backing the `log_returns` column, corrupting that column.
    cum_sum = np.cumsum(np.square(result['log_returns'].to_numpy()))

    def rolling_vol(window):
        # Running total `window` rows earlier; zero while the window is still
        # filling, including when the whole series is shorter than it.
        lagged = np.zeros_like(cum_sum)
        if window < len(cum_sum):
            lagged[window:] = cum_sum[:-window]
        return np.sqrt(cum_sum - lagged)

    windows = (
        ('1h', obs_per_hour, True),
        ('1d', 24 * obs_per_hour, True),
        ('7d', 7 * 24 * obs_per_hour, False),  # the feature set has no 7d EMA
    )
    for label, window, with_ema in windows:
        result[f'vol_{label}'] = rolling_vol(window)
        result[f'sma_{label}_to_close_price'] = (talib.SMA(close_price, window) / result['close_price']).fillna(0)
        if with_ema:
            result[f'ema_{label}_to_close_price'] = (talib.EMA(close_price, window) / result['close_price']).fillna(0)

    macd, signal, _ = talib.MACD(close_price, fastperiod=12, slowperiod=26, signalperiod=9)
    result['macd'] = macd
    result['macd'] = result['macd'].fillna(0)
    result['macd_signal'] = signal
    result['macd_signal'] = result['macd_signal'].fillna(0)

    rsi = talib.RSI(close_price, timeperiod=14)
    result['rsi'] = rsi
    result['rsi'] = result['rsi'].fillna(0)

    upper, middle, lower = talib.BBANDS(close_price, 20, 2.0, 2.0)
    result['low_bband_to_close_price'] = (lower / result['close_price']).fillna(0)
    result['up_bband_to_close_price'] = (upper / result['close_price']).fillna(0)
    result['mid_bband_to_close_price'] = (middle / result['close_price']).fillna(0)

    assert len(result.dropna()) == len(result)
    assert len(result['time_index'].unique()) == len(result)

    return result

In [92]:
# Replace each raw frame with its augmented version (second argument: candle width in minutes).
# NOTE(review): the cells that first create these three frames (the load_btc_usdt_dataset
# calls above) are commented out in this notebook, so on a fresh kernel this cell raises
# NameError until they are restored.
btc_usdt_5_min_data = preprocess_augment_data(btc_usdt_5_min_data, 5)
btc_usdt_15_min_data = preprocess_augment_data(btc_usdt_15_min_data, 15)
btc_usdt_30_min_data = preprocess_augment_data(btc_usdt_30_min_data, 30)


## Save preprocessed data

In [93]:
# Write one CSV per candle width; index=False keeps the row index out of the files.
btc_usdt_5_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-5m.csv', index=False)
btc_usdt_15_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-15m.csv', index=False)
btc_usdt_30_min_data.to_csv('../data/preprocessed_data/processed-btc-usdt-30m.csv', index=False)

## Dataset statistics

In [2]:
def load_dataset(path):
    """Read a preprocessed CSV and parse its ``close_time`` / ``open_time`` columns.

    Args:
        path: Location of a CSV containing both time columns.

    Returns:
        The DataFrame with the two columns converted by ``pd.to_datetime``.
    """
    frame = pd.read_csv(path)
    for time_column in ('close_time', 'open_time'):
        frame[time_column] = pd.to_datetime(frame[time_column])
    return frame

def print_dataset_stats(dataset, interval):
    """Print size, time coverage and column names of a dataset.

    Args:
        dataset: Non-empty DataFrame with a ``close_time`` timestamp column;
            the first and last rows define the covered span.
        interval: Candle width shown in the header line.
    """
    first_close = dataset['close_time'].iloc[0]
    last_close = dataset['close_time'].iloc[-1]
    time_span_days = (last_close - first_close).days
    print(f"---- DATASET BTC-USDT {interval} m ----")
    print(f"Num observations: {len(dataset)}")
    print(f"First observation: {first_close}")
    print(f"Last observation: {last_close}")
    # Fixed-point with one decimal: the former `:.2` meant two significant
    # digits, which renders spans of ten years or more as `1e+01`.
    print(f"Time span: {time_span_days} days ({time_span_days / 365:.1f} years)")
    print(f"Variables: {dataset.columns}")


In [3]:
# Reload the 5-minute dataset from the CSV path written by the save cell and summarise it.
btc_usdt_5_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-5m.csv')
print_dataset_stats(btc_usdt_5_min_data, 5)

---- DATASET BTC-USDT 5 m ----
Num observations: 730437
First observation: 2017-08-17 04:04:59.999000
Last observation: 2024-07-27 09:44:59.999000
Time span: 2536 days (6.9 years)
Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',
       'volume', 'close_time', 'vix_close_price', 'effective_rates',
       'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',
       'open_to_close_price', 'high_to_close_price', 'low_to_close_price',
       'high_to_low_price', 'returns', 'returns_binary', 'log_returns',
       'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',
       'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',
       'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',
       'low_bband_to_close_price', 'up_bband_to_close_price',
       'mid_bband_to_close_price'],
      dtype='object')


In [96]:
# Reload the 15-minute dataset from the CSV path written by the save cell and summarise it.
btc_usdt_15_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-15m.csv')
print_dataset_stats(btc_usdt_15_min_data, 15)

---- DATASET BTC-USDT 15 m ----
Num observations: 248118
First observation: 2017-08-17 04:14:59.999000
Last observation: 2024-09-13 17:29:59.999000
Time span: 2584 days (7.1 years)
Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',
       'volume', 'close_time', 'vix_close_price', 'effective_rates',
       'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',
       'open_to_close_price', 'high_to_close_price', 'low_to_close_price',
       'high_to_low_price', 'returns', 'returns_binary', 'log_returns',
       'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',
       'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',
       'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',
       'low_bband_to_close_price', 'up_bband_to_close_price',
       'mid_bband_to_close_price'],
      dtype='object')


In [4]:
# Reload the 30-minute dataset from the CSV path written by the save cell and summarise it.
btc_usdt_30_min_data = load_dataset('../data/preprocessed_data/processed-btc-usdt-30m.csv')
print_dataset_stats(btc_usdt_30_min_data, 30)

---- DATASET BTC-USDT 30 m ----
Num observations: 124059
First observation: 2017-08-17 04:29:59.999000
Last observation: 2024-09-13 17:29:59.999000
Time span: 2584 days (7.1 years)
Variables: Index(['open_time', 'open_price', 'high_price', 'low_price', 'close_price',
       'volume', 'close_time', 'vix_close_price', 'effective_rates',
       'fear_greed_index', 'time_index', 'group_id', 'hour', 'weekday',
       'open_to_close_price', 'high_to_close_price', 'low_to_close_price',
       'high_to_low_price', 'returns', 'returns_binary', 'log_returns',
       'vol_1h', 'sma_1h_to_close_price', 'ema_1h_to_close_price', 'vol_1d',
       'sma_1d_to_close_price', 'ema_1d_to_close_price', 'vol_7d',
       'sma_7d_to_close_price', 'macd', 'macd_signal', 'rsi',
       'low_bband_to_close_price', 'up_bband_to_close_price',
       'mid_bband_to_close_price'],
      dtype='object')


## Split dataset into moving window parts

In [5]:
# Number of (in-sample, out-of-sample) pairs to cut from each dataset.
NUM_MOVING_WINDOWS = 6
# Rows closing after this instant are dropped before the windows are cut.
LIMIT_TIME = pd.Timestamp("2024-07-25 23:59:00")
# Same instant as the lower bound hard-coded in fill_missing_values;
# not referenced by any other cell shown in this notebook.
LOW_TIME = pd.Timestamp("2019-07-21 00:00:00")

def trim_number_of_observations(data, limit_time):
    """Return the rows of ``data`` whose ``close_time`` is at or before ``limit_time``.

    The original index labels are kept.
    """
    not_after_limit = data['close_time'].le(limit_time)
    return data.loc[not_after_limit]

def split_dataset_moving_window(
        data,
        num_parts,
        in_sample_size,
        out_of_sample_size):
    """Cut ``num_parts`` rolling (in-sample, out-of-sample) pairs from the end of ``data``.

    The last pair ends at the final row; each earlier pair is shifted back by
    ``out_of_sample_size`` rows, so the out-of-sample parts tile the tail of
    the data without overlapping.

    Args:
        data: DataFrame in chronological order.
        num_parts: Number of pairs to produce.
        in_sample_size: Rows in each in-sample part.
        out_of_sample_size: Rows in each out-of-sample part.

    Returns:
        List of ``(in_sample, out_of_sample)`` DataFrames, oldest pair first,
        each with a fresh ``0..n-1`` index.

    Raises:
        ValueError: if a size is negative or ``data`` has fewer than
            ``in_sample_size + num_parts * out_of_sample_size`` rows. Without
            this check negative slice starts wrap around and silently yield
            empty or misplaced windows.
    """
    if in_sample_size < 0 or out_of_sample_size < 0:
        raise ValueError("in_sample_size and out_of_sample_size must be non-negative")

    dataset_len = len(data)
    required_len = in_sample_size + num_parts * out_of_sample_size
    if num_parts > 0 and required_len > dataset_len:
        raise ValueError(
            f"Need at least {required_len} rows for {num_parts} windows, got {dataset_len}")

    print(f"In sample size: {in_sample_size}")
    print(f"Out of sample size: {out_of_sample_size}")

    result = []
    # Walk from the oldest window to the newest so the list is already in time order.
    for i in reversed(range(num_parts)):
        part_end = dataset_len - i * out_of_sample_size
        split_at = part_end - out_of_sample_size
        in_sample_part = data.iloc[split_at - in_sample_size: split_at].reset_index(drop=True)
        out_of_sample_part = data.iloc[split_at: part_end].reset_index(drop=True)
        result.append((in_sample_part, out_of_sample_part))

    return result

In [6]:
def plot_moving_windows(data, windows):
    """Show the rolling windows as bar strips above the close-price series.

    One thin subplot row is drawn per window, holding a light bar for the
    in-sample span and a darker bar for the out-of-sample span; the last row
    holds the price line (every 100th point). The earliest window is placed in
    the row directly above the price chart.

    The layout is sized from ``len(windows)``, so the figure stays correct for
    any number of windows rather than only for ``NUM_MOVING_WINDOWS``.

    Args:
        data: DataFrame with ``close_price`` and ``close_time`` columns.
        windows: Non-empty sequence of ``(in_sample, out_of_sample)``
            DataFrames in time order, each with a ``close_time`` column.
    """
    num_windows = len(windows)
    strip_height = 0.05

    fig = make_subplots(
        rows=num_windows + 1,
        cols=1,
        row_heights=[strip_height] * num_windows + [1 - strip_height * num_windows],
        vertical_spacing=0.05,
        shared_xaxes=True)

    for i, (in_sample, out_sample) in enumerate(windows):
        in_start, in_end = in_sample['close_time'].iloc[0], in_sample['close_time'].iloc[-1]
        out_start, out_end = out_sample['close_time'].iloc[0], out_sample['close_time'].iloc[-1]
        fig.add_trace(go.Bar(
            y = [1, 1],
            x = [in_start, out_start],
            marker_color=['lightgrey', 'darkgrey'],
            # Bar widths on a date axis are given in milliseconds.
            width = [
                (in_end - in_start).total_seconds()*1000,
                (out_end - out_start).total_seconds()*1000
            ],
            offset=0
        ), row=num_windows - i, col=1)

    price_row = num_windows + 1
    fig.add_trace(go.Scatter(
        y=data['close_price'].iloc[::100],
        x=data['close_time'].iloc[::100],
        marker_color='black',
    ), row=price_row, col=1)
    # Tick labels only make sense on the price axis.
    fig.update_yaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=True, row=price_row, col=1)
    # Zoom to the span covered by the windows.
    fig.update_xaxes(range=[
        windows[0][0]['close_time'].iloc[0],
        windows[-1][-1]['close_time'].iloc[-1]])
    fig.update_layout(
        title=dict(
            text="Rolling window BTC-USDT",
            x=0.5,
            xanchor='center'
        ),
        showlegend=False,
        plot_bgcolor="rgb(250,250,250)",
    )
    fig.show()

In [8]:
# Window sizes are row counts: months x 30 days x 24 hours x candles per hour
# (a month is approximated as 30 days).
# The windows are cut from the frame trimmed at LIMIT_TIME; the plot receives the untrimmed frame.
windows_5m_data = split_dataset_moving_window(
    trim_number_of_observations(btc_usdt_5_min_data, limit_time=LIMIT_TIME),
    NUM_MOVING_WINDOWS,
    in_sample_size=(24 * 30 * 24 * (60 // 5)), # 24 months
    out_of_sample_size=(6 * 30 * 24 * (60 // 5)) # 6 months
)
plot_moving_windows(btc_usdt_5_min_data, windows_5m_data)

In [7]:
# Window sizes are row counts: months x 30 days x 24 hours x candles per hour
# (a month is approximated as 30 days).
# The windows are cut from the frame trimmed at LIMIT_TIME; the plot receives the untrimmed frame.
windows_15m_data = split_dataset_moving_window(
    trim_number_of_observations(btc_usdt_15_min_data, limit_time=LIMIT_TIME),
    NUM_MOVING_WINDOWS,
    in_sample_size=(24 * 30 * 24 * (60 // 15)), # 24 months
    out_of_sample_size=(6 * 30 * 24 * (60 // 15)) # 6 months
)

plot_moving_windows(btc_usdt_15_min_data, windows_15m_data)

In [13]:
# windows_30m_data = split_dataset_moving_window(
#     trim_number_of_observations(btc_usdt_30_min_data, limit_time=LIMIT_TIME),
#     NUM_MOVING_WINDOWS,
#     in_sample_size=(24 * 30 * 24 * (60 // 30)), # 24 months
#     out_of_sample_size=(6 * 30 * 24 * (60 // 30)) # 6 months
# )

# plot_moving_windows(btc_usdt_30_min_data, windows_30m_data)

## Upload split dataset to wandb

In [11]:
import wandb
import os
from tempfile import TemporaryDirectory

In [12]:
def upload_dataset_to_wandb(data_windows, dataset_name, project='wne-masters-thesis-testing'):
    """Upload all window CSVs of one dataset to Weights & Biases as a single artifact.

    Every window is written to a temporary directory as
    ``<dataset_name>-in-sample-<i>.csv`` and
    ``<dataset_name>-out-of-sample-<i>.csv``; the directory is then logged as
    a ``dataset`` artifact from a run with job type ``upload_dataset``.

    The run is used as a context manager so it is finished before the
    temporary directory is removed and no run is left open when the function
    returns.

    Args:
        data_windows: Iterable of ``(in_sample, out_of_sample)`` DataFrames.
        dataset_name: Artifact name and file-name prefix.
        project: W&B project that receives the run.
    """
    # Materialise once so the window count matches what is written below.
    data_windows = list(data_windows)

    with TemporaryDirectory() as tempdir:
        for i, (in_sample, out_sample) in enumerate(data_windows):
            in_sample.to_csv(os.path.join(tempdir, dataset_name + f'-in-sample-{i}.csv'), index=False)
            out_sample.to_csv(os.path.join(tempdir, dataset_name + f'-out-of-sample-{i}.csv'), index=False)

        with wandb.init(
                project=project,
                job_type='upload_dataset') as run:
            artifact = wandb.Artifact(dataset_name, type="dataset", metadata={
                'name': dataset_name,
                # Count the windows actually uploaded rather than a module constant.
                'num_windows': len(data_windows)
            })
            artifact.add_dir(tempdir)
            run.log_artifact(artifact)


In [16]:
# Target W&B project (same value as the default of upload_dataset_to_wandb).
WANDB_PROJECT = 'wne-masters-thesis-testing'

# One artifact per candle width.
# NOTE(review): windows_30m_data is only assigned in a cell that is commented out above,
# so the third call raises NameError on a fresh kernel until that cell is restored.
upload_dataset_to_wandb(windows_5m_data, 'btc-usdt-5m', project=WANDB_PROJECT)
upload_dataset_to_wandb(windows_15m_data, 'btc-usdt-15m', project=WANDB_PROJECT)
upload_dataset_to_wandb(windows_30m_data, 'btc-usdt-30m', project=WANDB_PROJECT)


[34m[1mwandb[0m: Adding directory to artifact (/var/folders/pz/gkm59rg174z0867wc4h3wd000000gn/T/tmpxkxgri0n)... Done. 0.6s


## Test if time series dataset can be correctly constructed

In [105]:
# Smoke test: every in-sample and out-of-sample part must be accepted by
# build_time_series_dataset with a minimal configuration. Return values are discarded.
# NOTE(review): presumably the call raises when a part cannot be turned into a dataset -
# confirm against ml.data.
from ml.data import build_time_series_dataset
# Minimal field mapping: one target, one group column, one unknown real-valued input.
config = {
        'data': {
            'fields': {
                'time_index': 'time_index',
                'target': 'close_price',
                'group_ids': ['group_id'],
                'static_real': [],
                'static_cat': [],
                'dynamic_cat': [],
                'dynamic_known_real': [],
                'dynamic_known_cat': [],
                'dynamic_unknown_real': ['returns'],
                'dynamic_unknown_cat': []
            }
        },
        'past_window': 2,
        'future_window': 2,
    }

logging.info("Validating if time series dataset can be constructed from 5m data.")
for in_sample, out_of_sample in windows_5m_data:
    build_time_series_dataset(config, in_sample)
    build_time_series_dataset(config, out_of_sample)

logging.info("Validating if time series dataset can be constructed from 15m data.")
for in_sample, out_of_sample in windows_15m_data:
    build_time_series_dataset(config, in_sample)
    build_time_series_dataset(config, out_of_sample)

# NOTE(review): windows_30m_data is only assigned in a cell that is commented out above;
# on a fresh kernel the loop below raises NameError after the 5m and 15m checks have run.
logging.info("Validating if time series dataset can be constructed from 30m data.")
for in_sample, out_of_sample in windows_30m_data:
    build_time_series_dataset(config, in_sample)
    build_time_series_dataset(config, out_of_sample)

INFO:root:Validating if time series dataset can be constructed from 5m data.
INFO:root:Validating if time series dataset can be constructed from 15m data.
INFO:root:Validating if time series dataset can be constructed from 30m data.


In [8]:
# Scratch check: length of the span between the LOW_TIME and LIMIT_TIME values used above.
pd.Timestamp(year=2024, month=7, day=25, hour=23, minute=59) - pd.Timestamp(year=2019, month=7, day=21, hour=0, minute=0)

Timedelta('1831 days 23:59:00')

In [9]:
# Scratch check: the span above (rounded up to 1832 days) expressed in 365-day years.
1832 / 365

5.019178082191781

In [10]:
# NOTE(review): scratch arithmetic over the same 1832-day span; what 4.2 stands for is not
# shown anywhere in this notebook - TODO document or remove.
4.2 / 1832

0.002292576419213974