In [2]:
import wandb
import os
import pandas as pd
import plotly.graph_objs as go
from pytorch_forecasting.models.temporal_fusion_transformer import TemporalFusionTransformer
from pytorch_forecasting.data.timeseries import TimeSeriesDataSet
import numpy as np
import torch

  from tqdm.autonotebook import tqdm


In [3]:
def get_dataset_window(run, window=None):
    """Download the run's dataset artifact and load one sliding window from it.

    The artifact is resolved as ``"<run.project>/<run.config['data']['dataset']>"``
    and downloaded locally. Two CSV files are then read from the download
    directory, named ``"<name>-in-sample-<window>.csv"`` and
    ``"<name>-out-of-sample-<window>.csv"``, where ``<name>`` comes from the
    artifact metadata.

    Args:
        run: W&B run object exposing ``project`` and ``config``.
        window: Identifier of the sliding window to load. When ``None`` the
            value of ``run.config['data']['sliding_window']`` is used.

    Returns:
        Tuple ``(in_sample_data, out_of_sample_data)`` of pandas DataFrames.
    """
    artifact_name = f"{run.project}/{run.config['data']['dataset']}"
    artifact = wandb.Api().artifact(artifact_name)
    base_path = artifact.download()

    name = artifact.metadata['name']

    # Compare against None instead of relying on truthiness: an explicitly
    # requested window of 0 must not be replaced by the configured default.
    if window is None:
        window = run.config['data']['sliding_window']

    def _read_part(part):
        # Both parts share the same "<name>-<part>-<window>.csv" naming scheme.
        return pd.read_csv(os.path.join(base_path, f"{name}-{part}-{window}.csv"))

    in_sample_data = _read_part('in-sample')
    out_of_sample_data = _read_part('out-of-sample')

    return in_sample_data, out_of_sample_data

def get_train_validation_split(config, in_sample_data):
    """Split the in-sample frame by row position into train and validation parts.

    The first ``int(len(in_sample_data) * (1 - validation))`` rows form the
    training part. The validation part starts ``config['past_window']`` rows
    before the end of the training part, so it overlaps the tail of the
    training data, and runs to the end of the frame.

    Args:
        config: Mapping with ``config['data']['validation']`` (fraction of rows
            reserved for validation) and ``config['past_window']`` (number of
            trailing training rows to prepend to the validation part).
        in_sample_data: DataFrame to split.

    Returns:
        Tuple ``(train_data, val_data)`` of DataFrame slices.
    """
    validation_part = config['data']['validation']
    train_data = in_sample_data.iloc[:int(len(in_sample_data) * (1 - validation_part))]

    # Clamp at 0: when past_window exceeds the number of training rows the raw
    # offset is negative, and iloc would then count from the END of the frame,
    # silently dropping validation rows instead of keeping everything.
    val_start = max(len(train_data) - config['past_window'], 0)
    val_data = in_sample_data.iloc[val_start:]

    return train_data, val_data

def build_time_series_dataset(config, data):
    """Wrap a DataFrame in a ``TimeSeriesDataSet`` configured from ``config``.

    A copy of ``data`` is used so the caller's frame is left untouched; on that
    copy the ``weekday`` and ``hour`` columns are cast to strings. Encoder and
    prediction lengths are fixed (min == max) to ``config['past_window']`` and
    ``config['future_window']``, and the column roles are taken from
    ``config['data']['fields']``.

    Args:
        config: Run configuration mapping (see the keys read below).
        data: DataFrame containing at least the ``weekday`` and ``hour`` columns.

    Returns:
        The constructed ``TimeSeriesDataSet``.
    """
    frame = data.copy()
    for column in ('weekday', 'hour'):
        frame[column] = frame[column].astype('str')

    fields = config['data']['fields']
    encoder_length = config['past_window']
    prediction_length = config['future_window']

    return TimeSeriesDataSet(
        frame,
        time_idx=fields['time_index'],
        target=fields['target'],
        group_ids=fields['group_ids'],
        # Same value for min and max: every sample uses a fixed-length window.
        min_encoder_length=encoder_length,
        max_encoder_length=encoder_length,
        min_prediction_length=prediction_length,
        max_prediction_length=prediction_length,
        static_reals=fields['static_real'],
        static_categoricals=fields['static_cat'],
        time_varying_known_reals=fields['dynamic_known_real'],
        time_varying_known_categoricals=fields['dynamic_known_cat'],
        time_varying_unknown_reals=fields['dynamic_unknown_real'],
        time_varying_unknown_categoricals=fields['dynamic_unknown_cat'],
        randomize_length=False,
    )

In [4]:
# TODO: Maybe save all on cpu
def get_model(run):
    """Load the ``:best`` model checkpoint logged for a W&B run.

    The model class is selected from ``run.config['model']['name']``; the
    checkpoint is the file of the artifact
    ``"<run.project>/model-<run.id>:best"``.

    Args:
        run: W&B run object exposing ``project``, ``id`` and ``config``.

    Returns:
        The model restored via ``load_from_checkpoint``.

    Raises:
        ValueError: If the configured model name is not supported. Raised
            before any W&B API call is made.
    """
    model_name = run.config['model']['name']

    # Validate first so an unsupported name fails immediately, without an
    # artifact lookup, and report which name was rejected.
    if model_name != 'TemporalFusionTransformer':
        raise ValueError(f"Invalid model name: {model_name!r}")

    model_path = f"{run.project}/model-{run.id}:best"
    model_artifact = wandb.Api().artifact(model_path)

    return TemporalFusionTransformer.load_from_checkpoint(model_artifact.file())

In [5]:
# Path of the W&B run analysed in this notebook, and the project name.
RUN_ID = "filipstefaniuk/wne-masters-thesis-testing/r1gtdlsf"
PROJECT = "wne-masters-thesis-testing"  # NOTE(review): not referenced by any visible cell
# Run handle from the W&B public API; later cells read `run.config`,
# `run.project` and `run.id` from it.
run = wandb.Api().run(RUN_ID)

In [6]:
# Load the in-sample frame for the run's configured sliding window; the
# out-of-sample frame is discarded here.
in_sample, _ = get_dataset_window(run)
# Positional split; the validation part also includes the last `past_window`
# training rows.
train_data, valid_data = get_train_validation_split(run.config, in_sample)
# NOTE(review): `valid` is built independently of `train`, so anything the
# dataset fits on its input is fitted on validation rows — confirm this
# matches how the model was trained. `train` is not used by any visible cell.
train = build_time_series_dataset(run.config, train_data)
valid = build_time_series_dataset(run.config, valid_data)
# Restore the run's ":best" checkpoint.
model = get_model(run)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Downloading large artifact btc-usdt-5m:latest, 310.03MB. 10 files... 
[34m[1mwandb[0m:   10 of 10 files downloaded.  
Done. 0:0:0.5
/usr/local/anaconda3/envs/wnemsc/lib/python3.9/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/usr/local/anaconda3/envs/wnemsc/lib/python3.9/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.


In [7]:
# Predict over the validation dataset in batches of 64.
# mode="raw" with return_index=True yields an object exposing `.output` and
# `.index`, both of which are consumed by the evaluation cell.
# 'logger': False is forwarded to the trainer used for prediction
# (presumably to keep this inference pass from being logged — confirm).
predictions = model.predict(
        valid.to_dataloader(train=False, batch_size=64),
        mode="raw",
        return_index=True,
        trainer_kwargs={
            'logger': False
        })

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


/usr/local/anaconda3/envs/wnemsc/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


In [10]:
from strategy.evaluation import evaluate_strategy
from strategy.strategy import BuyAndHoldStrategy, ModelReturnsPredictionStrategy

# One row per prediction: the index frame returned by `predict`, plus the
# predicted value.
model_predictions = pd.DataFrame(predictions.index)
# NOTE(review): reshape(-1) flattens every prediction dimension; this lines up
# with the index only if there is exactly one value per index row — confirm.
model_predictions['prediction'] = predictions.output.prediction.reshape(-1).numpy()

# Evaluate both strategies on the same validation frame so they are comparable.
result_baseline = evaluate_strategy(valid_data, BuyAndHoldStrategy())
result_strategy = evaluate_strategy(valid_data, ModelReturnsPredictionStrategy(model_predictions))

print('Baseline returns', result_baseline['total_return'])
print('Strategy returns', result_strategy['total_return'])

# Name each trace and label the figure: without names the two curves are only
# distinguishable as "trace 0" / "trace 1".
go.Figure(
    data=[
        go.Scatter(y=result_baseline['portfolio_value'], name='Buy and hold (baseline)'),
        go.Scatter(y=result_strategy['portfolio_value'], name='Model returns prediction'),
    ],
    layout=dict(
        title=dict(text='Portfolio value on the validation split'),
        xaxis=dict(title=dict(text='Step')),
        yaxis=dict(title=dict(text='Portfolio value')),
    ),
).show()

Baseline returns 0.23008643692049624
Strategy returns 0.09137899364187074


In [9]:
# Plot the raw model predictions in index order, with a trace name, title and
# axis labels so the figure can be read on its own.
go.Figure(
    data=[go.Scatter(y=model_predictions['prediction'], name='Prediction')],
    layout=dict(
        title=dict(text='Model predictions on the validation split'),
        xaxis=dict(title=dict(text='Prediction index')),
        yaxis=dict(title=dict(text='Predicted value')),
    ),
).show()