from __future__ import annotations

import copy
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, Optional, cast

import numpy as np
import pandas as pd

from cvttpy_tools.config import Config


@dataclass
class DataWindowParams:
    training_size_: int
    training_start_index_: int


class ModelDataPolicy(ABC):
    config_: Config
    current_data_params_: DataWindowParams
    count_: int
    is_real_time_: bool

    def __init__(self, config: Config, *args: Any, **kwargs: Any):
        self.config_ = config
        self.current_data_params_ = DataWindowParams(
            training_size_=config.get_value("training_size", 120),
            training_start_index_=0,
        )
        self.count_ = 0
        self.is_real_time_ = kwargs.get("is_real_time", False)

    @abstractmethod
    def advance(self, mkt_data_df: Optional[pd.DataFrame] = None) -> DataWindowParams:
        self.count_ += 1
        print(self.count_, end="\r")
        return self.current_data_params_

    @staticmethod
    def create(config: Config, *args: Any, **kwargs: Any) -> ModelDataPolicy:
        import importlib

        model_data_policy_class_name = config.get_value("model_data_policy_class", None)
        assert model_data_policy_class_name is not None
        module_name, class_name = model_data_policy_class_name.rsplit(".", 1)
        module = importlib.import_module(module_name)
        model_training_data_policy_object = getattr(module, class_name)(
            config=config, *args, **kwargs
        )
        return cast(ModelDataPolicy, model_training_data_policy_object)


class RollingWindowDataPolicy(ModelDataPolicy):
    # Fixed-size window that slides forward by one observation per advance().

    def __init__(self, config: Config, *args: Any, **kwargs: Any):
        super().__init__(config, *args, **kwargs)
        self.count_ = 1

    def advance(self, mkt_data_df: Optional[pd.DataFrame] = None) -> DataWindowParams:
        super().advance(mkt_data_df)
        if self.is_real_time_:
            self.current_data_params_.training_start_index_ = (
                -self.current_data_params_.training_size_
            )
        else:
            self.current_data_params_.training_start_index_ += 1
        return self.current_data_params_


class OptimizedWndDataPolicy(ModelDataPolicy, ABC):
    mkt_data_df_: pd.DataFrame
    pair_: TradingPair  # type: ignore  # TradingPair is imported lazily in __init__
    min_training_size_: int
    max_training_size_: int
    end_index_: int
    prices_a_: np.ndarray
    prices_b_: np.ndarray

    def __init__(self, config: Config, *args: Any, **kwargs: Any):
        super().__init__(config, *args, **kwargs)
        assert kwargs.get("pair") is not None, "pair must be provided"
        assert (
            "min_training_size" in config.data() and "max_training_size" in config.data()
        ), "min_training_size and max_training_size must be provided"
        self.min_training_size_ = cast(int, config.get_value("min_training_size"))
        self.max_training_size_ = cast(int, config.get_value("max_training_size"))

        from pairs_trading.lib.pt_strategy.trading_pair import TradingPair

        self.pair_ = cast(TradingPair, kwargs.get("pair"))
        if "mkt_data" in kwargs:
            self.mkt_data_df_ = cast(pd.DataFrame, kwargs.get("mkt_data"))
            col_a, col_b = self.pair_.colnames()
            self.prices_a_ = np.array(self.mkt_data_df_[col_a])
            self.prices_b_ = np.array(self.mkt_data_df_[col_b])
        assert self.min_training_size_ < self.max_training_size_

    def advance(self, mkt_data_df: Optional[pd.DataFrame] = None) -> DataWindowParams:
        super().advance(mkt_data_df)
        if mkt_data_df is not None:
            self.mkt_data_df_ = mkt_data_df
        if self.is_real_time_:
            self.end_index_ = len(self.mkt_data_df_) - 1
        else:
            self.end_index_ = (
                self.current_data_params_.training_start_index_ + self.max_training_size_
            )
            if self.end_index_ > len(self.mkt_data_df_) - 1:
                self.end_index_ = len(self.mkt_data_df_) - 1
        self.current_data_params_.training_start_index_ = (
            self.end_index_ - self.max_training_size_
        )
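        # Clamp the derived start index so the widest (max_training_size_)
        # candidate window never reaches before the first available bar.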
        if self.current_data_params_.training_start_index_ < 0:
            self.current_data_params_.training_start_index_ = 0
        col_a, col_b = self.pair_.colnames()
        self.prices_a_ = np.array(self.mkt_data_df_[col_a])
        self.prices_b_ = np.array(self.mkt_data_df_[col_b])
        self.current_data_params_ = self.optimize_window_size()
        return self.current_data_params_

    @abstractmethod
    def optimize_window_size(self) -> DataWindowParams:
        ...


class EGOptimizedWndDataPolicy(OptimizedWndDataPolicy):
    """
    Engle-Granger cointegration test.

    *** VERY SLOW ***
    """

    def __init__(self, config: Config, *args: Any, **kwargs: Any):
        super().__init__(config, *args, **kwargs)

    def optimize_window_size(self) -> DataWindowParams:
        # Run the Engle-Granger cointegration test over every candidate window
        # size and keep the window with the lowest p-value.
        from statsmodels.tsa.stattools import coint  # type: ignore

        last_pvalue = 1.0
        result = copy.copy(self.current_data_params_)
        for trn_size in range(self.min_training_size_, self.max_training_size_):
            if self.end_index_ - trn_size < 0:
                break
            start_index = self.end_index_ - trn_size
            series_a = self.prices_a_[start_index : self.end_index_]
            series_b = self.prices_b_[start_index : self.end_index_]
            eg_pvalue = float(coint(series_a, series_b)[1])
            if eg_pvalue < last_pvalue:
                last_pvalue = eg_pvalue
                result.training_size_ = trn_size
                result.training_start_index_ = start_index
        # print(
        #     f"*** DEBUG *** end_index={self.end_index_}, best_trn_size={result.training_size_}, {last_pvalue=}"
        # )
        return result


class ADFOptimizedWndDataPolicy(OptimizedWndDataPolicy):
    # Augmented Dickey-Fuller test on the residuals of an OLS hedge regression.

    def __init__(self, config: Config, *args: Any, **kwargs: Any):
        super().__init__(config, *args, **kwargs)

    def optimize_window_size(self) -> DataWindowParams:
        from statsmodels.regression.linear_model import OLS
        from statsmodels.tools.tools import add_constant
        from statsmodels.tsa.stattools import adfuller

        last_pvalue = 1.0
        result = copy.copy(self.current_data_params_)
        for trn_size in range(self.min_training_size_, self.max_training_size_):
            if self.end_index_ - trn_size < 0:
                break
            start_index = self.end_index_ - trn_size
            y = self.prices_a_[start_index : self.end_index_]
            x = self.prices_b_[start_index : self.end_index_]
            # Add a constant to x for the intercept
            x_with_const = add_constant(x)
            # OLS regression: y = a + b*x + e
            model = OLS(y, x_with_const).fit()
            residuals = y - model.predict(x_with_const)
            # ADF test on the residuals
            try:
                adf_result = adfuller(residuals, maxlag=1, regression="c")
                adf_pvalue = float(adf_result[1])
            except Exception:
                # Handle edge cases where adfuller fails (e.g., constant residual series).
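                # A fallback p-value of 1.0 means this window is never selected.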
                adf_pvalue = 1.0
            if adf_pvalue < last_pvalue:
                last_pvalue = adf_pvalue
                result.training_size_ = trn_size
                result.training_start_index_ = start_index
        # print(
        #     f"*** DEBUG *** end_index={self.end_index_},"
        #     f" best_trn_size={result.training_size_},"
        #     f" {last_pvalue=}"
        # )
        return result


class JohansenOptdWndDataPolicy(OptimizedWndDataPolicy):
    # Johansen cointegration test

    def __init__(self, config: Config, *args: Any, **kwargs: Any):
        super().__init__(config, *args, **kwargs)

    def optimize_window_size(self) -> DataWindowParams:
        from statsmodels.tsa.vector_ar.vecm import coint_johansen

        best_stat = -np.inf
        best_trn_size = 0
        best_start_index = -1
        result = copy.copy(self.current_data_params_)
        for trn_size in range(self.min_training_size_, self.max_training_size_):
            if self.end_index_ - trn_size < 0:
                break
            start_index = self.end_index_ - trn_size
            series_a = self.prices_a_[start_index : self.end_index_]
            series_b = self.prices_b_[start_index : self.end_index_]
            # Combine the two series into a 2D matrix for the Johansen test
            try:
                data = np.column_stack([series_a, series_b])
                # Johansen test: det_order=0 (no deterministic trend), k_ar_diff=1 (lag)
                res = coint_johansen(data, det_order=0, k_ar_diff=1)
                # Trace statistic for rank=0 vs rank>=1
                trace_stat = res.lr1[0]
                critical_value = res.cvt[0, 1]  # 5% critical value (currently unused)
                if trace_stat > best_stat:
                    best_stat = trace_stat
                    best_trn_size = trn_size
                    best_start_index = start_index
            except Exception:
                continue
        if best_trn_size > 0:
            result.training_size_ = best_trn_size
            result.training_start_index_ = best_start_index
        else:
            print("*** WARNING: No valid cointegration window found.")
        # print(
        #     f"*** DEBUG *** end_index={self.end_index_}, best_trn_size={best_trn_size}, trace_stat={best_stat}"
        # )
        return result
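

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the module API). It shows how a
# policy can be built via ModelDataPolicy.create() and stepped through a
# backtest loop. `_DemoConfig` is a hypothetical stand-in that only mimics the
# two Config accessors used above (`get_value` and `data`); a real run would
# pass the project's own Config (and, for the optimized policies, a
# TradingPair plus market data via the `pair` / `mkt_data` kwargs).
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _DemoConfig:
        """Hypothetical stand-in mimicking the Config accessors used above."""

        def __init__(self, values: Dict[str, Any]):
            self.values_ = values

        def get_value(self, key: str, default: Any = None) -> Any:
            return self.values_.get(key, default)

        def data(self) -> Dict[str, Any]:
            return self.values_

    demo_config = _DemoConfig(
        {
            "training_size": 60,
            "model_data_policy_class": f"{__name__}.RollingWindowDataPolicy",
        }
    )
    policy = ModelDataPolicy.create(cast(Config, demo_config))
    for _ in range(5):
        params = policy.advance()
    print(f"window size={params.training_size_}, start={params.training_start_index_}")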