diff --git a/gru_sac_predictor/src/trading_pipeline.py b/gru_sac_predictor/src/trading_pipeline.py index 9bfd264b..f7815d1b 100644 --- a/gru_sac_predictor/src/trading_pipeline.py +++ b/gru_sac_predictor/src/trading_pipeline.py @@ -20,6 +20,8 @@ import matplotlib.pyplot as plt import seaborn as sns import torch # Added for SAC weight aggregation from collections import OrderedDict # Added for SAC weight aggregation +import time +import shutil # Determine the project root directory based on the script location # This assumes the script is in src/ and the project root is two levels up @@ -71,9 +73,9 @@ import scipy.stats as st # --- Import edge_filtered_accuracy (Task 6.1/6.2) --- # try: # Ensure both metrics are imported - from .metrics import edge_filtered_accuracy, calculate_brier_score, _calculate_optimal_edge_threshold + from gru_sac_predictor.src.metrics import edge_filtered_accuracy, calculate_brier_score, _calculate_optimal_edge_threshold except ImportError: - logging.error("Failed to import metrics from .metrics. Validation check will fail.") + logging.error("Failed to import metrics from gru_sac_predictor.src.metrics. Validation check will fail.") # Define placeholders def edge_filtered_accuracy(*args, **kwargs): return np.nan, 0 def calculate_brier_score(*args, **kwargs): return np.nan @@ -81,185 +83,85 @@ except ImportError: # --- End Import --- # # --- End imports for baseline --- # +# --- Import Stage Functions --- # +from gru_sac_predictor.src.pipeline_stages.data_processing import ( + load_and_preprocess, + engineer_features_for_fold, + define_labels_and_align_fold, + split_data_fold +) +from gru_sac_predictor.src.pipeline_stages.feature_processing import ( + scale_features_fold, + select_features_fold, + prune_features_fold # Added import +) +from gru_sac_predictor.src.pipeline_stages.sequence_creation import ( + create_sequences_fold # Added import +) +from gru_sac_predictor.src.pipeline_stages.evaluation import run_baseline_checks_fold # Added baseline check import +from gru_sac_predictor.src.pipeline_stages.evaluation import run_gru_validation_checks_fold # Added validation check import +from gru_sac_predictor.src.pipeline_stages.evaluation import run_backtest_fold # Import the new backtest function +from gru_sac_predictor.src.pipeline_stages.modelling import train_or_load_gru_fold +from gru_sac_predictor.src.pipeline_stages.modelling import calibrate_probabilities_fold +from gru_sac_predictor.src.pipeline_stages.modelling import train_or_load_sac_fold +from gru_sac_predictor.src.pipeline_stages.modelling import aggregate_sac_agents + logger = logging.getLogger(__name__) # Use module-level logger # --- Refactored Label Generation Logic --- # -def _generate_direction_labels(df: pd.DataFrame, config: dict) -> tuple[pd.DataFrame, str]: - """ - Calculates forward returns and generates binary, soft binary, or ternary direction labels. - - Args: - df (pd.DataFrame): DataFrame containing at least a 'close' column and DatetimeIndex. - config (dict): Pipeline configuration dictionary, expecting keys under 'gru' and 'data'. - - Returns: - tuple[pd.DataFrame, str]: - - DataFrame with added forward return and direction label columns. - - Name of the generated direction label column. 
- """ - if 'close' not in df.columns: - raise ValueError("'close' column missing in input DataFrame for label generation.") - - gru_cfg = config.get('gru', {}) - data_cfg = config.get('data', {}) - horizon = gru_cfg.get('prediction_horizon', 5) - use_ternary = gru_cfg.get('use_ternary', False) - - target_ret_col = f'fwd_log_ret_{horizon}' - - # --- Calculate Forward Log Return --- # - shifted_close = df['close'].shift(-horizon) - fwd_returns = np.log(shifted_close / df['close']) - df[target_ret_col] = fwd_returns - - # --- Generate Direction Label (Binary/Soft or Ternary) --- # - if use_ternary: - k = gru_cfg.get('flat_sigma_multiplier', 0.25) - target_dir_col = f'direction_label3_{horizon}' - logging.info(f"Generating ternary labels ({target_dir_col}) with k={k}...") - - sigma_n = fwd_returns.rolling(window=horizon, min_periods=max(1, horizon//2)).std() - eps = k * sigma_n - - conditions = [fwd_returns > eps, fwd_returns < -eps] - choices = [2, 0] # 2=up, 0=down - ordinal_labels = np.select(conditions, choices, default=1).astype(int) # 1=flat - - # --- Log Distribution & Check Balance --- # - # Temporarily add ordinal labels for check, handle NaNs from rolling sigma - df['_ordinal_label_temp'] = ordinal_labels - valid_mask_for_dist = ~np.isnan(eps) & ~np.isnan(fwd_returns) - ordinal_labels_valid = df.loc[valid_mask_for_dist, '_ordinal_label_temp'] - - if not ordinal_labels_valid.empty: - counts = np.bincount(ordinal_labels_valid, minlength=3) - total_valid = len(ordinal_labels_valid) - dist_pct = counts / total_valid * 100 - log_msg = (f"Label dist (n={total_valid}): " - f"Down(0)={dist_pct[0]:.1f}%, Flat(1)={dist_pct[1]:.1f}%, Up(2)={dist_pct[2]:.1f}%") - logging.info(log_msg) - - min_pct_threshold = 10.0 # As per implementation - if any(p < min_pct_threshold for p in dist_pct): - error_msg = f"Label imbalance detected! Min class percentage is {np.min(dist_pct):.1f}% (Threshold: {min_pct_threshold}%). Check data or flat_sigma_multiplier (k={k})." - logging.error(error_msg) - # Consider raising or exiting - currently only logs/prints - print(f"ERROR: {error_msg}") - else: - logging.warning("Could not calculate label distribution (no valid sigma or returns).") - # --- End Distribution Check --- # - - # --- One-hot encode --- # - try: - # Use the valid mask determined earlier - y_cat_full = np.full((len(df), 3), np.nan, dtype=np.float32) - if ordinal_labels_valid.empty: - logging.warning("No valid ordinal labels to one-hot encode.") - else: - y_cat_valid = to_categorical(ordinal_labels_valid, num_classes=3) - y_cat_full[valid_mask_for_dist] = y_cat_valid.astype(np.float32) - - # Assign the list of arrays (or NaNs) - df[target_dir_col] = list(y_cat_full) - - except Exception as e: - logging.error(f"Error during one-hot encoding: {e}", exc_info=True) - raise # Re-raise exception to halt pipeline if encoding fails - finally: - # Clean up temporary column regardless of success/failure - if '_ordinal_label_temp' in df.columns: - df.drop(columns=['_ordinal_label_temp'], inplace=True) - # --- End One-hot Encoding --- # - - else: # Binary / Soft Binary - target_dir_col = f'direction_label_{horizon}' - label_smoothing = data_cfg.get('label_smoothing', 0.0) - if not (0.0 <= label_smoothing < 1.0): - logging.warning(f"Invalid label_smoothing value ({label_smoothing}). Must be in [0.0, 1.0). 
Disabling smoothing.") - label_smoothing = 0.0 - - if label_smoothing > 0.0: - high_label = 1.0 - label_smoothing / 2.0 - low_label = label_smoothing / 2.0 - logging.info(f"Applying label smoothing: {label_smoothing:.2f} -> labels [{low_label:.2f}, {high_label:.2f}] for {target_dir_col}") - df[target_dir_col] = np.where(fwd_returns > 0, high_label, low_label).astype(np.float32) - else: - logging.info(f"Using hard binary labels (0.0 / 1.0) for {target_dir_col}") - df[target_dir_col] = (fwd_returns > 0).astype(np.float32) - - # --- Drop Rows with NaN Targets --- # - initial_rows = len(df) - - # Create mask for NaNs in the direction column (handle scalar or list/array NaNs) - if use_ternary: - # Check if elements are lists AND all values inside are NaN - nan_mask_dir = df[target_dir_col].apply(lambda x: isinstance(x, list) and np.all(np.isnan(x))) - else: - # Standard check for scalar NaN - nan_mask_dir = df[target_dir_col].isna() - - # Combine with NaN check for forward returns - nan_mask_combined = df[target_ret_col].isna() | nan_mask_dir - - df_clean = df[~nan_mask_combined].copy() # Use .copy() to avoid SettingWithCopyWarning later - - final_rows = len(df_clean) - if final_rows < initial_rows: - logging.info(f"Dropped {initial_rows - final_rows} rows due to NaN targets (horizon={horizon}).") - - if df_clean.empty: - logging.error("DataFrame is empty after defining labels and dropping NaNs. Exiting.") - # Returning empty DataFrame, caller should handle exit - return pd.DataFrame(), target_dir_col - - return df_clean, target_dir_col +# [Function _generate_direction_labels removed - Moved to data_processing.py] # --- End Refactored Label Generation --- # class TradingPipeline: """Orchestrates the entire trading strategy pipeline.""" - def __init__(self, config_path: str, cli_args: argparse.Namespace = None, io_manager: Optional[Any] = None): + def __init__(self, config: dict, io_manager: Optional[Any] = None): """ Initialize the pipeline with configuration, optional CLI args, and IOManager. Args: - config_path (str): Path to the configuration file. - cli_args (argparse.Namespace, optional): Parsed command-line arguments. Defaults to None. + config (dict): The loaded configuration dictionary. io_manager (IOManager, optional): Initialized IOManager instance. Defaults to None. """ - self.config_path = config_path - self.config = self._load_config() + # Store the passed config dictionary directly + self.config = config # Run ID and Git SHA should be generated *before* logger/io setup in run.py - # If pipeline is instantiated directly, generate them here. - # TODO: Consider passing run_id and git_sha directly from run.py? + # and passed via the IOManager. if io_manager is None: - # Attempt to generate run_id if not provided via IOManager - try: - from .utils.run_id import make_run_id, get_git_sha - self.run_id = make_run_id() - self.git_sha = get_git_sha(short=False) or "unknown" - except ImportError: - # Fallback if run outside standard structure - self.run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_fallback") - self.git_sha = "unknown" - logger_to_use = logging # Use root logger if no io/logger setup provided + # IOManager is considered essential for proper operation. + # Raise an error or handle appropriately if not provided. + # For now, log critical error and exit, assuming IOManager is required. + # TODO: Decide final handling if IOManager *can* be optional. + logging.critical("IOManager not provided during TradingPipeline initialization. 
Cannot proceed.") + raise ValueError("IOManager instance is required for TradingPipeline.") + # Fallback removed - rely on IOManager + # self.run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S_fallback") + # self.git_sha = "unknown" + # logger_to_use = logging # Use root logger if no io/logger setup provided else: + # --- Retrieve run_id and git_sha FROM io_manager --- + if not hasattr(io_manager, 'run_id') or not io_manager.run_id: + raise ValueError("IOManager instance provided, but does not have a 'run_id' attribute set.") self.run_id = io_manager.run_id - # TODO: Pass git_sha via io_manager or constructor? - # For now, re-fetch it or get from src.__init__? - try: - from . import GIT_SHA - self.git_sha = GIT_SHA - except ImportError: - self.git_sha = "unknown" # Fallback - logger_to_use = logging.getLogger() # Assume logger was set up - - self.io = io_manager or self._setup_io_manual() # Use provided or setup manually - self.pipeline_version = "3.0.0" # Placeholder version - # --- Handle CLI Overrides --- # - # ... (rest of existing override logic) ... + # Assume git_sha is also an attribute of the initialized IOManager + if not hasattr(io_manager, 'git_sha') or not io_manager.git_sha: + # logging.warning("IOManager instance does not have 'git_sha' attribute. Using 'unknown'.") + # self.git_sha = "unknown" # Or raise error if mandatory + # For now, let's assume it's mandatory for traceability + raise ValueError("IOManager instance provided, but does not have a 'git_sha' attribute set.") + self.git_sha = io_manager.git_sha + # --- End Retrieval --- + + logger_to_use = logging.getLogger() # Assume logger was set up by IOManager/run.py + + # self.io is now guaranteed to be an IOManager instance if we proceed past checks + self.io = io_manager + # Store git_sha on self as well, retrieved via IOManager + # Ensure git_sha exists on io_manager (already checked above, but good practice) + self.git_sha = getattr(io_manager, 'git_sha', 'unknown_in_pipeline') + self.pipeline_version = "3.0.0" # Placeholder version # --- Directory Setup (Now handled by IOManager if provided) --- # if self.io: @@ -280,16 +182,34 @@ class TradingPipeline: self._setup_logging_manual() # Fallback logging # --- End Directory Setup --- # + # --- Feature Whitelist Modification --- # + # Add 'bar_imputed' to the minimal whitelist if it's not already there + if 'bar_imputed' not in minimal_whitelist: + minimal_whitelist.append('bar_imputed') + logger.info("Added 'bar_imputed' to minimal_whitelist.") + # --- End Whitelist Modification --- # + # Log Banner (Moved to run.py which has version info) # logger_to_use.info(...) # --- Initialize Components --- # - self.data_loader = DataLoader(self.config) - self.feature_engineer = FeatureEngineer(self.config) - self.calibrator = Calibrator(self.config) # Initialize Calibrator + # Extract db_directory from config before passing to DataLoader + data_cfg = self.config.get('data', {}) + db_directory_path = data_cfg.get('db_dir', 'data/db') # Use 'db_dir' key, provide a default + if not db_directory_path or not isinstance(db_directory_path, str): + # Adjust error message to reflect the correct key 'db_dir' + logger.error(f"Invalid or missing 'db_dir' in config['data']. Found: {db_directory_path}. 
Using default 'data/db'.") + db_directory_path = 'data/db' # Fallback + + self.data_loader = DataLoader(db_dir=db_directory_path) + self.feature_engineer = FeatureEngineer(self.config) + # Extract edge threshold for Calibrator initialization + calibration_cfg = self.config.get('calibration', {}) + initial_edge_threshold = calibration_cfg.get('edge_threshold', 0.1) # Get edge from config + self.calibrator = Calibrator(edge_threshold=initial_edge_threshold) # --- Vector Calibrator (Task 4) --- # if VECTOR_CALIBRATOR_AVAILABLE: - self.vector_calibrator = VectorCalibrator(config=self.config) + self.vector_calibrator = VectorCalibrator() # Initialize without config else: self.vector_calibrator = None # --- End Vector Calibrator --- # @@ -353,6 +273,16 @@ class TradingPipeline: self.use_ternary = self.config.get('gru', {}).get('use_ternary', False) # Cache ternary flag self.aggregated_metrics: Optional[dict] = None # Aggregated metrics across folds self.optimized_edge_threshold: Optional[float] = None # Store optimized edge threshold per fold + # --- Add attributes for baseline filtering --- # + self.fwd_returns_aligned: Optional[pd.Series] = None + self.eps_aligned: Optional[pd.Series] = None + # Attributes for split returns/eps needed by baseline check + self.fwd_ret_train: Optional[pd.Series] = None + self.eps_train: Optional[pd.Series] = None + self.fwd_ret_val: Optional[pd.Series] = None + self.eps_val: Optional[pd.Series] = None + self.y_dir_val_ordinal: Optional[pd.Series] = None # <<< ADDED + # --- End Add --- # # --- End Initialize state variables --- # # Save config handled by run.py via IOManager typically @@ -404,1882 +334,186 @@ class TradingPipeline: # Remove original _setup_directories, _setup_logging, _save_run_config # Remove _generate_run_id (now done externally or via fallback) - def _load_config(self) -> dict: - """Loads the YAML configuration file.""" - try: - # Try loading relative to the script first (if running from src) - if not os.path.isabs(self.config_path): - potential_path = os.path.join(script_dir, self.config_path) - if not os.path.exists(potential_path): - # If not found relative to script, try relative to project root - potential_path = os.path.join(project_root, self.config_path) - if not os.path.exists(potential_path): - # If still not found, try relative to CWD as last resort - potential_path = os.path.abspath(self.config_path) - - if os.path.exists(potential_path): - self.config_path = potential_path - else: - # Try one level up from project root (common structure) - potential_path = os.path.join(os.path.dirname(project_root), 'gru_sac_predictor', 'config.yaml') - if os.path.exists(potential_path): - self.config_path = potential_path - else: - raise FileNotFoundError(f"Config file not found at relative paths, CWD, or common location: {self.config_path}") - - # --- ADDED DEBUGGING --- - logging.info(f"Attempting to load config from resolved path: {self.config_path}") - # --- END DEBUGGING --- - - with open(self.config_path, 'r') as f: - config = yaml.safe_load(f) - - # --- ADDED DEBUGGING --- - if isinstance(config, dict): - logging.info(f"Successfully loaded YAML. Top-level keys found: {list(config.keys())}") - else: - logging.warning(f"YAML loaded, but result is not a dictionary. Type: {type(config)}. 
Content snippet: {str(config)[:200]}") - # --- END DEBUGGING --- - - # Basic validation - if 'data' not in config or 'gru' not in config or 'sac' not in config: - raise ValueError("Config file missing essential sections: data, gru, sac") - # Validate calibration config if present - if 'calibration' in config and 'edge_threshold' not in config['calibration']: - logging.warning("'edge_threshold' not found in calibration config, using default 0.55") - config['calibration']['edge_threshold'] = 0.55 # Add default if missing - elif 'calibration' not in config: - logging.warning("'calibration' section not found in config, using default edge_threshold 0.55") - config['calibration'] = {'edge_threshold': 0.55} # Add default section - - return config - except FileNotFoundError: - print(f"ERROR: Configuration file not found at '{self.config_path}'") - sys.exit(1) - except yaml.YAMLError as e: - print(f"ERROR: Error parsing configuration file '{self.config_path}': {e}") - sys.exit(1) - except Exception as e: - print(f"ERROR: An unexpected error occurred while loading config: {e}") - sys.exit(1) - # --- Internal Pipeline Steps --- def load_and_preprocess_data(self): - """Loads and preprocesses data using DataLoader.""" - logging.info("--- Stage: Loading and Preprocessing Data ---") - # Error handling for data_loader - if self.data_loader is None: - logging.error("DataLoader not initialized. Cannot load data.") - sys.exit(1) - - # Load data and summary - self.df_raw, self.load_summary = self.data_loader.load_data() + """Loads and preprocesses data by calling the stage function.""" + logger.info("--- Calling Stage: Loading and Preprocessing Data ---") + df_raw, load_summary = load_and_preprocess( + data_loader=self.data_loader, + io=self.io, + run_id=self.run_id, + config=self.config + ) + self.df_raw = df_raw + self.load_summary = load_summary + if self.df_raw is None: + logger.error("Data loading stage failed. Exiting pipeline.") + sys.exit(1) # Exit if loading failed - if self.df_raw is None or self.df_raw.empty: - logging.error("Data loading failed or returned empty DataFrame. Exiting.") - sys.exit(1) - - # Calculate memory usage and log info - mem_usage = self.df_raw.memory_usage(deep=True).sum() / (1024**2) - if self.load_summary: - logging.info(f"Data loading summary: {self.load_summary}") - else: - logging.warning("No load summary returned by DataLoader.") - logging.info(f"Loaded data: {self.df_raw.shape[0]} rows, {self.df_raw.shape[1]} columns. 
Memory: {mem_usage:.2f} MB") - logging.info(f"Time range: {self.df_raw.index.min()} to {self.df_raw.index.max()}") - - # --- V3 Output Contract: Stage 1 Artifacts --- - if self.io: - if self.load_summary: - # Add context to summary before saving - save_summary = self.load_summary.copy() # Don't modify original - save_summary['run_id'] = self.run_id - save_summary['timestamp_utc'] = datetime.now(timezone.utc).isoformat() - try: - self.io.save_json( - save_summary, - "preprocess_summary", - section='results', - use_txt=True # Save as .txt as requested - ) - logging.info("Saved preprocessing summary to results//preprocess_summary.txt") - except Exception as e: - logging.error(f"Failed to save preprocessing summary using IOManager: {e}") - else: - logging.warning("Load summary dictionary is None, cannot save preprocess_summary.txt") - - if self.df_raw is not None and not self.df_raw.empty: - try: - self.io.save_df( - self.df_raw.head(20), - "head_preprocessed", - section='results' - ) - logging.info("Saved head of preprocessed data to results//head_preprocessed.{csv/parquet}") - except Exception as e: - logging.error(f"Failed to save head of preprocessed data using IOManager: {e}") - else: - logging.warning("Raw dataframe (df_raw) is None or empty, cannot save head_preprocessed.") - - else: - logging.warning("IOManager not available, skipping saving of Stage 1 artifacts (preprocess_summary, head_preprocessed).") - # --- End V3 Output Contract --- - - # --- V3 Output Contract: Stage 2 Artifact (Label Histogram) --- - if self.io and self.config.get('control', {}).get('generate_plots', True): - logging.info("Generating training label distribution histogram...") - try: - # Get the target directory column name (handle ternary/binary) - horizon = self.config['gru'].get('prediction_horizon', 5) - target_dir_col = f'direction_label3_{horizon}' if self.use_ternary else f'direction_label_{horizon}' - - if target_dir_col not in self.y_train.columns: - logging.error(f"Target column '{target_dir_col}' not found in y_train. Cannot generate label histogram.") - elif self.y_train.empty: - logging.warning("y_train is empty. 
Skipping label histogram.") - else: - # Prepare data for plotting - if self.use_ternary: - # Convert one-hot back to ordinal for counting - labels_ordinal = np.argmax(np.stack(self.y_train[target_dir_col].values), axis=1) - label_counts = pd.Series(labels_ordinal).value_counts().sort_index() - class_names = ['Down (0)', 'Flat (1)', 'Up (2)'] - # Ensure all classes are present, even if count is 0 - label_counts = label_counts.reindex([0, 1, 2], fill_value=0) - title_suffix = f" (ε multiplier k={self.config.get('gru', {}).get('flat_sigma_multiplier', 'N/A')})" - else: # Binary - labels_ordinal = self.y_train[target_dir_col] - label_counts = labels_ordinal.value_counts().sort_index() - # Map 0/1 or smoothed values to names - # Simple approach: Count values close to 0 as Down, close to 1 as Up - down_count = (labels_ordinal < 0.5).sum() - up_count = (labels_ordinal >= 0.5).sum() - label_counts = pd.Series([down_count, up_count], index=[0, 1]) - class_names = ['Down (0)', 'Up (1)'] - title_suffix = "" - - # Get figure settings - fig_dpi = self.config.get('output', {}).get('figure_dpi', 150) - fig_size = self.config.get('output', {}).get('figure_size', [16, 9]) - footer_text = "© GRU-SAC v3" - - plt.style.use('seaborn-v0_8-darkgrid') - fig, ax = plt.subplots(figsize=fig_size) - - bars = ax.bar(class_names, label_counts.values, color=sns.color_palette('viridis', len(class_names))) - - # Add percentages on bars - total_samples = label_counts.sum() - if total_samples > 0: - for bar in bars: - height = bar.get_height() - percentage = f'{(height / total_samples) * 100:.1f}%' - ax.annotate(percentage, - xy=(bar.get_x() + bar.get_width() / 2, height), - xytext=(0, 3), # 3 points vertical offset - textcoords="offset points", - ha='center', va='bottom', fontsize=10) - - ax.set_ylabel('Count', fontsize=12) - ax.set_title(f'Training Set Label Distribution{title_suffix}', fontsize=16) - ax.tick_params(axis='x', rotation=0, labelsize=10) - ax.tick_params(axis='y', labelsize=10) - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - - # Add footer - plt.figtext(0.99, 0.01, footer_text, horizontalalignment='right', - verticalalignment='bottom', fontsize=8, color='gray') - - plt.tight_layout(rect=[0, 0.03, 1, 0.95]) - - # Save figure using IOManager - self.io.save_figure(fig, "label_histogram", section='results') - logging.info("Training label histogram saved.") - plt.close(fig) - - except Exception as e: - logging.error(f"Failed to generate or save training label histogram: {e}", exc_info=True) - elif not self.io: - logging.warning("IOManager not available, skipping training label histogram.") - # --- End V3 Output Contract --- - - def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame: - """Adds features using FeatureEngineer.""" - logging.info("--- Stage: Engineering Features --- ") - if df is None or df.empty: - logging.error("Input DataFrame is empty. 
Cannot engineer features.") - # Return empty DataFrame to indicate failure - return pd.DataFrame() - - # Add base features (cyclical, imbalance, TA) - # Ensure FeatureEngineer.add_base_features accepts df - df_engineered = self.feature_engineer.add_base_features(df.copy()) - - # Drop rows with NaNs potentially introduced by feature engineering - initial_rows = len(df_engineered) - df_engineered.dropna(inplace=True) - if len(df_engineered) < initial_rows: - logging.warning(f"Dropped {initial_rows - len(df_engineered)} rows with NaN values after feature engineering.") + def engineer_features(self, df_input: pd.DataFrame) -> pd.DataFrame: + """Adds features for a fold by calling the stage function.""" + logger.info("--- Calling Stage: Engineering Features --- ") + # Call the refactored function + # --- Added missing io and config arguments --- # + df_engineered = engineer_features_for_fold( + df=df_input, + feature_engineer=self.feature_engineer, + io=self.io, # Pass the IOManager instance + config=self.config, # Pass the pipeline config + target_col=self.target_columns[0] if hasattr(self, 'target_columns') and self.target_columns else None # Pass target for sorting corr plot + ) + # --- End argument addition --- # + # Return the result (no need to store on self.df_engineered_full here, + # the execute method will handle passing it to the next stage) if df_engineered.empty: - logging.error("DataFrame is empty after feature engineering and NaN removal.") - # Return empty DataFrame - return pd.DataFrame() + logger.error("Feature engineering stage failed or resulted in empty DataFrame.") + # Depending on context (fold vs full), might need sys.exit or return empty + # For now, return the empty df, caller should handle. - logging.info(f"Feature engineering complete for this fold. Shape: {df_engineered.shape}") return df_engineered def define_labels_and_align(self, df_engineered: pd.DataFrame) -> Tuple[pd.DataFrame, str, List[str]]: - """Defines prediction labels (returns, direction) and aligns with features for a given DataFrame.""" - logging.info("--- Stage: Defining Labels and Aligning for Fold --- ") - if df_engineered is None or df_engineered.empty: - logging.error("Engineered data (DataFrame) is empty. 
Cannot define labels.") - # Return empty tuple to signal failure - return pd.DataFrame(), "", [] - - # --- Call the refactored label generation function --- # - try: - # Pass the df_engineered directly - df_labeled_aligned, target_dir_col = _generate_direction_labels( - df_engineered.copy(), # Pass a copy - self.config - ) - except Exception as e: - logging.error(f"Label generation failed for fold: {e}.", exc_info=True) - # Return empty tuple - return pd.DataFrame(), "", [] + """Defines prediction labels and aligns features for a fold by calling the stage function.""" + logger.info("--- Calling Stage: Defining Labels and Aligning --- ") + # Call the refactored function from data_processing module + # --- Updated call to capture fwd_returns and eps --- # + df_labeled_aligned, target_dir_col, target_cols, fwd_returns_aligned, eps_aligned = define_labels_and_align_fold( + df_engineered=df_engineered, # Pass the engineered data for the fold + config=self.config + ) + # --- End Updated call --- # + # Check for failure if df_labeled_aligned.empty: - logging.error("Label generation resulted in an empty DataFrame for fold.") - # Return empty tuple + logger.error("Label definition and alignment stage failed.") + # Return empty tuple to signal failure upstream + # Note: The stage function now returns empty series/None on failure too return pd.DataFrame(), "", [] - # --- End Label Generation Call --- # - - # Separate features (X) and targets (y) - horizon = self.config['gru'].get('prediction_horizon', 5) - target_ret_col = f'fwd_log_ret_{horizon}' - target_cols = [target_ret_col, target_dir_col] - - # Ensure the columns actually exist - if not all(col in df_labeled_aligned.columns for col in target_cols): - logging.error(f"Generated label/return columns ({target_cols}) not found in DataFrame after label generation for fold.") - # Return empty tuple - return pd.DataFrame(), "", [] - - # We don't need to store X_raw_aligned, y_aligned etc. on self here, - # as the split_data method will operate on df_labeled_aligned - # We just need to return the result and the target column names. - logging.info(f"Labels defined and aligned for fold. Shape: {df_labeled_aligned.shape}") - # Return the labeled/aligned df, target dir col, and all target cols + # Store the target column names on self for use in subsequent steps (like split_data) + self.target_dir_col = target_dir_col + self.target_columns = target_cols + # --- Store aligned returns and eps on self --- # + self.fwd_returns_aligned = fwd_returns_aligned + self.eps_aligned = eps_aligned + # --- End Store --- # + + # Return the main results to the caller (execute method) return df_labeled_aligned, target_dir_col, target_cols - # --- Remove plot generation from here, move to end-of-run if needed --- # - # Heatmap generation doesn't make sense per-fold usually. + def split_data(self, df_labeled_aligned_fold: pd.DataFrame, fold_dates: Optional[Tuple] = None): + """Splits data for a fold by calling the stage function.""" + logger.info(f"--- Calling Stage: Splitting Data for Fold {self.current_fold} --- ") - def split_data(self): - """Splits features and targets into train, validation, and test sets chronologically.""" - logging.info("--- Stage: Splitting Data ---") - if self.X_raw_aligned is None or self.y_aligned is None: - logging.error("Aligned features/targets not available for splitting.") - sys.exit(1) - if not isinstance(self.X_raw_aligned.index, pd.DatetimeIndex): - logging.error("Feature index must be DatetimeIndex for chronological split. 
Aborting.") - sys.exit(1) - - split_cfg = self.config['split_ratios'] - train_ratio = split_cfg['train'] - val_ratio = split_cfg['validation'] - test_ratio = round(1.0 - train_ratio - val_ratio, 2) - logger.info(f"Using split ratios: Train={train_ratio:.2f}, Val={val_ratio:.2f}, Test={test_ratio:.2f}") - - total_len = len(self.X_raw_aligned) - train_end_idx = int(total_len * train_ratio) - val_end_idx = int(total_len * (train_ratio + val_ratio)) - - # Split features - self.X_train_raw = self.X_raw_aligned.iloc[:train_end_idx] - self.X_val_raw = self.X_raw_aligned.iloc[train_end_idx:val_end_idx] - self.X_test_raw = self.X_raw_aligned.iloc[val_end_idx:] - - # Split targets - self.y_train = self.y_aligned.iloc[:train_end_idx] - self.y_val = self.y_aligned.iloc[train_end_idx:val_end_idx] - self.y_test = self.y_aligned.iloc[val_end_idx:] - - # Split original engineered dataframe to keep original columns for backtesting/plotting - self.df_train_original = self.df_engineered_full.iloc[:train_end_idx] - self.df_val_original = self.df_engineered_full.iloc[train_end_idx:val_end_idx] - self.df_test_original = self.df_engineered_full.iloc[val_end_idx:] - - # Keep separate handle to direction target for training feature selector - self.y_dir_train = self.y_dir_aligned.iloc[:train_end_idx] - - logging.info(f"Data split complete:") - logging.info(f" Train: X={self.X_train_raw.shape}, y={self.y_train.shape} ({self.X_train_raw.index.min()} to {self.X_train_raw.index.max()})") - logging.info(f" Val: X={self.X_val_raw.shape}, y={self.y_val.shape} ({self.X_val_raw.index.min()} to {self.X_val_raw.index.max()})") - logging.info(f" Test: X={self.X_test_raw.shape}, y={self.y_test.shape} ({self.X_test_raw.index.min()} to {self.X_test_raw.index.max()})") - - if len(self.X_train_raw) == 0 or len(self.X_val_raw) == 0 or len(self.X_test_raw) == 0: - logging.error("One or more data splits are empty. Check data length and split ratios. Aborting.") - sys.exit(1) - - def select_and_prune_features(self): - """Performs feature selection (e.g., VIF, L1) on RAW/ENGINEERED data - and then prunes the SCALED data splits based on the selection.""" - logging.info("--- Stage: Selecting Features (on Raw/Engineered Data) and Pruning Scaled Data ---") - - # --- MODIFIED: Input for SELECTION is now X_*_raw --- # - if self.X_train_raw is None or self.y_dir_train is None: - logging.error("Raw training data (X_train_raw, y_dir_train) not available for feature selection.") - sys.exit(1) - # --- End Modification --- # - - # Perform feature selection using the RAW/ENGINEERED training set - # FeatureEngineer.select_features handles imputation if needed - # Note: VIF should ideally run on unscaled data. 
- logging.info("Selecting features based on raw/engineered training data...") - self.final_whitelist = self.feature_engineer.select_features( - self.X_train_raw, # Use raw/engineered data for selection - self.y_dir_train, + # Call the stage function, passing necessary state stored on self + # Note: This function raises SystemExit on failure, so no need for explicit return check here + # --- Updated call to capture split fwd_ret and eps --- # + ( + X_train_raw, X_val_raw, X_test_raw, + y_train, y_val, y_test, + df_train_original, df_val_original, df_test_original, + y_dir_train_ordinal, + fwd_ret_train, fwd_ret_val, + eps_train, eps_val, + y_dir_val_ordinal # <<< ADDED + ) = split_data_fold( + df_labeled_aligned=df_labeled_aligned_fold, + fwd_returns_aligned=self.fwd_returns_aligned, # Pass the aligned series + eps_aligned=self.eps_aligned, # Pass the aligned series + config=self.config, + target_columns=self.target_columns, + target_dir_col=self.target_dir_col, + fold_dates=fold_dates, + current_fold=self.current_fold ) - - # --- Save the final whitelist using IOManager (V3 Output Contract) --- # - if self.io: - try: - # Note: IOManager save_json doesn't directly support indent, saves minified - self.io.save_json( - self.final_whitelist, - f'final_whitelist', # Name for IOManager path construction - section='models', # Save under models// - # suffix=f'_{self.run_id}.json' # Suffix is auto-added by IOManager if needed - ) - logging.info(f"Saved final feature whitelist ({len(self.final_whitelist)} features) via IOManager to models/{self.run_id}/final_whitelist.json") - except Exception as e: - logging.error(f"Failed to save final feature whitelist using IOManager: {e}", exc_info=True) - else: - logging.warning("IOManager not available, attempting manual save of final_whitelist.json") - # Fallback to original manual save if IOManager is not present - whitelist_save_path = os.path.join(self.current_run_models_dir, f'final_whitelist_{self.run_id}.json') - try: - with open(whitelist_save_path, 'w') as f: - json.dump(self.final_whitelist, f, indent=4) - logging.info(f"Saved final feature whitelist ({len(self.final_whitelist)} features) manually to {whitelist_save_path}") - except Exception as e: - logging.error(f"Manual save of final feature whitelist failed: {e}", exc_info=True) - # --- End Save Update --- # - - # --- MODIFIED: Prune the SCALED data splits using the determined whitelist --- # - if self.X_train_scaled is None or self.X_val_scaled is None or self.X_test_scaled is None: - logging.error("Scaled data splits not available for pruning.") - sys.exit(1) - - logging.info(f"Pruning SCALED feature sets using final whitelist ({len(self.final_whitelist)} features): {self.final_whitelist}") - self.X_train_pruned = self.feature_engineer.prune_features(self.X_train_scaled, self.final_whitelist) - self.X_val_pruned = self.feature_engineer.prune_features(self.X_val_scaled, self.final_whitelist) - self.X_test_pruned = self.feature_engineer.prune_features(self.X_test_scaled, self.final_whitelist) - # --- End Modification --- # - - logging.info(f"Feature shapes after pruning scaled data: Train={self.X_train_pruned.shape}, Val={self.X_val_pruned.shape}, Test={self.X_test_pruned.shape}") - - # Verification and empty checks remain the same, using X_*_pruned - if not (self.X_train_pruned.columns.equals(self.X_val_pruned.columns) and - self.X_train_pruned.columns.equals(self.X_test_pruned.columns)): - logging.error("Column mismatch between pruned data splits. 
Check pruning logic.") - sys.exit(1) - - if self.X_train_pruned.empty or self.X_val_pruned.empty or self.X_test_pruned.empty: - logging.error("One or more feature splits are empty after pruning. Exiting.") - sys.exit(1) - - def scale_features(self): - """Scales features using StandardScaler fitted on the training set.""" - logging.info("--- Stage: Scaling Features ---") - # --- Input remains X_*_raw --- # - if self.X_train_raw is None or self.X_val_raw is None or self.X_test_raw is None: - logging.error("Raw feature sets (X_train_raw, etc.) not available for scaling.") - sys.exit(1) - # --- End Input --- # - - # Scaler saving path remains the same - # scaler_path = os.path.join(self.current_run_models_dir, f'feature_scaler_{self.run_id}.joblib') - - # Ensure we only scale numeric columns from the RAW training data - numeric_cols = self.X_train_raw.select_dtypes(include=np.number).columns - if len(numeric_cols) < self.X_train_raw.shape[1]: - non_numeric_cols = self.X_train_raw.select_dtypes(exclude=np.number).columns - logging.warning(f"Non-numeric columns detected in raw features: {non_numeric_cols.tolist()}. These will not be scaled.") - - if not numeric_cols.empty: - # Check if scaler was loaded previously (when loading GRU - this logic needs adjustment) - # If loading GRU, the scaler should have been loaded *before* this step in execute() - if self.scaler is None: - # This path is taken when train_gru=True OR if loading GRU failed to load scaler (which now errors earlier) - logging.info("Fitting StandardScaler on raw training data (numeric columns only)...") - self.scaler = StandardScaler() - self.scaler.fit(self.X_train_raw[numeric_cols]) - - # Save the fitted scaler - scaler_save_path = os.path.join(self.current_run_models_dir, f'feature_scaler_{self.run_id}.joblib') - try: - joblib.dump(self.scaler, scaler_save_path) - logging.info(f"Feature scaler saved to {scaler_save_path}") - except Exception as e: - logging.error(f"Failed to save feature scaler: {e}") - else: - # This path is taken if a scaler was successfully loaded when loading a GRU model - logging.info("Using pre-loaded scaler for feature scaling.") - - # Apply scaling to all splits (numeric columns only) - # Create copies to store scaled data - self.X_train_scaled = self.X_train_raw.copy() - self.X_val_scaled = self.X_val_raw.copy() - self.X_test_scaled = self.X_test_raw.copy() - - self.X_train_scaled[numeric_cols] = self.scaler.transform(self.X_train_raw[numeric_cols]) - self.X_val_scaled[numeric_cols] = self.scaler.transform(self.X_val_raw[numeric_cols]) - self.X_test_scaled[numeric_cols] = self.scaler.transform(self.X_test_raw[numeric_cols]) - logging.info("Features scaled successfully.") - else: - logging.warning("No numeric columns found to scale. Skipping scaling step.") - # If no numeric columns, the scaled data is the same as the raw data - self.X_train_scaled = self.X_train_raw - self.X_val_scaled = self.X_val_raw - self.X_test_scaled = self.X_test_raw - - # --- Remove assignment to X_*_pruned --- # - # Scaled data is now stored in X_*_scaled, pruning happens next. - - def run_baseline_checks(self): - """Runs baseline Logistic Regression check on selected, scaled validation data.""" - logging.info("--- Stage: Baseline Checks (Logistic Regression) ---") - - # Skip if ternary - if self.use_ternary: - logging.warning("Using ternary labels. 
Skipping binary Logistic Regression baseline check.") - return - - # --- MODIFIED: Input is now X_*_pruned (which is selected AND scaled) --- # - if self.X_train_pruned is None or self.y_train is None or \ - self.X_val_pruned is None or self.y_val is None: - logging.error("Pruned/Scaled features or targets not available for baseline check. Skipping.") - return - # --- End Modification --- # - - horizon = self.config['gru'].get('prediction_horizon', 5) - # Get the correct binary direction label column name - target_dir_col = f'direction_label_{horizon}' - - if target_dir_col not in self.y_train.columns or target_dir_col not in self.y_val.columns: - logging.error(f"Target direction column '{target_dir_col}' not found in y_train/y_val. Skipping baseline.") - return - - y_train_dir = self.y_train[target_dir_col] - y_val_dir = self.y_val[target_dir_col] - - # --- Use BaselineChecker --- # - try: - # Run the baseline check using the checker - baseline_report = self.baseline_checker.run_logistic_baseline( - X_train_pruned=self.X_train_pruned, - y_train_dir=y_train_dir, - X_val_pruned=self.X_val_pruned, - y_val_dir=y_val_dir - ) - - # --- Save Baseline Report (V3 Output Contract) --- # - if self.io: - try: - self.io.save_json( - baseline_report, - "baseline1_report", # As per revisions.txt - section='results', - use_txt=True # Save as .txt - ) - logging.info("Saved baseline1_report.txt") - except Exception as e: - logging.error(f"Failed to save baseline1_report using IOManager: {e}") - else: - logging.warning("IOManager not available, skipping saving of baseline1_report.txt") - # --- End Save --- # - - # --- Success Criteria Check (V3) --- # - ci_lower_bound = baseline_report.get("ci_lower_bound") - required_ci_lb = 0.52 # From revisions.txt - - if ci_lower_bound is None or np.isnan(ci_lower_bound): - logging.error("Baseline check FAILED: Could not determine CI lower bound. Aborting.") - print(f"\n{'*'*80}\nBASELINE CHECK FAILED: CI lower bound is NaN.\nAborting pipeline.\n{'*'*80}\n") - sys.exit("Baseline CI lower bound calculation failed.") - elif ci_lower_bound < required_ci_lb: - error_msg = f"BASELINE CHECK FAILED: Logistic Regression 95% CI lower bound ({ci_lower_bound:.3f}) is below {required_ci_lb} threshold." - logging.error(error_msg) - print(f"\n{'*'*80}\n{error_msg}\nConsider revising features or data.\nAborting pipeline.\n{'*'*80}\n") - sys.exit(f"Baseline edge too low (< {required_ci_lb} CI lower). Aborting pipeline.") - else: - success_msg = f"Baseline check passed! 
Logistic hit-rate 95%-CI lower bound: {ci_lower_bound:.3f} (>= {required_ci_lb})" - logging.info(success_msg) - print(f"\n{'='*80}\n{success_msg}\nProceeding with pipeline.\n{'='*80}\n") - # --- End Success Criteria Check --- # - - except Exception as e: - logging.error(f"An error occurred during baseline checks: {e}", exc_info=True) - # Decide if this should halt the pipeline - # For now, log the error and continue, but the CI check might have failed earlier - - # --- Original baseline logic removed --- # - - def create_sequences(self): - """Creates sequences for GRU input using selected, scaled features.""" - logging.info("--- Stage: Creating Sequences ---") - # --- MODIFIED: Input is now X_*_pruned (which is selected AND scaled) --- # - if self.X_train_pruned is None or self.y_train is None or \ - self.X_val_pruned is None or self.y_val is None or \ - self.X_test_pruned is None or self.y_test is None: - logging.error("Selected/Scaled features or targets not available for sequence creation.") - sys.exit(1) - # --- End Modification --- # - - lookback = self.config['gru'].get('lookback', 60) - horizon = self.config['gru'].get('prediction_horizon', 5) - target_ret_col = f'fwd_log_ret_{horizon}' - target_dir_col = f'direction_label3_{horizon}' if self.use_ternary else f'direction_label_{horizon}' - - logging.info(f"Creating sequences with lookback={lookback}") - - # Helper function remains the same, but gets X_*_pruned as input - def _create_sequences_helper(features_pruned_df, targets_df, lookback, ret_col, dir_col): - # Convert DataFrames to numpy arrays for efficiency - features_np = features_pruned_df.values # Input is already pruned+scaled - # ... (rest of helper remains the same) ... - y_ret_np = targets_df[ret_col].values - if targets_df[dir_col].dtype == 'object': - y_dir_np = np.stack(targets_df[dir_col].values) - else: - y_dir_np = targets_df[dir_col].values - - X, y_ret_seq, y_dir_seq = [], [], [] - target_indices = [] - for i in range(lookback, len(features_np)): - X.append(features_np[i-lookback : i]) - y_ret_seq.append(y_ret_np[i]) - y_dir_seq.append(y_dir_np[i]) - target_indices.append(targets_df.index[i]) - if not X: return None, None, None, None - X_np = np.array(X) - y_ret_seq_np = np.array(y_ret_seq) - y_dir_seq_np = np.array(y_dir_seq) - target_indices_pd = pd.Index(target_indices) - return X_np, y_ret_seq_np, y_dir_seq_np, target_indices_pd - - # Create sequences using X_*_pruned (which are now the final scaled+selected features) - self.X_train_seq, y_ret_train_seq, y_dir_train_seq, self.train_indices = _create_sequences_helper( - self.X_train_pruned, self.y_train, lookback, target_ret_col, target_dir_col - ) - self.X_val_seq, y_ret_val_seq, y_dir_val_seq, self.val_indices = _create_sequences_helper( - self.X_val_pruned, self.y_val, lookback, target_ret_col, target_dir_col - ) - self.X_test_seq, y_ret_test_seq, y_dir_test_seq, self.test_indices = _create_sequences_helper( - self.X_test_pruned, self.y_test, lookback, target_ret_col, target_dir_col - ) - - # Checks and target dict creation remain the same - # ... (rest of function) ... - if self.X_train_seq is None or self.X_val_seq is None: - logger.error(f"Sequence creation resulted in empty train or val arrays. Check lookback ({lookback}) vs split sizes. 
Aborting.") - sys.exit(1) - logging.info(f"Sequence shapes created:") - logging.info(f" Train: X={self.X_train_seq.shape}, y_ret={y_ret_train_seq.shape}, y_dir={y_dir_train_seq.shape}") - logging.info(f" Val: X={self.X_val_seq.shape}, y_ret={y_ret_val_seq.shape}, y_dir={y_dir_val_seq.shape}") - logging.info(f" Test: X={self.X_test_seq.shape if self.X_test_seq is not None else 'None'}, ...") # Shortened log - dir_key = "dir3" if self.use_ternary else "dir" - self.y_train_seq_dict = {"ret": y_ret_train_seq, "gauss_params": y_ret_train_seq, dir_key: y_dir_train_seq} - self.y_val_seq_dict = {"ret": y_ret_val_seq, "gauss_params": y_ret_val_seq, dir_key: y_dir_val_seq} - if y_ret_test_seq is not None and y_dir_test_seq is not None: - self.y_test_seq_dict = {"ret": y_ret_test_seq, "gauss_params": y_ret_test_seq, dir_key: y_dir_test_seq} - else: - self.y_test_seq_dict = None - logging.warning("Test sequences or targets could not be created. Backtesting might fail.") - - def train_or_load_gru(self): - """Trains a new GRU model or loads a pre-trained one using GRUModelHandler.""" - logging.info("--- Stage: Training or Loading GRU Model ---") - gru_cfg = self.config['gru'] - train_gru_flag = self.config['control'].get('train_gru', False) - - if train_gru_flag: - logging.info(f"Attempting to train a new GRU model for run {self.run_id}...") - if self.X_train_seq is None or self.y_train_seq_dict is None or \ - self.X_val_seq is None or self.y_val_seq_dict is None: - logging.error("Sequence data (train/val) not available for GRU training. Exiting.") - sys.exit(1) - - # Check if hyperparameter sweep is enabled - sweep_enabled = self.config.get('hyperparameter_tuning', {}).get('gru', {}).get('sweep_enabled', False) - - if sweep_enabled: - logging.info("Hyperparameter sweep enabled. Running Optuna optimization.") - - try: - # Import the GRUHyperTuner - from gru_sac_predictor.src.gru_hyper_tuner import GRUHyperTuner - - # Create fold directory for tuning results - fold_tuning_dir = os.path.join(self.current_run_models_dir, "hypertuning") - os.makedirs(fold_tuning_dir, exist_ok=True) - - # Initialize hyperparameter tuner - tuner = GRUHyperTuner(self.config, fold_tuning_dir) - - # Run optimization - best_params = tuner.optimize( - X_train=self.X_train_seq, - y_train_dict=self.y_train_seq_dict, - X_val=self.X_val_seq, - y_val_dict=self.y_val_seq_dict - ) - - # Train final model with best parameters - model_handler, history = tuner.train_with_best_params( - X_train=self.X_train_seq, - y_train_dict=self.y_train_seq_dict, - X_val=self.X_val_seq, - y_val_dict=self.y_val_seq_dict - ) - - if model_handler is not None and model_handler.model is not None: - self.gru_model = model_handler.model - self.gru_handler = model_handler # Replace handler with tuned one - self.gru_model_run_id_loaded_from = self.run_id - logging.info("Successfully trained GRU model with optimized hyperparameters.") - - # Save best parameters to the main run directory - best_params_path = os.path.join(self.current_run_models_dir, f'best_gru_params_{self.run_id}.json') - with open(best_params_path, 'w') as f: - json.dump(best_params, f, indent=4) - logging.info(f"Saved best hyperparameters to {best_params_path}") - else: - logging.error("Failed to train GRU model with optimized hyperparameters. Falling back to default parameters.") - # Fall back to default training - sweep_enabled = False - - except ImportError: - logging.error("Failed to import GRUHyperTuner. Make sure Optuna is installed. 
Falling back to default parameters.") - sweep_enabled = False - except Exception as e: - logging.error(f"Hyperparameter optimization failed: {str(e)}. Falling back to default parameters.") - sweep_enabled = False - - # If sweep is not enabled or failed, train with default parameters - if not sweep_enabled: - # Get parameters from config - lookback = gru_cfg.get('lookback', 60) - # Get feature count from scaled data (use shape[2] for sequences) - n_features = self.X_train_seq.shape[2] - epochs = gru_cfg.get('epochs', 25) - batch_size = gru_cfg.get('batch_size', 128) - patience = gru_cfg.get('patience', 5) # Use gru patience from config - - # Train the model - self.gru_model, history = self.gru_handler.train( - X_train=self.X_train_seq, - y_train_dict=self.y_train_seq_dict, - X_val=self.X_val_seq, - y_val_dict=self.y_val_seq_dict, - lookback=lookback, - n_features=n_features, - max_epochs=epochs, - batch_size=batch_size, - patience=patience - ) - - if self.gru_model is None: - logging.error("GRU model training failed. Exiting.") - sys.exit(1) - else: - # Save the newly trained model - saved_path = self.gru_handler.save() # Uses run_id from handler - if saved_path: - logging.info(f"Newly trained GRU model saved to {saved_path}") - else: - logging.warning("Failed to save the newly trained GRU model.") - # Set the loaded ID to the current run ID - self.gru_model_run_id_loaded_from = self.run_id - logging.info(f"Using GRU model trained in current run: {self.run_id}") - - # --- V3 Output Contract: Plot Learning Curve --- # - if self.io and history is not None and self.config.get('control', {}).get('generate_plots', True): - # Infer log dir path based on current models dir - log_dir = os.path.dirname(self.current_run_models_dir).replace('/models', '/logs') - csv_log_path = os.path.join(log_dir, 'gru_history.csv') - if os.path.exists(csv_log_path): - logging.info(f"Plotting learning curve from {csv_log_path}...") - try: - history_df = pd.read_csv(csv_log_path) - - # Determine metric keys (handle v2 vs v3 differences if necessary) - loss_key = 'loss' - val_loss_key = 'val_loss' - acc_key = None - val_acc_key = None - if 'dir3_accuracy' in history_df.columns: # V3 specific? - acc_key = 'dir3_accuracy' - val_acc_key = 'val_dir3_accuracy' - elif 'accuracy' in history_df.columns: # V2 or other? 
- acc_key = 'accuracy' - val_acc_key = 'val_accuracy' - - if acc_key is None: - logging.warning("Could not find a suitable accuracy metric in history CSV for plotting.") - n_panes = 1 # Only plot loss - else: - n_panes = 2 # Plot loss and accuracy - - # Get figure settings - fig_dpi = self.config.get('output', {}).get('figure_dpi', 150) - fig_size = self.config.get('output', {}).get('figure_size', [16, 9]) - footer_text = "© GRU-SAC v3" - - plt.style.use('seaborn-v0_8-darkgrid') - # Adjust figsize height based on panes - adjusted_fig_height = fig_size[1] * (n_panes / 3.0) # Rough scaling - fig, axes = plt.subplots(n_panes, 1, figsize=(fig_size[0], adjusted_fig_height), sharex=True) - - if n_panes == 1: - ax_loss = axes # Single axis - else: - ax_loss, ax_acc = axes # Multiple axes - - epochs = history_df['epoch'] + 1 # epochs are 0-indexed in csv - - # Pane 1: Loss (Log Scale) - ax_loss.plot(epochs, history_df[loss_key], label='Training Loss') - ax_loss.plot(epochs, history_df[val_loss_key], label='Validation Loss') - ax_loss.set_yscale('log') - ax_loss.set_ylabel('Loss (Log Scale)') - ax_loss.legend() - ax_loss.set_title('GRU Model Training Progress', fontsize=16) - ax_loss.grid(True, which="both", ls="--", linewidth=0.5) - - # Pane 2: Accuracy (if available) - if n_panes == 2: - ax_acc.plot(epochs, history_df[acc_key], label=f'Training {acc_key}') - ax_acc.plot(epochs, history_df[val_acc_key], label=f'Validation {val_acc_key}') - ax_acc.set_ylabel('Accuracy') - ax_acc.set_xlabel('Epoch') - ax_acc.legend() - ax_acc.grid(True, which="both", ls="--", linewidth=0.5) - else: - # If only loss pane, set xlabel there - ax_loss.set_xlabel('Epoch') - - # Add vertical line for early stopping epoch if available - if hasattr(history, 'epoch') and len(history.epoch) > 0: - # Early stopping epoch is the number of epochs run - early_stop_epoch = len(history.epoch) - if early_stop_epoch < max_epochs: # Only draw if early stopping occurred - for ax in fig.axes: - ax.axvline(x=early_stop_epoch, color='r', linestyle='--', linewidth=1, label=f'Early Stop @ {early_stop_epoch}') - # Add legend entry to the last plot - fig.axes[-1].legend() - - # Add footer - plt.figtext(0.99, 0.01, footer_text, horizontalalignment='right', - verticalalignment='bottom', fontsize=8, color='gray') - - plt.tight_layout(rect=[0, 0.03, 1, 0.97]) # Adjust layout - - # Save figure using IOManager - self.io.save_figure(fig, "gru_learning_curve", section='results') - logging.info("GRU learning curve plot saved.") - plt.close(fig) - - except FileNotFoundError: - logging.warning(f"GRU history file not found at {csv_log_path}. Cannot plot learning curve.") - except Exception as e: - logging.error(f"Failed to plot GRU learning curve: {e}", exc_info=True) - else: - logging.warning(f"GRU history file not found at {csv_log_path}. Cannot plot learning curve.") - elif not self.io: - logging.warning("IOManager not available, skipping GRU learning curve plot.") - # --- End Plot Learning Curve --- # - - else: # Load pre-trained GRU model - load_run_id = gru_cfg.get('model_load_run_id', None) - if not load_run_id: - logging.error("train_gru is False, but no gru.model_load_run_id specified in config. 
Exiting.") - sys.exit(1) - - logging.info(f"Attempting to load pre-trained GRU model from run ID: {load_run_id}") - # Construct the expected path using the base models directory - model_filename = f'gru_model_{load_run_id}.keras' # Assuming .keras extension - model_path = os.path.join(self.base_models_dir_path, f'run_{load_run_id}', model_filename) - - # Load the model using the handler - self.gru_model = self.gru_handler.load(model_path) - - if self.gru_model is None: - logging.error(f"Failed to load GRU model from path: {model_path}. Exiting.") - sys.exit(1) - else: - self.gru_model_run_id_loaded_from = load_run_id - logging.info(f"Successfully loaded GRU model from run: {load_run_id}") - - # --- Try loading associated scaler --- # - scaler_filename = f'feature_scaler_{load_run_id}.joblib' - # Adjust path: load from the specific run ID's model folder - scaler_load_path = os.path.join(self.base_models_dir_path, load_run_id, scaler_filename) - logging.info(f"Attempting to load associated scaler from: {scaler_load_path}") - if os.path.exists(scaler_load_path): - try: - self.scaler = joblib.load(scaler_load_path) - logging.info("Associated feature scaler loaded successfully.") - # --- Re-apply scaling using the loaded scaler --- # - numeric_cols = self.X_train_pruned.select_dtypes(include=np.number).columns - if not numeric_cols.empty: - # Important: Scale the _pruned data before sequence creation if scaler is loaded - logging.info("Re-scaling features using loaded scaler...") - self.X_train_scaled = self.X_train_pruned.copy() - self.X_val_scaled = self.X_val_pruned.copy() - self.X_test_scaled = self.X_test_pruned.copy() - self.X_train_scaled[numeric_cols] = self.scaler.transform(self.X_train_pruned[numeric_cols]) - self.X_val_scaled[numeric_cols] = self.scaler.transform(self.X_val_pruned[numeric_cols]) - self.X_test_scaled[numeric_cols] = self.scaler.transform(self.X_test_pruned[numeric_cols]) - logging.info("Features re-scaled successfully.") - # Need to recreate sequences after re-scaling! - self.create_sequences() - else: - logging.warning("Loaded scaler, but no numeric columns found in pruned data to re-scale.") - except Exception as e: - logging.error(f"Failed to load or apply associated scaler: {e}. Scaling might be inconsistent. Exiting.") - raise RuntimeError(f"Failed to load or apply scaler '{scaler_load_path}'") from e # Step 1-C: Raise error - else: - # --- Raise error if scaler missing (Step 1-C) --- # - logging.error(f"Associated feature scaler not found at {scaler_load_path} for run {load_run_id}. Cannot proceed. Exiting.") - raise RuntimeError(f"Feature scaler '{scaler_filename}' not found for run {load_run_id} at {scaler_load_path}") - # --- End Scaler Loading/Applying --- # - - # Final check: Ensure a GRU model is loaded/trained - if self.gru_model is None: - logging.error("No GRU model is available after train/load step. Exiting.") - sys.exit(1) - - def calibrate_probabilities(self): - """Calibrates GRU output probabilities and runs validation checks for the current fold.""" - logger.info(f"--- Fold {self.current_fold}: Stage: Calibrating Probabilities & GRU Validation ---") - - # Ensure GRU model and fold's validation data are available - if self.gru_model is None or self.X_val_seq is None or self.y_val_seq_dict is None: - logger.error(f"Fold {self.current_fold}: GRU model or validation sequence data not available. 
Skipping calibration.") - # Store None for calibrated probs to prevent downstream errors - self.p_cal_val = None - self.optimal_T = None - self.vector_cal_params = None - return # Skip the rest of the method - - p_cal_val_to_check = None - y_dir_val_to_check = None - calibration_method = self.config.get('calibration', {}).get('method', 'temperature') - is_ternary_check = self.use_ternary # Use cached flag - dir_key = "dir3" if is_ternary_check else "dir" - - # Local variables to store intermediate results needed for validation check - local_p_raw_val = None - local_y_dir_val_temp = None - local_dir3_logits_val = None - local_y_dir3_val_onehot = None - - # --- Define Fold-Specific Paths --- # - # Use IOManager if available, otherwise fallback to manual path construction - fold_models_dir = self.fold_dirs.get('models') - if not fold_models_dir: - logger.warning(f"Fold {self.current_fold}: Fold-specific models directory not found. Saving calibration params to main run dir.") - fold_models_dir = self.current_run_models_dir # Fallback - - # --- Vector Scaling --- # - if calibration_method == 'vector' and is_ternary_check: - if not self.vector_calibrator: - logger.error(f"Fold {self.current_fold}: VectorCalibrator not available. Cannot perform vector calibration.") - return - try: - local_dir3_logits_val = self.gru_handler.predict_logits(self.X_val_seq) - if local_dir3_logits_val is None: raise ValueError("Failed to get logits") - local_y_dir3_val_onehot = self.y_val_seq_dict.get(dir_key) - if local_y_dir3_val_onehot is None: raise ValueError("Missing 'dir3' in y_val_seq_dict") - - # --- Fit on current fold's validation data --- # - logger.info(f"Fold {self.current_fold}: Fitting Vector Scaling parameters...") - self.vector_calibrator.fit(local_dir3_logits_val, local_y_dir3_val_onehot) - self.vector_cal_params = self.vector_calibrator.optimal_params # Store params on self - - if self.vector_cal_params: - p_cal_val_to_check = self.vector_calibrator.calibrate(local_dir3_logits_val) - y_dir_val_to_check = local_y_dir3_val_onehot - # Save params for current fold - params_save_filename = f'calibration_vector_fold_{self.current_fold}.npy' - params_save_path = os.path.join(fold_models_dir, params_save_filename) - self.vector_calibrator.save_params(params_save_path) - logger.info(f"Fold {self.current_fold}: Saved vector calibration params to {params_save_path}") - else: - logger.warning(f"Fold {self.current_fold}: Vector calibration parameters not found after fitting. 
Cannot perform validation.") - self.vector_cal_params = None # Ensure it's None - except Exception as e: - logger.error(f"Fold {self.current_fold}: Error during vector calibration: {e}", exc_info=True) - self.vector_cal_params = None - - # --- Temperature Scaling --- # - elif calibration_method == 'temperature' and not is_ternary_check: - try: - predictions_val = self.gru_handler.predict(self.X_val_seq) - if predictions_val is None or len(predictions_val) < 3: raise ValueError("Failed to get predictions") - local_p_raw_val = predictions_val[2].flatten() # Assuming 3rd output is binary prob - local_y_dir_val_temp = self.y_val_seq_dict.get(dir_key) - if local_y_dir_val_temp is None: raise ValueError("Missing 'dir' in y_val_seq_dict") - if len(local_p_raw_val) != len(local_y_dir_val_temp): raise ValueError("Length mismatch") - - # --- Fit on current fold's validation data --- # - logger.info(f"Fold {self.current_fold}: Fitting Temperature Scaling parameter...") - self.optimal_T = self.calibrator.optimise_temperature(local_p_raw_val, local_y_dir_val_temp) - self.calibrator.optimal_T = self.optimal_T # Update calibrator instance - - if self.optimal_T is not None: - p_cal_val_to_check = self.calibrator.calibrate(local_p_raw_val) - y_dir_val_to_check = local_y_dir_val_temp - # Save temp for current fold - temp_save_filename = f'calibration_temp_fold_{self.current_fold}.npy' - temp_save_path = os.path.join(fold_models_dir, temp_save_filename) - np.save(temp_save_path, self.optimal_T) - logger.info(f"Fold {self.current_fold}: Saved optimal temperature T={self.optimal_T:.4f} to {temp_save_path}") - else: - logger.warning(f"Fold {self.current_fold}: Optimal temperature not found after fitting. Cannot perform validation.") - self.optimal_T = None # Ensure it's None - except Exception as e: - logger.error(f"Fold {self.current_fold}: Error during temperature calibration: {e}", exc_info=True) - self.optimal_T = None - else: # Covers cases where calibration method is wrong or mismatch with ternary state - logger.warning(f"Fold {self.current_fold}: Calibration method '{calibration_method}' or ternary state mismatch ({is_ternary_check}). 
Skipping GRU validation checks.") - self.optimal_T = None - self.vector_cal_params = None - - # --- Optimize Edge Threshold (after potential calibration) --- # - optimize_edge = self.config.get('calibration', {}).get('optimize_edge_threshold', False) - edge_thr_config = self.config.get('calibration', {}).get('edge_threshold', 0.1) # Default/fallback - self.optimized_edge_threshold = None # Reset for the fold - - if optimize_edge: - logger.info(f"Optimizing edge threshold using Youden's J on validation predictions...") - try: - # Prepare y_true for optimization (needs binary 0/1) - if y_dir_val_to_check is None: - raise ValueError("Cannot optimize edge threshold without valid y_dir_val.") - y_true_for_opt = None - p_cal_for_opt = None - if is_ternary_check: - if p_cal_val_to_check is not None: - # Convert ternary to binary: P(up) vs others - p_cal_for_opt = p_cal_val_to_check[:, -1] # P(up) - y_true_for_opt = (np.argmax(y_dir_val_to_check, axis=1) == 2).astype(int) - else: - raise ValueError("Cannot optimize ternary edge threshold without valid calibrated probabilities.") - else: - # Binary case - if p_cal_val_to_check is not None: - p_cal_for_opt = p_cal_val_to_check - y_true_for_opt = (np.asarray(y_dir_val_to_check) > 0.5).astype(int) - else: - raise ValueError("Cannot optimize binary edge threshold without valid calibrated probabilities.") - - # Perform optimization using the dedicated function from metrics - # Note: This assumes Calibrator.optimize_edge_threshold was removed or is not used here - # Ensure _calculate_optimal_edge_threshold is imported - self.optimized_edge_threshold = _calculate_optimal_edge_threshold(y_true_for_opt, p_cal_for_opt) - - if self.optimized_edge_threshold is not None: - logger.info(f"Optimized edge threshold: {self.optimized_edge_threshold:.4f}") - # Save optimized threshold - thresh_file = f"optimized_edge_threshold_fold_{self.current_fold}.txt" - try: - # Ensure fold_results_dir is defined (should be available from context) - # Assuming fold_results_dir is defined in the outer scope - if self.io and 'fold_results_dir' in locals() and fold_results_dir: - self.io.save_json({'optimized_edge_threshold': self.optimized_edge_threshold}, - thresh_file.replace('.txt',''), # Use filename as key for io - base_dir=fold_results_dir, use_txt=True) - logger.info(f"Saved optimized edge threshold to {os.path.join(fold_results_dir, thresh_file)}") - elif self.io: - # Fallback: Save to the main results directory if fold_results_dir isn't available - # Construct the path for logging clarity - fallback_path = os.path.join(self.io.get_section_path('results'), thresh_file) - self.io.save_json({'optimized_edge_threshold': self.optimized_edge_threshold}, - thresh_file.replace('.txt',''), - section='results', use_txt=True) # Fallback save - logger.info(f"Saved optimized edge threshold to main results dir: {fallback_path}") - else: - logger.warning("IOManager not available, cannot save optimized threshold.") - except NameError: # Specifically catch if fold_results_dir is not defined - logger.warning("fold_results_dir not defined. 
Attempting fallback save to main results dir.") - if self.io: - fallback_path = os.path.join(self.io.get_section_path('results'), thresh_file) - self.io.save_json({'optimized_edge_threshold': self.optimized_edge_threshold}, - thresh_file.replace('.txt',''), - section='results', use_txt=True) - logger.info(f"Saved optimized edge threshold to main results dir: {fallback_path}") - else: - logger.warning("IOManager not available, cannot save optimized threshold.") - except Exception as e: - logger.error(f"Failed to save optimized edge threshold: {e}") - else: - logger.warning("Edge threshold optimization failed or was skipped. Using config default.") - self.optimized_edge_threshold = edge_thr_config # Fallback - - except Exception as e: - logger.error(f"Error during edge threshold optimization: {e}", exc_info=True) - self.optimized_edge_threshold = edge_thr_config # Fallback - else: - # If optimization is disabled, store the config threshold for consistent use - self.optimized_edge_threshold = edge_thr_config - logger.info(f"Using edge threshold from config: {self.optimized_edge_threshold}") - # --- End Optimize Edge Threshold --- # - - # --- Perform GRU Validation Checks using the threshold stored in self.optimized_edge_threshold --- # - if p_cal_val_to_check is not None and y_dir_val_to_check is not None: - self._perform_gru_validation_checks( - p_cal_val=p_cal_val_to_check, - y_dir_val=y_dir_val_to_check, - is_ternary=is_ternary_check - ) - # Note: _perform_gru_validation_checks was already modified to use self.optimized_edge_threshold - else: - logger.warning("Could not perform GRU validation checks due to missing calibrated predictions or labels.") - - # --- Helper for GRU Validation Checks (Replaces edge check) --- # - def _perform_gru_validation_checks(self, p_cal_val, y_dir_val, is_ternary): - """ - Performs GRU validation checks: Edge-Filtered Accuracy and Brier Score. - Logs results and raises SystemExit if checks fail. - - Args: - p_cal_val: Calibrated probabilities on validation set. - For binary: (N,) shape, P(up). - For ternary: (N, 3) shape, [P(down), P(flat), P(up)]. - y_dir_val: True direction labels for validation set. - For binary: (N,) shape, 0/1 (potentially soft). - For ternary: (N, 3) shape, one-hot encoded. - is_ternary (bool): Flag indicating if ternary classification is used. 
- """ - logger.info(f"--- Fold {self.current_fold}: Performing GRU Validation Checks --- ") - - # --- Define thresholds (Consider moving to config) --- # - validation_criteria = self.config.get('validation_gates', {}).get('gru', {}) - edge_check_thr = validation_criteria.get('edge_filtered_acc_ci_lower_threshold', 0.55) - brier_check_thr = validation_criteria.get('brier_score_threshold', 0.19) - min_edge_samples = validation_criteria.get('edge_filtered_min_samples', 30) - # --- End Thresholds --- # - - # --- Determine Edge Threshold --- # - calib_config = self.config.get('calibration', {}) - optimize_edge = calib_config.get('optimize_edge_threshold', False) - edge_thr_config = calib_config.get('edge_threshold', 0.1) # Default/fallback - self.fold_edge_threshold = edge_thr_config # Initialize with config value - - if optimize_edge and not is_ternary: - logger.info(f"Fold {self.current_fold}: Optimizing edge threshold using Youden's J...") - # Use binary y_dir_val (potentially hard labels) and p_cal_val - y_true_for_opt = (y_dir_val > 0.5).astype(int) if not np.all((y_dir_val == 0) | (y_dir_val == 1)) else y_dir_val.astype(int) - self.fold_edge_threshold = _calculate_optimal_edge_threshold(y_true_for_opt, p_cal_val) - logger.info(f"Fold {self.current_fold}: Using optimized edge threshold: {self.fold_edge_threshold:.4f}") - # Save the optimized threshold? Optional - could save to fold results - if self.io: - fold_results_dir = self.fold_dirs.get('results') - if fold_results_dir: - self.io.save_json({'optimized_edge_threshold': self.fold_edge_threshold}, - f'optimized_edge_threshold_fold_{self.current_fold}', - base_dir=fold_results_dir, use_txt=True) - else: - logger.warning(f"Fold {self.current_fold}: Could not save optimized edge threshold, results dir missing.") - elif optimize_edge and is_ternary: - logger.warning(f"Fold {self.current_fold}: Edge threshold optimization requested but not supported for ternary. Using config value: {edge_thr_config:.4f}") - else: - logger.info(f"Fold {self.current_fold}: Using fixed edge threshold from config: {edge_thr_config:.4f}") - # --- End Determine Edge Threshold --- # - - # --- Edge-Filtered Accuracy Check --- # - edge_accuracy = np.nan - n_filtered = 0 - ci_lower = np.nan - passed_edge_acc = False # Default to False - - try: # Wrap calculation in try-except - if is_ternary: - # Use P(up) equivalent for binary check compatibility - p_up_equiv = p_cal_val[:, 2] - y_true_binary_equiv = (np.argmax(y_dir_val, axis=1) == 2).astype(int) - logger.info(f"Fold {self.current_fold}: Performing edge-filtered accuracy check on ternary model using P(up) equivalent.") - edge_accuracy, n_filtered = edge_filtered_accuracy( - y_true=y_true_binary_equiv, - p_cal=p_up_equiv, - thr=self.fold_edge_threshold # Use the determined threshold - ) - else: - # Binary case - edge_accuracy, n_filtered = edge_filtered_accuracy( - y_true=y_dir_val, - p_cal=p_cal_val, - thr=self.fold_edge_threshold # Use the determined threshold - ) - - if not np.isnan(edge_accuracy): - if n_filtered < min_edge_samples: - logger.warning(f"Fold {self.current_fold}: Edge Acc Check: Insufficient samples ({n_filtered} < {min_edge_samples}) meeting edge >= {self.fold_edge_threshold:.2f} for reliable CI. 
Check considered FAIL.") - passed_edge_acc = False # Fail if not enough samples - else: - try: - k_correct = int(round(edge_accuracy * n_filtered)) - ci_lower = st.binomtest(k_correct, n_filtered, p=0.5, alternative='greater').proportion_ci(confidence_level=0.95).low - passed_edge_acc = ci_lower >= edge_check_thr - logger.info(f"Fold {self.current_fold}: Edge Acc Check (edge >= {self.fold_edge_threshold:.2f}): Acc={edge_accuracy:.3f} ({k_correct}/{n_filtered}), 95% CI Lower={ci_lower:.3f} >= {edge_check_thr} -> {'Pass' if passed_edge_acc else 'FAIL'}") - except ValueError as binom_err: - logger.error(f"Fold {self.current_fold}: Edge Acc Check: Error calculating binomial test (k={k_correct}, n={n_filtered}): {binom_err}. Check considered FAIL.") - passed_edge_acc = False # Consider error as failure - else: - logger.error(f"Fold {self.current_fold}: Edge Acc Check: Calculation failed (NaN). Check considered FAIL.") - if n_filtered == 0: - logger.error(f" Reason: No validation samples met the edge threshold >= {self.fold_edge_threshold:.2f}") - passed_edge_acc = False # Consider NaN or 0 samples as failure - except Exception as e: - logger.error(f"Fold {self.current_fold}: Edge Acc Check: Unexpected error during calculation: {e}. Check considered FAIL.", exc_info=True) - passed_edge_acc = False # Consider error as failure - # --- End Edge Accuracy Check --- # - - # --- Brier Score Check (Revision 5) --- # - brier_score = np.nan - passed_brier = True # Default to Pass (will be set False if check runs and fails) - if is_ternary: - logger.warning(f"Fold {self.current_fold}: Brier score check currently only implemented for binary classification. Skipping for ternary.") - # Keep passed_brier = True for ternary to avoid blocking pipeline - else: - # Binary case - passed_brier = False # Reset to False for binary case - try: - brier_score = calculate_brier_score(y_true=y_dir_val, p_cal=p_cal_val) - if not np.isnan(brier_score): - passed_brier = brier_score <= brier_check_thr - logger.info(f"Fold {self.current_fold}: Brier Score Check: Score={brier_score:.4f} <= {brier_check_thr} -> {'Pass' if passed_brier else 'FAIL'}") - else: - logger.error(f"Fold {self.current_fold}: Brier Score Check: Calculation failed (NaN). Check considered FAIL.") - # passed_brier remains False - except Exception as e: - logger.error(f"Fold {self.current_fold}: Brier Score Check: Error calculating Brier score: {e}. Check considered FAIL.", exc_info=True) - # passed_brier remains False - # --- End Brier Score Check --- # - - # --- Final Decision --- # - if not passed_edge_acc or not passed_brier: - error_msg = f"FOLD {self.current_fold} GRU VALIDATION FAILED: Edge Acc Pass={passed_edge_acc} (Req CI>={edge_check_thr}), Brier Pass={passed_brier} (Req Score<={brier_check_thr}). Aborting fold." 
- logger.error(error_msg) - # Use sys.exit with a specific message for clarity - sys.exit(f"Fold {self.current_fold}: GRU validation gates failed (Edge Acc / Brier Score).") - else: # Corrected indentation - logger.info(f"Fold {self.current_fold}: GRU validation checks passed (Edge Acc & Brier Score).") # Corrected indentation - # --- End Validation Helper --- # - - def train_or_load_sac(self): - """Trains a new SAC agent offline or loads a pre-trained one for backtesting.""" - logging.info("--- Stage: Training or Loading SAC Agent ---") - train_sac_flag = self.config['control'].get('train_sac', False) - - if train_sac_flag: - if self.gru_model_run_id_loaded_from is None: - logging.error("Cannot run SAC training: GRU model run ID is not set (no model trained or loaded). Aborting.") - sys.exit(1) - - logging.info(f"SAC training is enabled. Instantiating SACTrainer...") - - # --- Determine Edge Threshold for SAC --- # - # Use the threshold determined during calibration (optimized or default) - edge_threshold_for_sac = self.optimized_edge_threshold if self.optimized_edge_threshold is not None else \ - self.config.get('calibration', {}).get('edge_threshold', 0.1) - logger.info(f"Using edge threshold {edge_threshold_for_sac:.4f} for SAC Trainer (heuristic seeding / env info)...") - - # --- Prepare Config for SAC Trainer --- # - # Create a copy of the config to potentially pass modified values - # Note: SACTrainer should ideally accept parameters like edge_threshold directly - # For now, we modify the dict copy passed to its constructor. - sac_trainer_config = self.config.copy() - # Ensure the calibration section reflects the threshold to be used - if 'calibration' not in sac_trainer_config: sac_trainer_config['calibration'] = {} - sac_trainer_config['calibration']['edge_threshold'] = edge_threshold_for_sac - # Also disable rolling calibration in the copy if it was enabled in the main config - if sac_trainer_config.get('calibration', {}).get('rolling_enabled', False): - logger.warning("SAC training enabled AND rolling calibration enabled. Disabling rolling calibration for the SAC training environment to prevent data leakage.") - sac_trainer_config['calibration']['rolling_enabled'] = False - # --- End Prepare Config --- # - - # Instantiate SACTrainer, passing necessary base directories from the main pipeline - # Ensure logs/results dirs exist - base_logs = self.dirs.get('logs') - if not base_logs: - base_logs = os.path.join(project_root, 'logs') - os.makedirs(base_logs, exist_ok=True) - logging.warning(f"Using default base logs dir for SACTrainer: {base_logs}") - - base_results = self.dirs.get('results') - if not base_results: - base_results = os.path.join(project_root, 'results') - os.makedirs(base_results, exist_ok=True) - logging.warning(f"Using default base results dir for SACTrainer: {base_results}") - - self.sac_trainer = SACTrainer( - config=sac_trainer_config, # Pass the potentially modified config - base_models_dir=self.base_models_dir_path, - base_logs_dir=base_logs, - base_results_dir=base_results - ) - - # --- Remove the old Revision 1 block that modified self.sac_trainer.config --- # - # # --- Revision 1: Handle Rolling Calibrator Conflict --- # - # ... (block removed) ... - # # --- End Revision 1 --- # - - # Start the training process - final_agent_path = self.sac_trainer.train(gru_run_id_for_sac=self.gru_model_run_id_loaded_from) - - if final_agent_path: - logger.info(f"SAC training completed. 
Final agent saved at: {final_agent_path}") - # Set the agent path to the newly trained agent for subsequent backtesting - self.sac_agent_load_path = final_agent_path - - # --- V3 Output Contract: Plot SAC Reward Curve --- # - if self.io and self.config.get('control', {}).get('generate_plots', True): - # Path to the rewards CSV logged by SACTrainer - # sac_trainer instance should have the sac_run_id and logs_dir path - sac_log_dir = self.sac_trainer.sac_run_logs_dir - rewards_csv_path = os.path.join(sac_log_dir, 'episode_rewards.csv') - - if os.path.exists(rewards_csv_path): - logging.info(f"Plotting SAC reward curve from {rewards_csv_path}...") - try: - rewards_df = pd.read_csv(rewards_csv_path) - - if not rewards_df.empty and 'episode_reward' in rewards_df.columns and 'total_step' in rewards_df.columns: - # Calculate EMA of reward - rewards_df['reward_ema'] = rewards_df['episode_reward'].ewm(alpha=0.2, adjust=False).mean() - - # Get figure settings - fig_dpi = self.config.get('output', {}).get('figure_dpi', 150) - fig_size = self.config.get('output', {}).get('figure_size', [16, 9]) - footer_text = "© GRU-SAC v3" - - plt.style.use('seaborn-v0_8-darkgrid') - fig, ax1 = plt.subplots(figsize=fig_size) - - color1 = 'tab:blue' - ax1.set_xlabel('Training Steps') - ax1.set_ylabel('Smoothed Episode Reward (EMA 0.2)', color=color1) - ax1.plot(rewards_df['total_step'], rewards_df['reward_ema'], color=color1, label='Reward EMA (0.2)') - ax1.tick_params(axis='y', labelcolor=color1) - ax1.grid(True, linestyle='--', alpha=0.6) - - # --- Placeholder for Action Variance / Checkpoints (Not currently logged) --- - # logging.warning("Action variance and checkpoint steps not currently logged in episode_rewards.csv. Omitting from plot.") - # ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis - # color2 = 'tab:red' - # ax2.set_ylabel('Action Variance', color=color2) # we already handled the x-label with ax1 - # ax2.plot(steps, action_variance_data, color=color2, linestyle=':', label='Action Variance') - # ax2.tick_params(axis='y', labelcolor=color2) - # Add checkpoint vertical lines: ax1.axvline(x=chkpt_step, color='grey', linestyle='--', linewidth=0.5) - # --- End Placeholder --- - - fig.suptitle('SAC Training Reward Curve', fontsize=16) - # Add footer - plt.figtext(0.99, 0.01, footer_text, horizontalalignment='right', - verticalalignment='bottom', fontsize=8, color='gray') - - plt.tight_layout(rect=[0, 0.03, 1, 0.95]) - - # Save figure using IOManager (save to the main pipeline's results dir) - self.io.save_figure(fig, "sac_reward_plot", section='results') - logging.info("SAC reward curve plot saved.") - plt.close(fig) - else: - logging.warning("Episode rewards CSV is empty or missing required columns ('episode_reward', 'total_step'). Skipping plot.") - except FileNotFoundError: - logging.warning(f"SAC rewards file not found at {rewards_csv_path}. Cannot plot reward curve.") - except Exception as e: - logging.error(f"Failed to plot SAC reward curve: {e}", exc_info=True) - else: - logging.warning(f"SAC rewards file not found at {rewards_csv_path}. Cannot plot reward curve.") - elif not self.io: - logging.warning("IOManager not available, skipping SAC reward curve plot.") - # --- End Plot SAC Reward Curve --- # - - else: - logger.error("SAC training failed. Proceeding without a newly trained agent.") - # Decide whether to fallback to loading or abort? Fallback for now. 
- self.sac_agent_load_path = self._determine_sac_load_path_from_config() - if self.sac_agent_load_path: - logger.warning(f"Falling back to loading SAC agent specified in config: {self.sac_agent_load_path}") - else: - logger.error("SAC training failed and no load path specified in config. Cannot proceed with backtesting.") - # Optionally exit: sys.exit(1) - # For now, allow pipeline to continue, backtester should handle None path - - else: # Load SAC agent based on config for backtesting - logging.info("SAC training is disabled (train_sac=False). Determining agent path to load for backtesting...") - self.sac_agent_load_path = self._determine_sac_load_path_from_config() - if self.sac_agent_load_path: - logger.info(f"SAC agent path for backtesting set to load from: {self.sac_agent_load_path}") - else: - logger.warning("No 'sac_load_run_id' specified in config. Backtester will need to handle using untrained/initial weights.") - - def _determine_sac_load_path_from_config(self) -> str | None: - """Helper to determine the SAC agent load path based on config control flags.""" - load_run_id = self.config['control'].get('sac_load_run_id') - load_step = self.config['control'].get('sac_load_step', 'final') - sac_agent_path = None - if load_run_id: - # Construct path assuming structure like: //agent_.pt - # The sac_train_run_id usually differs from the pipeline run_id - models_base = self.base_models_dir_path # Use the stored base models path - # Assume the SAC trainer saves checkpoints inside its own run folder (e.g., models/sac_train_.../sac_agent_final) - if load_step == 'final': - # SAC trainer saves final model in a folder named 'sac_agent_final' - sac_agent_path = os.path.join(models_base, load_run_id, 'sac_agent_final') - else: - # SAC trainer saves step checkpoints in folder 'sac_agent_step_N' - sac_agent_path = os.path.join(models_base, load_run_id, f'sac_agent_step_{load_step}') - - # Check if the determined path exists - if not os.path.exists(sac_agent_path): - logger.warning(f"Determined SAC load path does not exist: {sac_agent_path}. Will proceed without loading specified agent.") - sac_agent_path = None # Reset path if not found - - return sac_agent_path - - def run_backtest(self): - """Runs the backtest and checks performance criteria for the current fold.""" - logger.info(f"--- Fold {self.current_fold}: Stage: Running Backtest ---") - - # --- Extract original prices for backtest --- # - if self.df_test_original is None or self.df_test_original.empty: - logger.error(f"Fold {self.current_fold}: Original test data (df_test_original) is missing. Cannot run backtest.") - sys.exit(f"Fold {self.current_fold}: Missing original test data for backtest.") - - # Ensure required columns exist - required_cols = ['open', 'high', 'low', 'close', 'volume'] - if not all(col in self.df_test_original.columns for col in required_cols): - missing_cols = [col for col in required_cols if col not in self.df_test_original.columns] - logger.error(f"Fold {self.current_fold}: Original test data missing required price columns: {missing_cols}. 
Cannot run backtest.") - sys.exit(f"Fold {self.current_fold}: Missing required price columns in original test data.") - - original_prices = self.df_test_original[required_cols] - # --- End Price Extraction --- # - - # Run the backtest using the Backtester instance - # Note: Need to handle potential absence of test sequences if fold is too short - if self.X_test_seq is None or self.y_test_seq_dict is None or self.test_indices is None: - logger.warning(f"Fold {self.current_fold}: Test sequences not available (likely due to fold length/lookback). Skipping backtest stage for this fold.") - # Set results to None to indicate skip - self.backtest_results_df = None - self.backtest_metrics = None - self.metrics_log_df = None - return # Skip rest of the backtest stage - - # Pass the appropriate calibrator instance - calibrator_instance = None - vector_calibrator_instance = None - calibration_method = self.config.get('calibration',{}).get('method') - if calibration_method == 'temperature': - calibrator_instance = self.calibrator - if not hasattr(self, 'optimal_T'): # Ensure optimal_T was set - logger.error(f"Fold {self.current_fold}: Temperature calibration selected but optimal_T not found.") - raise SystemExit(f"Fold {self.current_fold}: Missing optimal_T for backtest.") - elif calibration_method == 'vector': - vector_calibrator_instance = self.vector_calibrator - if not hasattr(self, 'vector_cal_params'): # Ensure params were set - logger.error(f"Fold {self.current_fold}: Vector calibration selected but vector_cal_params not found.") - raise SystemExit(f"Fold {self.current_fold}: Missing vector_cal_params for backtest.") - - # Get raw predictions needed for rolling calibration - p_raw_test_for_bt = None - logits_test_for_bt = None - is_ternary = self.config.get('gru', {}).get('use_ternary_output', False) # Need to know if ternary - if self.config.get('calibration', {}).get('rolling_enabled', False): - logger.info(f"Fold {self.current_fold}: Getting raw GRU outputs for rolling calibration...") - if is_ternary: - logits_test_for_bt = self.gru_handler.predict_logits(self.X_test_seq) - if logits_test_for_bt is None: - logger.error(f"Fold {self.current_fold}: Failed to get GRU logits for rolling calibration.") - raise SystemExit(f"Fold {self.current_fold}: Failed GRU logit prediction.") - else: # Corrected indentation - preds_test_raw = self.gru_handler.predict(self.X_test_seq) - if preds_test_raw is None or len(preds_test_raw) < 3: - logger.error(f"Fold {self.current_fold}: Failed to get GRU raw predictions for rolling calibration.") - raise SystemExit(f"Fold {self.current_fold}: Failed GRU raw prediction.") - p_raw_test_for_bt = preds_test_raw[2].flatten() # Assuming index 2 is probabilities - - # Get the edge threshold determined during validation (optimized or fixed) - edge_threshold_for_bt = getattr(self, 'fold_edge_threshold', self.config.get('calibration', {}).get('edge_threshold', 0.1)) - logger.info(f"Fold {self.current_fold}: Using edge threshold {edge_threshold_for_bt:.4f} for backtest execution.") - - try: # Corrected indentation - self.backtest_results_df, self.backtest_metrics, self.metrics_log_df = self.backtester.run_backtest( - sac_agent_load_path=self.sac_agent_load_path, - X_test_seq=self.X_test_seq, - y_test_seq_dict=self.y_test_seq_dict, - test_indices=self.test_indices, - gru_handler=self.gru_handler, - # --- Pass Calibrator instances and initial state --- # - calibrator=calibrator_instance, - vector_calibrator=vector_calibrator_instance, - initial_optimal_T=getattr(self, 
'optimal_T', None), # Pass T if exists - initial_vector_params=getattr(self, 'vector_cal_params', None), # Pass params if exists - fold_edge_threshold=edge_threshold_for_bt, - # --- Pass raw predictions if needed for rolling cal --- # - p_raw_test=p_raw_test_for_bt, - logits_test=logits_test_for_bt, - # --- Pass original prices --- # - original_prices=self.df_test_original, # Pass the DataFrame - is_ternary=self.use_ternary, - fold_num=self.current_fold - ) - except SystemExit as e: # Corrected indentation - # Catch exits from backtester validation/execution - logger.error(f"Fold {self.current_fold}: Backtest aborted: {e}") - raise # Re-raise to stop the fold - except Exception as e: # Corrected indentation - logger.error(f"Fold {self.current_fold}: Unhandled error during backtester.run_backtest: {e}", exc_info=True) - # Treat as failure, ensure metrics are None - self.backtest_results_df = None - self.backtest_metrics = None - self.metrics_log_df = None - # Re-raise or exit? Let the main pipeline catch it. - raise SystemExit(f"Fold {self.current_fold}: Backtest execution failed unexpectedly.") from e - - if self.backtest_results_df is None or self.backtest_metrics is None: - # This case should ideally be caught by exceptions above now - logger.error(f"Fold {self.current_fold}: Backtesting failed to produce results (post-execution check).") - raise SystemExit(f"Fold {self.current_fold}: Backtest failed to produce results.") - else: - logger.info(f"Fold {self.current_fold}: Backtest completed successfully.") - - # --- Backtest Success Criteria Check (Now redundant as checks are inside run_backtest) --- # - # logger.info(f"Fold {self.current_fold}: Checking backtest performance against success criteria...") - # ... (Remove the check block here) ... - # --- End Backtest Check --- # - - def save_results(self): - """Saves backtest results, metrics, and plots using the Backtester instance for the current fold.""" - logger.info(f"--- Fold {self.current_fold}: Stage: Saving Results --- ") - if self.backtest_results_df is None or self.backtest_metrics is None: - logger.warning(f"Fold {self.current_fold}: No backtest results available to save. Skipping.") - return - - # Use IOManager to get fold-specific results directory if possible - results_dir = self.fold_dirs.get('results') - if not results_dir: - logger.warning(f"Fold {self.current_fold}: Results dir for fold not found. Saving results to main run dir.") - results_dir = self.dirs.get('results') # Fallback to main run results - - if not results_dir: - logger.error(f"Fold {self.current_fold}: Could not determine valid results directory. Cannot save backtest results.") - return - - # Pass results to the backtester's save method - self.backtester.save_results( - results_df=self.backtest_results_df, - metrics=self.backtest_metrics, - results_dir=results_dir, # Pass the determined directory - run_id=self.run_id, # Pass overall run_id for context in plots/reports - metrics_log_df=self.metrics_log_df, - fold_num=self.current_fold # Pass fold number for unique filenames - ) - - def evaluate_feature_ab_test(self, feature_name, feature_values): - """ - Performs A/B test for a new candidate feature. 
- - Args: - feature_name (str): Name of the candidate feature - feature_values (pd.Series or np.array): Values of the feature to test - - Returns: - tuple: (passed_gate, improvement, p_value) - whether feature improved accuracy by ≥1% with p<0.05 - """ - logging.info(f"--- A/B Testing Feature: {feature_name} ---") - - if self.X_train_scaled is None or self.y_train is None: - logging.error("Scaled features or targets not available for A/B test. Skipping.") - return False, 0, 1.0 - - horizon = self.config['gru'].get('prediction_horizon', 5) - target_dir_col = f'direction_label_{horizon}' - - if target_dir_col not in self.y_train.columns: - logging.error(f"Target direction column '{target_dir_col}' not found in y_train. Skipping A/B test.") - return False, 0, 1.0 - - y_train_dir = self.y_train[target_dir_col] - - try: - # Split train into teaching and validation sets - X_teach, X_val_subset, y_teach, y_val_subset = train_test_split( - self.X_train_scaled, y_train_dir, test_size=0.2, shuffle=False - ) - - # Baseline model (A) - without the new feature - model_a = LogisticRegression(max_iter=1000, solver="lbfgs", random_state=42) - model_a.fit(X_teach, y_teach) - y_pred_a = model_a.predict(X_val_subset) - accuracy_a = (y_pred_a == y_val_subset).mean() - - # Add the new feature to X_teach and X_val_subset - if len(feature_values) != len(self.X_train_scaled): - logging.error(f"Feature length mismatch: feature has {len(feature_values)} values, but X_train has {len(self.X_train_scaled)} rows") - return False, 0, 1.0 - - # Create copies of data with the new feature added - X_teach_b = X_teach.copy() - X_val_subset_b = X_val_subset.copy() - - # Determine which indices to use from the feature_values - teach_indices = X_teach.index - val_indices = X_val_subset.index - - # Add feature to both datasets - if isinstance(feature_values, pd.Series): - # If it's a Series, align by index - X_teach_b[feature_name] = feature_values.loc[teach_indices] - X_val_subset_b[feature_name] = feature_values.loc[val_indices] - else: - # If it's a numpy array, we need the original indices in the full dataset - # This assumes X_teach and X_val_subset came from contiguous parts of X_train - X_teach_b[feature_name] = feature_values[:len(X_teach)] - X_val_subset_b[feature_name] = feature_values[len(X_teach):len(X_teach)+len(X_val_subset)] - - # Model with new feature (B) - model_b = LogisticRegression(max_iter=1000, solver="lbfgs", random_state=42) - model_b.fit(X_teach_b, y_teach) - y_pred_b = model_b.predict(X_val_subset_b) - accuracy_b = (y_pred_b == y_val_subset).mean() - - # Calculate improvement - improvement = accuracy_b - accuracy_a - - # Calculate statistical significance with two-proportion z-test - n = len(y_val_subset) - count_correct_a = int(accuracy_a * n) - count_correct_b = int(accuracy_b * n) - - # Use proportion_test from statsmodels for the z-test - from statsmodels.stats.proportion import proportions_ztest - - # Format data for the test - count = np.array([count_correct_a, count_correct_b]) - nobs = np.array([n, n]) - - # Perform the test (alternative='larger' tests if B > A) - z_stat, p_value = proportions_ztest(count, nobs, alternative='larger') - - # Determine if the feature passes the gate: B-A ≥ 0.01 and p < 0.05 - passes_gate = improvement >= 0.01 and p_value < 0.05 - - logging.info(f"A/B Test Results for '{feature_name}':") - logging.info(f" Baseline accuracy (A): {accuracy_a:.3f}") - logging.info(f" With new feature (B): {accuracy_b:.3f}") - logging.info(f" Improvement (B-A): {improvement:.3f}") - 
logging.info(f" p-value: {p_value:.5f}") - logging.info(f" Passes gate (B-A ≥ 0.01 and p < 0.05): {passes_gate}") - - return passes_gate, improvement, p_value - - except Exception as e: - logging.error(f"Failed to perform A/B test for feature '{feature_name}': {e}", exc_info=True) - return False, 0, 1.0 - - # --- Wrapper Methods for Notebook Step-by-Step Execution --- - - def load_data(self): - """Wrapper for load_and_preprocess_data for notebook execution.""" - logging.info("--- Notebook Step: Load Data (Calling load_and_preprocess_data) ---") - self.load_and_preprocess_data() - # Store the primary result on self for notebook inspection - self.raw_data = self.df_raw - logging.info(f"Stored raw_data attribute. Shape: {self.raw_data.shape if self.raw_data is not None else 'None'}") - - def prepare_sequences(self): - """Wrapper for the sequence preparation steps for notebook execution.""" - logging.info("--- Notebook Step: Prepare Sequences (Calling internal steps) ---") - # Call the internal steps in the correct order - self.define_labels_and_align() - self.split_data() - self.select_and_prune_features() - self.scale_features() - # self.run_baseline_checks() # Optionally include if desired in this step - self.create_sequences() - logging.info("Finished sequence preparation steps.") - # Store key results on self for notebook inspection (add more as needed) - self.train_sequences = self.X_train_seq - self.val_sequences = self.X_val_seq - self.test_sequences = self.X_test_seq - self.train_targets = self.y_train_seq_dict # Assuming create_sequences stores dict here - self.val_targets = self.y_val_seq_dict - self.test_targets = self.y_test_seq_dict - - def calibrate_predictions(self): - """Wrapper for calibrate_probabilities for notebook execution.""" - logging.info("--- Notebook Step: Calibrate Predictions (Calling calibrate_probabilities) ---") - self.calibrate_probabilities() - # Store results on self for notebook inspection - self.optimal_threshold = self.optimal_T # Keep existing name for compatibility? Or use optimal_T? - self.optimal_calibration_params = self.vector_cal_params if self.use_ternary else self.optimal_T # Unified name - logging.info(f"Stored optimal_calibration_params: {self.optimal_calibration_params}") - - # --- Main Execution Method --- - - def execute(self): - """Runs the full trading pipeline end-to-end.""" - logger.info(f"--- Starting Trading Pipeline: Run ID {self.run_id} ---") - - # 1. Load and Preprocess Data - self.load_and_preprocess_data() - if self.data_processed is None: # Check if data loading failed - logger.error("Data loading failed. Exiting pipeline.") - return - - # 2. Engineer Features - self.engineer_features() - - # 3. Define Labels and Align - self.define_labels_and_align() - if self.data_processed is None: # Check if label generation failed - logger.error("Label generation failed. Exiting pipeline.") - return - - # 4. Split Data - self.split_data() - - # 5. Scale Features - self.scale_features() - - # --- MODIFIED ORDER --- - # 6. Baseline Checks (Now before pruning and sequencing) - self.run_baseline_checks() # Exits if baseline fails - logger.info("Baseline checks passed.") - - # 7. Select/Prune Features (Now before sequencing) - self.select_and_prune_features() - - # 8. Create Sequences (Now after scaling, baseline, pruning) - self.create_sequences() - # --- END MODIFIED ORDER --- - - # 9. Train/Load GRU Model - self.train_or_load_gru() - - # 10. 
Calibrate Probabilities - self.calibrate_probabilities() - if self.gru_model_handler is None or self.gru_model_handler.model is None: - logger.warning("GRU model not available, skipping edge accuracy check.") - elif not hasattr(self, 'p_cal_val'): - logger.warning("Calibrated validation probabilities not found, skipping edge accuracy check.") - else: - # Perform edge accuracy check only if calibration happened and model exists - self._perform_gru_validation_checks( - p_cal_val=self.p_cal_val, - y_dir_val=self.y_dir_val, - is_ternary=self.use_ternary - ) - - # 11. Train/Load SAC Agent - self.train_or_load_sac() - - # 12. Run Backtest - self.run_backtest() - - # 13. Save Results & Final Validation - self.save_results() # Includes final Sharpe/DD checks, exits if failed - - logger.info(f"--- Trading Pipeline Finished: Run ID {self.run_id} ---") - - # --- Walk-Forward Fold Generation --- # - def _generate_walk_forward_folds(self) -> Iterator[Tuple[pd.Timestamp, pd.Timestamp, pd.Timestamp, pd.Timestamp, pd.Timestamp, pd.Timestamp]]: - """ - Generates start and end timestamps for train, validation, and test sets - for each walk-forward fold based on config settings. - Requires self.df_raw to be loaded first to determine the full date range. - """ - wf_config = self.config.get('walk_forward', {}) - if not wf_config.get('enabled', False): - logger.info("Walk-forward validation disabled. Performing single split.") - # Yield a single pseudo-fold covering the entire dataset range - # The split_data method will handle the ratio-based split for this single fold - yield (self.df_raw.index.min(), self.df_raw.index.max(), - None, None, # Placeholder val dates - None, None) # Placeholder test dates - return - - train_days = wf_config.get('train_days', 60) - val_days = wf_config.get('val_days', 14) - test_days = wf_config.get('test_days', 14) - step_days = wf_config.get('step_days', 14) - offset_days = wf_config.get('initial_offset_days', 0) - - if not isinstance(self.df_raw.index, pd.DatetimeIndex): - raise ValueError("Raw data index must be DatetimeIndex for walk-forward validation.") - - full_start_date = self.df_raw.index.min() + timedelta(days=offset_days) - full_end_date = self.df_raw.index.max() - current_start = full_start_date - - logger.info(f"Generating Walk-Forward Folds: Train={train_days}d, Val={val_days}d, Test={test_days}d, Step={step_days}d") - logger.info(f"Full Data Range for Folds: {full_start_date} to {full_end_date}") - - fold_num = 0 - while True: - train_start = current_start - train_end = train_start + timedelta(days=train_days) - val_start = train_end - val_end = val_start + timedelta(days=val_days) - test_start = val_end - test_end = test_start + timedelta(days=test_days) - - # Check if the test period goes beyond the available data - if test_end > full_end_date: - logger.info(f"Stopping fold generation. Next test period ({test_start} to {test_end}) exceeds available data end date ({full_end_date}).") - break - - # Ensure we have at least some data in each period (basic check) - if train_end <= train_start or val_end <= val_start or test_end <= test_start: - logger.warning(f"Fold {fold_num}: Invalid date ranges calculated. 
Skipping.") - # Advance start date and retry - current_start += timedelta(days=step_days) - continue - - logger.info(f" Fold {fold_num}: Train=[{train_start}, {train_end}), Val=[{val_start}, {val_end}), Test=[{test_start}, {test_end})") - yield (train_start, train_end, val_start, val_end, test_start, test_end) - - # Advance the start for the next fold - current_start += timedelta(days=step_days) - fold_num += 1 - - if fold_num == 0: - logger.error("No valid walk-forward folds could be generated. Check data range and WF config.") - - # --- Core Pipeline Steps (Modified for Potential Fold Context) --- # - # Methods like load_and_preprocess_data, engineer_features, etc., - # might need adjustments if they rely heavily on `self` state - # that changes per fold. For now, assume they operate on data passed - # or reset their relevant `self` attributes appropriately. - - # Example modification for split_data: - def split_data(self, df_fold_data: pd.DataFrame, fold_dates: Tuple = None): - """Splits features and targets for a given fold based on dates or ratios.""" - logging.info("--- Stage: Splitting Data for Fold --- ") - # ... (Keep internal logic, but operate on df_fold_data) ... - # ... (Use fold_dates if provided for WF, else use ratios for single split) ... - if fold_dates and fold_dates[2] is not None: # Walk-forward fold - train_start, train_end, val_start, val_end, test_start, test_end = fold_dates - # Select data based on dates - self.X_train_raw = df_fold_data[train_start:train_end].drop(columns=self.target_columns) - self.y_train = df_fold_data.loc[self.X_train_raw.index, self.target_columns] - self.y_dir_train = df_fold_data.loc[self.X_train_raw.index, self.target_dir_col] - # ... similar slicing for val and test ... - self.X_val_raw = df_fold_data[val_start:val_end].drop(columns=self.target_columns) - self.y_val = df_fold_data.loc[self.X_val_raw.index, self.target_columns] - self.X_test_raw = df_fold_data[test_start:test_end].drop(columns=self.target_columns) - self.y_test = df_fold_data.loc[self.X_test_raw.index, self.target_columns] - # Store original data slices too - self.df_train_original = df_fold_data[train_start:train_end] - self.df_val_original = df_fold_data[val_start:val_end] - self.df_test_original = df_fold_data[test_start:test_end] - else: # Single split (WF disabled or first pseudo-fold) - split_cfg = self.config['split_ratios'] # Fallback to ratios - # ... (existing ratio-based split logic using df_fold_data) ... - total_len = len(df_fold_data) - train_end_idx = int(total_len * split_cfg['train']) - val_end_idx = int(total_len * (train_ratio + val_ratio)) - # ... etc ... - # ... (Log split shapes) ... 
+ # --- End Updated call --- # + + # Update self state with the results from the stage function + self.X_train_raw = X_train_raw + self.y_train = y_train + self.X_val_raw = X_val_raw + self.y_val = y_val + self.X_test_raw = X_test_raw + self.y_test = y_test + self.df_train_original = df_train_original + self.df_val_original = df_val_original + self.df_test_original = df_test_original + self.y_dir_train_ordinal = y_dir_train_ordinal + # --- Store split returns and eps --- # + self.fwd_ret_train = fwd_ret_train + self.eps_train = eps_train + self.fwd_ret_val = fwd_ret_val + self.eps_val = eps_val + self.y_dir_val_ordinal = y_dir_val_ordinal # <<< ADDED + # --- End Store --- # + + logger.info(f"Fold {self.current_fold}: Data splitting stage complete.") # --- Baseline moved earlier (operates on raw/engineered fold train data) --- - def run_baseline_checks(self, X_train_fold_raw: pd.DataFrame, y_train_fold_dir: pd.Series): - """Runs baseline Logistic Regression check on fold's raw/engineered training data. - Called *before* scaling and pruning. - """ - logger.info(f"--- Fold {self.current_fold}: Stage: Baseline Checks (Logistic Regression on Raw/Engineered Features) --- ") - - # Skip if ternary - if self.use_ternary: - logger.warning(f"Fold {self.current_fold}: Using ternary labels. Skipping binary Logistic Regression baseline check.") - return - - # Check inputs - if X_train_fold_raw is None or y_train_fold_dir is None: - logger.error(f"Fold {self.current_fold}: Raw training data not available for baseline check. Skipping.") - return - - baseline_gate_cfg = self.config.get('validation_gates', {}).get('baseline', {}) - required_ci_lb = baseline_gate_cfg.get('ci_lower_bound_threshold', 0.52) # Default if not in config - logger.info(f"Fold {self.current_fold}: Baseline check required CI lower bound >= {required_ci_lb}") + def run_baseline_checks(self, fold_num: int): + """Placeholder method to call the baseline check stage function.""" + logger.info(f"--- Fold {fold_num}: Invoking Baseline Check Stage --- ") - # --- Use BaselineChecker --- # + if self.baseline_checker is None: + logger.error(f"Fold {fold_num}: BaselineChecker not initialized. Cannot run baseline checks.") + # Optionally, initialize it here if appropriate, or ensure it's done in __init__ or setup + # For now, we'll rely on it being initialized earlier. 
+ # raise SystemExit(f"Fold {fold_num}: BaselineChecker missing.") + # Re-initializing here if logic allows: + if self.config.get('validation_gates', {}).get('run_baseline_check', False): + logger.info(f"Fold {fold_num}: Initializing BaselineChecker for baseline checks.") + self.baseline_checker = BaselineChecker(self.config) + else: + logger.warning(f"Fold {fold_num}: Baseline checks are disabled in config, but BaselineChecker was not initialized.") + # Decide if this is an error or just a skip condition + return # Skip if checks disabled and checker not ready + + # Retrieve necessary data from state + X_train_raw = self.X_train_raw + y_train_dir = self.y_dir_train_ordinal # Get the specific direction labels from split_data + use_ternary = self.use_ternary # Already stored on self + fold_dirs = self.fold_dirs # Already stored on self + + # Check if baseline check should run (based on config and ternary status) + run_check = self.config.get('validation_gates', {}).get('run_baseline_check', False) + if not run_check: + logger.info(f"Fold {fold_num}: Skipping baseline checks as per configuration.") + return + if use_ternary: + logger.warning(f"Fold {fold_num}: Skipping baseline checks as ternary targets are enabled.") + return + + if X_train_raw is None or y_train_dir is None: + logger.error(f"Fold {fold_num}: Cannot run baseline checks. Missing raw training features or direction labels in state.") + # Decide on error handling: maybe SystemExit or just log and continue depending on strictness + raise SystemExit(f"Fold {fold_num}: Critical data missing for baseline check.") + + # Call the stage function from evaluation.py try: - # Run the baseline check using the checker - baseline_report = self.baseline_checker.run_logistic_baseline( - X_train_fold_raw, # Pass the fold's raw/engineered training features - y_train_fold_dir # Pass the fold's training direction labels + run_baseline_checks_fold( + X_train_scaled=self.X_train_scaled, + X_val_scaled=self.X_val_scaled, + y_train_dir_ordinal=self.y_dir_train_ordinal, + fwd_ret_train=self.fwd_ret_train, + eps_train=self.eps_train, + fwd_ret_val=self.fwd_ret_val, + eps_val=self.eps_val, + baseline_checker=self.baseline_checker, # Pass the initialized instance + config=self.config, + io=self.io, # Pass the IOManager instance + fold_num=fold_num, + fold_dirs=fold_dirs, # Pass fold-specific directories + base_results_dir=self.io.run_results_dir if self.io else '.' # Fallback save location ) - - # --- Save Baseline Report --- # - if self.io: - try: - # Ensure fold-specific directory exists if possible - fold_results_dir = self.fold_dirs.get('results') - if not fold_results_dir: - logger.warning(f"Fold {self.current_fold}: Results dir for fold not found. Saving baseline report to main run dir.") - fold_results_dir = self.dirs.get('results') # Fallback to main results - - if fold_results_dir: # Check again after fallback - self.io.save_json( - baseline_report, - f"baseline_report_fold_{self.current_fold}", - #section='results', # IOManager prepends run_id/results, need relative path within - base_dir=fold_results_dir, # Save directly to fold dir - use_txt=True # Save as .txt - ) - logger.info(f"Fold {self.current_fold}: Saved baseline report.") - else: - logger.warning(f"Fold {self.current_fold}: Could not determine valid results directory. 
Skipping baseline report save.") - except Exception as e: - logger.error(f"Fold {self.current_fold}: Failed to save baseline report: {e}") - else: - logger.warning(f"Fold {self.current_fold}: IOManager not available, skipping saving of baseline report.") - # --- End Save --- # - - # --- Success Criteria Check --- # - ci_lower_bound = baseline_report.get("ci_lower_bound") - - if ci_lower_bound is None or np.isnan(ci_lower_bound): - error_msg = f"FOLD {self.current_fold} BASELINE CHECK FAILED: Could not determine CI lower bound. Aborting fold." - logger.error(error_msg) - sys.exit(f"Fold {self.current_fold}: Baseline CI lower bound calculation failed.") - elif ci_lower_bound < required_ci_lb: - error_msg = f"FOLD {self.current_fold} BASELINE CHECK FAILED: Logistic Regression 95% CI lower bound ({ci_lower_bound:.3f}) is below {required_ci_lb} threshold. Aborting fold." - logger.error(error_msg) - sys.exit(error_msg) - else: - success_msg = f"Fold {self.current_fold}: Baseline check passed! Logistic hit-rate 95%-CI lower bound: {ci_lower_bound:.3f} (>= {required_ci_lb})" - logger.info(success_msg) - # --- End Success Criteria Check --- # - - except SystemExit: - raise # Reraise SystemExit to stop the fold + logger.info(f"Fold {fold_num}: Baseline check stage completed successfully.") + except SystemExit as e: + logger.error(f"Fold {fold_num} failed baseline checks. Raising SystemExit to halt pipeline. Reason: {e}") + raise # Re-raise SystemExit to stop the pipeline except Exception as e: - logger.error(f"Fold {self.current_fold}: An error occurred during baseline checks: {e}. Aborting fold.", exc_info=True) - # Treat other exceptions as fatal for the fold - sys.exit(f"Fold {self.current_fold}: Unhandled exception during baseline check.") + logger.error(f"Fold {fold_num}: An unexpected error occurred during the baseline check stage: {e}", exc_info=True) + # Depending on policy, either raise SystemExit or just log and potentially continue + raise SystemExit(f"Fold {fold_num}: Unhandled exception in baseline check stage.") from e # --- execute method refactored for Walk-Forward --- # def execute(self): @@ -2295,166 +529,254 @@ class TradingPipeline: # 2. Generate Walk-Forward Folds fold_generator = self._generate_walk_forward_folds() - all_fold_metrics = [] # Store metrics from each fold + self.all_fold_metrics = [] # Reset fold metrics list for the run all_successful_sac_agent_paths = [] # Store paths of successfully trained SAC agents per fold fold_count = 0 # 3. 
Loop Through Folds for fold_dates in fold_generator: fold_count += 1 - logger.info(f"=== Processing Fold {fold_count} ===") + self.current_fold = fold_count # Set current fold number + logger.info(f"=== Processing Fold {self.current_fold} ===") + + # --- Fix Start: Handle single split case --- + if fold_dates is None: + # Calculate dates for single split based on ratios + if not self.df_raw.empty and isinstance(self.df_raw.index, pd.DatetimeIndex): + full_start_date = self.df_raw.index.min() + full_end_date = self.df_raw.index.max() + full_duration = full_end_date - full_start_date + + split_ratios = self.config.get('walk_forward', {}).get('split_ratios', {'train': 0.7, 'validation': 0.15}) # Default ratios + train_ratio = split_ratios.get('train', 0.7) + val_ratio = split_ratios.get('validation', 0.15) + test_ratio = 1.0 - train_ratio - val_ratio + + if not (0 < train_ratio < 1 and 0 < val_ratio < 1 and 0 <= test_ratio < 1 and abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9): + logger.error(f"Invalid split_ratios in config: {split_ratios}. Ratios must sum to 1. Exiting.") + raise SystemExit("Invalid split_ratios configuration.") + + train_end_offset = full_start_date + full_duration * train_ratio + val_end_offset = train_end_offset + full_duration * val_ratio + + # Find nearest index points + train_start = full_start_date + # Find the index *before or at* the calculated end time + train_end_idx = self.df_raw.index.get_indexer([train_end_offset], method='ffill')[0] + train_end = self.df_raw.index[train_end_idx] + + val_start_idx = train_end_idx + 1 + if val_start_idx >= len(self.df_raw.index): + logger.error("Not enough data points for validation split based on ratios. Exiting.") + raise SystemExit("Insufficient data for validation split.") + val_start = self.df_raw.index[val_start_idx] + + val_end_idx = self.df_raw.index.get_indexer([val_end_offset], method='ffill')[0] + # Ensure val_end is after val_start + val_end_idx = max(val_end_idx, val_start_idx) + if val_end_idx >= len(self.df_raw.index): # If val_end_offset calculation goes beyond data + val_end = full_end_date + else: + val_end = self.df_raw.index[val_end_idx] + + test_start_idx = val_end_idx + 1 + if test_ratio > 1e-9 and test_start_idx < len(self.df_raw.index): # Only if there's a test set and data left + test_start = self.df_raw.index[test_start_idx] + test_end = full_end_date + else: # No test set or no data left for it + test_start = None + test_end = None + # Adjust val_end to be the absolute end if there's no test set + if test_ratio <= 1e-9: + val_end = full_end_date + logger.info("No test set defined by split_ratios. Validation set extends to end of data.") + else: + logger.warning("Not enough data points remaining for test split. 
Test set will be empty.") + + logger.info(f"Single split calculated: Train=[{train_start}, {train_end}], Val=[{val_start}, {val_end}], Test=[{test_start}, {test_end}]") + # Assign calculated dates to the fold_dates tuple for unpacking + fold_dates = (train_start, train_end, val_start, val_end, test_start, test_end) + else: + logger.error("Cannot calculate single split: Raw data not loaded or has incorrect index type.") + raise SystemExit("Failed to calculate single data split.") + # --- Fix End --- + train_start, train_end, val_start, val_end, test_start, test_end = fold_dates - # Select data for the current fold (Train+Val+Test periods combined for initial processing) - # Handle single split case where val/test dates might be None + # Setup fold-specific directories using IOManager if available + if self.io: + # self.fold_dirs = self.io.setup_fold_dirs(self.current_fold) + self.fold_dirs = self.io.get_fold_dirs(self.current_fold) # Correct method name + else: + self.fold_dirs = {} # Set empty if no IOManager + logger.warning(f"Fold {self.current_fold}: IOManager not available, cannot create fold-specific directories.") + + # Select data for the current fold fold_start_date = train_start - fold_end_date = test_end if test_end is not None else train_end # Use train_end if single split - - # Ensure dates are timezone-aware if df_raw index is + fold_end_date = test_end if test_end is not None else train_end if self.df_raw.index.tz is not None: fold_start_date = fold_start_date.tz_localize(self.df_raw.index.tz) if fold_start_date.tz is None else fold_start_date fold_end_date = fold_end_date.tz_localize(self.df_raw.index.tz) if fold_end_date.tz is None else fold_end_date current_fold_data_raw = self.df_raw[fold_start_date:fold_end_date] if current_fold_data_raw.empty: - logger.warning(f"Fold {fold_count}: No raw data found for range {fold_start_date} to {fold_end_date}. Skipping fold.") + logger.warning(f"Fold {self.current_fold}: No raw data found for range {fold_start_date} to {fold_end_date}. Skipping fold.") continue - logger.info(f"Fold {fold_count}: Raw data range [{current_fold_data_raw.index.min()}, {current_fold_data_raw.index.max()}]") + logger.info(f"Fold {self.current_fold}: Raw data range [{current_fold_data_raw.index.min()}, {current_fold_data_raw.index.max()}]") # --- Run Pipeline Steps within the Fold --- # try: - # a. Engineer Features for the fold's raw data - # Assuming engineer_features operates on the passed df or resets internal state - # We need to manage the dataframe state carefully here. - # Let's pass the data explicitly for clarity. - df_engineered_fold = self.engineer_features(current_fold_data_raw) + # a. Engineer Features + df_engineered_fold = self.engineer_features(current_fold_data_raw) + if df_engineered_fold.empty: + raise SystemExit(f"Fold {self.current_fold}: Feature engineering resulted in empty dataframe.") - # b. Define Labels and Align for the fold - df_labeled_aligned_fold, target_dir_col_fold, target_cols = self.define_labels_and_align(df_engineered_fold) - self.target_dir_col = target_dir_col_fold # Store for use in split - self.target_columns = target_cols + # b. Define Labels and Align + df_labeled_aligned_fold, _, _ = self.define_labels_and_align(df_engineered_fold) + if df_labeled_aligned_fold.empty: + raise SystemExit(f"Fold {self.current_fold}: Label definition resulted in empty dataframe.") - # c. Split data *within* the fold (using dates or ratios) + # c. 
Split data self.split_data(df_labeled_aligned_fold, fold_dates) - # Now self.X_train_raw, self.y_train etc. hold data for *this fold* + # self.X_train_raw, self.y_train etc. now hold data for *this fold* - # d. Baseline Check (Moved earlier - uses fold's raw train data) - logger.info(f"Fold {fold_count}: Running baseline check on raw/engineered features...") - self.run_baseline_checks(self.X_train_raw, self.y_dir_train) - logger.info(f"Fold {fold_count}: Baseline checks passed.") + # d. Scale Features (MUST happen before baseline checks) + self.scale_features() # Updates self.X_*_scaled, self.scaler, raises SystemExit on fail - # e. Select/Prune Features (Moved earlier) - # Selects based on self.X_train_raw, Prunes self.X_train_scaled -> self.X_train_pruned ... - logger.info(f"Fold {fold_count}: Selecting features (raw) & preparing for pruning...") - self.select_and_prune_features() + # e. Baseline Check (NOW uses scaled data) + self.run_baseline_checks(self.current_fold) # Raises SystemExit on fail - # f. Scale Features (Now after baseline & selection) - logger.info(f"Fold {fold_count}: Scaling features...") - self.scale_features() + # f. Select Features (Whitelist determined, uses raw train data internally) + self.select_and_prune_features() # Updates self.final_whitelist, raises SystemExit on fail - # g. Prune Features (Now applied to scaled data using prior selection) - # This step is now effectively done *within* select_and_prune_features - logger.info(f"Fold {fold_count}: Features pruned based on prior selection.") + # g. Prune Features (NOW uses scaled data and final_whitelist) + logger.info(f"--- Calling Stage: Pruning Features for Fold {self.current_fold} --- ") + X_train_pruned, X_val_pruned, X_test_pruned = prune_features_fold( + X_train_scaled=self.X_train_scaled, + X_val_scaled=self.X_val_scaled, + X_test_scaled=self.X_test_scaled, + final_whitelist=self.final_whitelist, + feature_engineer=self.feature_engineer, + fold_num=self.current_fold + ) + # Update self state for pruned data + self.X_train_pruned = X_train_pruned + self.X_val_pruned = X_val_pruned + self.X_test_pruned = X_test_pruned + logger.info(f"Fold {self.current_fold}: Feature pruning stage complete.") - # h. Create Sequences for the fold (Uses pruned, scaled data) - logger.info(f"Fold {fold_count}: Creating sequences...") - self.create_sequences() # Uses self.X_train_pruned etc. -> self.X_train_seq ... + # h. Create Sequences (Uses pruned data) + self.create_sequences() # Uses self.X_*_pruned etc. -> self.X_*_seq ... - # i. Train/Load GRU Model for the fold - # TODO: Add Optuna sweep integration here if enabled - self.train_or_load_gru() # Uses self.X_train_seq etc. + # i. Train/Load GRU Model + self.train_or_load_gru() # Uses self.X_*_seq etc. - # j. Calibrate Probabilities for the fold - # TODO: Add rolling calibration logic here - self.calibrate_probabilities() # Uses self.X_val_seq etc. -> runs validation checks + # j. Calibrate Probabilities & Validate GRU + self.calibrate_probabilities() # Uses self.X_val_seq, runs validation checks, raises SystemExit on fail - # k. Train/Load SAC Agent for the fold + # j.2. 
Perform GRU Validation Checks (New Step) + # This uses the p_cal_val_for_check and y_dir_val_for_check stored by calibrate_probabilities + run_gru_validation_checks_fold( + config=self.config, + current_fold=self.current_fold, + p_cal_val=getattr(self, 'p_cal_val_for_check', None), # Use getattr for safety + y_dir_val=getattr(self, 'y_dir_val_for_check', None), + optimized_edge_threshold=self.optimized_edge_threshold, + use_ternary=self.use_ternary, + io=self.io + ) + # Note: run_gru_validation_checks_fold raises SystemExit on failure + + # k. Train/Load SAC Agent self.train_or_load_sac() # Uses artifacts from GRU step - # l. Run Backtest for the fold (on fold's test set) - # Note: Backtest gate failure no longer causes SystemExit immediately - self.run_backtest() # Uses self.X_test_seq etc. -> runs validation checks + # l. Run Backtest + self.run_backtest() # Uses self.X_test_seq, runs validation checks, raises SystemExit on fail # m. Persist Fold Artefacts & Store Metrics - # Example: self.io.save_fold_results(fold_count, self.backtest_metrics) - logger.info(f"Storing metrics for Fold {fold_count}") - # Store metrics regardless of backtest success/failure for aggregation + logger.info(f"Storing metrics for Fold {self.current_fold}") if self.backtest_metrics is not None: fold_metrics = self.backtest_metrics.copy() - fold_metrics['fold_number'] = fold_count + fold_metrics['fold_number'] = self.current_fold fold_metrics['train_start'] = train_start.isoformat() if train_start else None fold_metrics['train_end'] = train_end.isoformat() if train_end else None fold_metrics['val_start'] = val_start.isoformat() if val_start else None fold_metrics['val_end'] = val_end.isoformat() if val_end else None fold_metrics['test_start'] = test_start.isoformat() if test_start else None fold_metrics['test_end'] = test_end.isoformat() if test_end else None - # Add a status field based on whether metrics seem valid (e.g., Sharpe exists) - # Backtester should ideally return a status or use specific metrics keys - if 'Annualized Sharpe Ratio' in fold_metrics and not pd.isna(fold_metrics['Annualized Sharpe Ratio']): - fold_metrics['status'] = 'success' # Assume success if Sharpe exists + # Determine status based on metrics + if 'Annualized Sharpe Ratio' in fold_metrics and not pd.isna(fold_metrics['Annualized Sharpe Ratio']): + fold_metrics['status'] = 'success' else: - fold_metrics['status'] = 'failed_backtest' # Assume backtest failed if Sharpe is missing/NaN - all_fold_metrics.append(fold_metrics) + fold_metrics['status'] = 'failed_backtest' + self.all_fold_metrics.append(fold_metrics) else: - logger.warning(f"Fold {fold_count}: No backtest metrics generated to store.") - # Still store a failure record for aggregation count - all_fold_metrics.append({'fold_number': fold_count, 'status': 'failed_backtest', 'error': 'No metrics returned'}) + logger.warning(f"Fold {self.current_fold}: No backtest metrics generated to store.") + self.all_fold_metrics.append({'fold_number': self.current_fold, 'status': 'failed_backtest', 'error': 'No metrics returned'}) - # --- Store SAC Agent Path if Trained --- # + # Store SAC Agent Path if Trained Successfully for this fold if self.config.get('control', {}).get('train_sac', False) and self.sac_agent_load_path: - # Check if the path corresponds to a newly trained agent for this fold - # We assume self.sac_agent_load_path holds the path to the agent used/trained in this fold - if os.path.exists(self.sac_agent_load_path): - logger.info(f"Fold {self.current_fold}: Storing SAC agent 
path for aggregation: {self.sac_agent_load_path}") - all_successful_sac_agent_paths.append(self.sac_agent_load_path) - else: - logger.warning(f"Fold {self.current_fold}: Trained SAC agent path {self.sac_agent_load_path} not found after training. Cannot use for aggregation.") - # --- End Store --- # + # Assuming sac_agent_load_path points to the *newly* trained agent if training occurred + if hasattr(self, 'sac_trainer') and self.sac_trainer and self.sac_trainer.last_saved_agent_path == self.sac_agent_load_path: + if os.path.exists(self.sac_agent_load_path): + logger.info(f"Fold {self.current_fold}: Storing successfully trained SAC agent path for aggregation: {self.sac_agent_load_path}") + all_successful_sac_agent_paths.append(self.sac_agent_load_path) + else: + logger.warning(f"Fold {self.current_fold}: SAC training reported success, but path {self.sac_agent_load_path} not found.") + elif self.sac_agent_load_path: # If path exists but wasn't from training this fold + pass # Don't add loaded agents to the aggregation list unless explicitly intended - except SystemExit as e: # Catch exits from validation gates (Baseline, GRU) - logger.error(f"Fold {fold_count} failed validation gate: {e}. Skipping to next fold.") - # Store failure information? For now, just continue. - # Check if it's a Baseline or GRU gate failure based on message? - gate_type = 'gru_gate' if 'GRU validation' in str(e) else 'baseline_gate' - all_fold_metrics.append({'fold_number': fold_count, 'status': f'failed_{gate_type}', 'error': str(e)}) - continue + except SystemExit as e: + logger.error(f"Fold {self.current_fold} processing halted: {e}. Skipping to next fold.") + # Determine gate type for reporting + gate_type = 'unknown_gate' + if 'Baseline CI' in str(e) or 'BASELINE CHECK' in str(e): gate_type = 'baseline_gate' + elif 'GRU validation' in str(e) or 'Edge Acc' in str(e) or 'Brier' in str(e): gate_type = 'gru_gate' + elif 'Backtest failed' in str(e) or 'BACKTEST CHECK' in str(e) or 'Sharpe' in str(e): gate_type = 'backtest_gate' + elif 'split' in str(e).lower(): gate_type = 'split_error' + elif 'selection' in str(e).lower(): gate_type = 'selection_error' + elif 'scaling' in str(e).lower(): gate_type = 'scaling_error' + elif 'pruning' in str(e).lower(): gate_type = 'pruning_error' + elif 'sequence' in str(e).lower(): gate_type = 'sequence_error' + elif 'label' in str(e).lower(): gate_type = 'labeling_error' + elif 'feature eng' in str(e).lower(): gate_type = 'feature_eng_error' + self.all_fold_metrics.append({'fold_number': self.current_fold, 'status': f'failed_{gate_type}', 'error': str(e)}) + continue # Skip to next fold except Exception as e: - logger.error(f"Error processing Fold {fold_count}: {e}. Skipping fold.", exc_info=True) - all_fold_metrics.append({'fold_number': fold_count, 'status': 'error', 'error': str(e)}) - continue # Skip to the next fold on error + logger.error(f"Unexpected error processing Fold {self.current_fold}: {e}. Skipping fold.", exc_info=True) + self.all_fold_metrics.append({'fold_number': self.current_fold, 'status': 'error', 'error': str(e)}) + continue # Skip to next fold # --- End Fold Loop --- # - logger.info(f"=== Finished Processing Fold {fold_count} ===") + logger.info(f"=== Finished Processing Fold {self.current_fold} ===") # If only single split, break after first iteration if not self.config.get('walk_forward', {}).get('enabled', False): + logger.info("Single split processing complete. Exiting fold loop.") break - # 4. 
Aggregate Fold Metrics - release_decision_passed = False # Initialize decision to False - successful_fold_nums = [f.get('fold_number') for f in all_fold_metrics if f.get('status', 'success') == 'success'] - if all_fold_metrics: - self.aggregated_metrics = self.aggregate_fold_metrics(all_fold_metrics) + # 4. Aggregate Fold Metrics & Final Decision + release_decision_passed = False + if self.all_fold_metrics: + self.aggregated_metrics = self.aggregate_fold_metrics(self.all_fold_metrics) logger.info("--- Aggregated Walk-Forward Metrics --- ") - for key, value in self.aggregated_metrics.items(): - logger.info(f" {key}: {value}") - # Save aggregated metrics + # Use json dumps for pretty printing dict/nested dict + logger.info(json.dumps(self.aggregated_metrics, indent=2)) if self.io: self.io.save_json(self.aggregated_metrics, 'aggregated_wf_metrics', section='results') - - # 5. Make Final Release Decision (only if aggregation occurred) - release_decision_passed = self.final_release_decision(self.aggregated_metrics) - else: - logger.warning("No fold metrics to aggregate. Skipping aggregation and final decision.") - self.aggregated_metrics = {} # Ensure it's defined, even if empty - # 6. Log Final Status based on the decision + release_decision_passed = self.final_release_decision(self.aggregated_metrics) + else: + logger.warning("No fold metrics were generated. Skipping aggregation and final decision.") + self.aggregated_metrics = {} # Ensure defined + + # Log Final Status if release_decision_passed: - logger.info("--- Pipeline finished successfully and meets release criteria. ---") + logger.info(f"--- Pipeline Run {self.run_id} finished successfully and meets release criteria. ---") else: - logger.error("--- Pipeline finished but FAILED to meet release criteria. ---") + logger.error(f"--- Pipeline Run {self.run_id} finished but FAILED to meet release criteria. See aggregated metrics and logs. ---") - # 5. Aggregate SAC Agents (if enabled and successful folds exist) + # 5. Aggregate SAC Agents (Optional) if self.config.get('sac_aggregation', {}).get('enabled', False): if all_successful_sac_agent_paths: self.aggregate_sac_agents(all_successful_sac_agent_paths) @@ -2646,115 +968,561 @@ class TradingPipeline: # --- SAC Agent Aggregation --- # def aggregate_sac_agents(self, agent_paths: List[str]): """ - Aggregates SAC agents from a list of saved agent paths. - Currently supports averaging weights. + Aggregates SAC agents by calling the stage function. Args: agent_paths (List[str]): List of paths to the saved SAC agent directories - (e.g., [.../sac_agent_final, ...]). + from successful folds. """ - agg_cfg = self.config.get('sac_aggregation', {}) - # Enabled check is done before calling, but double-check - if not agg_cfg.get('enabled', False): - logger.info("SAC agent aggregation is disabled. Skipping.") + logger.info(f"--- Calling Stage: Aggregating SAC Agents ---") + # Call the stage function from modelling.py + aggregate_sac_agents( + config=self.config, + agent_paths=agent_paths, + current_run_models_dir=self.current_run_models_dir, + io=self.io + ) + logger.info(f"--- SAC Agent Aggregation Stage Call Complete ---") + + def select_and_prune_features(self): + """Performs feature selection for the fold by calling the stage function. + + Note: Pruning is handled in a separate step after scaling. 
+ """ + logger.info(f"--- Calling Stage: Selecting Features for Fold {self.current_fold} --- ") + + # Determine fold-specific models directory path + fold_models_dir = self.fold_dirs.get('models') + fold_results_dir = self.fold_dirs.get('results') # Pass results dir too, though not used by select currently + if not fold_models_dir: + logger.error(f"Fold {self.current_fold}: Cannot select features, fold models directory not set.") + raise SystemExit(f"Fold {self.current_fold}: Missing models directory for feature selection.") + + # Call the stage function to perform selection and save whitelist + # Note: This function raises SystemExit on failure + final_whitelist = select_features_fold( + X_train_raw=self.X_train_raw, + y_dir_train_ordinal=self.y_dir_train_ordinal, # Pass ordinal labels + feature_engineer=self.feature_engineer, + io=self.io, + run_id=self.run_id, + fold_num=self.current_fold, + fold_models_dir=fold_models_dir, + fold_results_dir=fold_results_dir, + main_run_models_dir=self.current_run_models_dir # <<< PASS the main run models dir + ) + + # Store the determined whitelist on self + self.final_whitelist = final_whitelist + + logger.info(f"Fold {self.current_fold}: Feature selection stage complete. Whitelist stored.") + # Pruning logic is removed from here. + + def scale_features(self): + """Scales features for the current fold by calling the stage function.""" + logger.info(f"--- Calling Stage: Scaling Features for Fold {self.current_fold} --- ") + + # Determine fold-specific models directory path + fold_models_dir = self.fold_dirs.get('models') + if not fold_models_dir: + logger.error(f"Fold {self.current_fold}: Cannot scale features, fold models directory not set.") + raise SystemExit(f"Fold {self.current_fold}: Missing models directory for scaling.") + + # Call the stage function + # Note: This function raises SystemExit on failure + X_train_scaled, X_val_scaled, X_test_scaled, used_scaler = scale_features_fold( + X_train_raw=self.X_train_raw, + X_val_raw=self.X_val_raw, + X_test_raw=self.X_test_raw, + run_id=self.run_id, + fold_num=self.current_fold, + fold_models_dir=fold_models_dir, + main_run_models_dir=self.current_run_models_dir, # <<< PASS the main run models dir + preloaded_scaler=self.scaler # Pass preloaded scaler if it exists (from previous fold or loaded model) + ) + + # Update self state + self.X_train_scaled = X_train_scaled + self.X_val_scaled = X_val_scaled + self.X_test_scaled = X_test_scaled + self.scaler = used_scaler # Store the scaler used (might be newly fitted or preloaded) + + logger.info(f"Fold {self.current_fold}: Feature scaling stage complete.") + + def create_sequences(self): + """Creates sequences from pruned features via stage function.""" + logger.info("--- Stage: Creating Sequences ---") + if self.X_train_pruned is None or self.y_train is None or \ + self.X_val_pruned is None or self.y_val is None or \ + self.X_test_pruned is None or self.y_test is None: + logger.error("Pruned training, validation, or test data/targets are missing. Cannot create sequences.") return - - if not agent_paths: - logger.warning("No SAC agent paths provided for aggregation. Skipping.") - return - - method = agg_cfg.get('method', 'average_weights') - if method != 'average_weights': - logger.warning(f"SAC aggregation method '{method}' is not implemented. 
Skipping.") - return - - logger.info(f"Starting SAC agent aggregation using method: {method} from {len(agent_paths)} agents.") - all_state_dicts = [] - loaded_agent_dims = {'state': None, 'action': None} + gru_cfg = self.config.get('gru', {}) + lookback = gru_cfg.get('lookback', 60) + use_ternary = gru_cfg.get('use_ternary', False) + drop_imputed = gru_cfg.get('drop_imputed_sequences', False) - # --- Load Fold Agents and Check Dimensions --- # - for agent_path in agent_paths: - if os.path.exists(agent_path): - logger.info(f" Loading agent from: {agent_path}") - try: - # Initialize a temporary agent to load into - # Get dimensions from metadata if possible, else use fallback - # Load metadata first - meta_path = os.path.join(agent_path, 'agent_metadata.json') - metadata = {} - if os.path.exists(meta_path): - with open(meta_path, 'r') as f: - metadata = json.load(f) - - # Determine state/action dims - state_dim = metadata.get('state_dim', 5) # Use 5 as fallback - action_dim = metadata.get('action_dim', 1) # Use 1 as fallback + # Determine target columns based on ternary setting + dir_key = 'dir3' if use_ternary else 'dir' + target_names = ['mu', dir_key] - # Check for dimension consistency - if loaded_agent_dims['state'] is None: - loaded_agent_dims['state'] = state_dim - loaded_agent_dims['action'] = action_dim - elif (loaded_agent_dims['state'] != state_dim or - loaded_agent_dims['action'] != action_dim): - logger.warning(f" Dimension mismatch! Agent {agent_path} has dims ({state_dim},{action_dim}), expected ({loaded_agent_dims['state']},{loaded_agent_dims['action']}). Skipping this agent.") - continue # Skip this agent + # Check if 'bar_imputed' exists in the dataframes + if 'bar_imputed' not in self.X_train_pruned.columns or \ + 'bar_imputed' not in self.X_val_pruned.columns or \ + 'bar_imputed' not in self.X_test_pruned.columns: + logger.error("'bar_imputed' column not found in pruned data. Cannot create sequences with imputed handling.") + # Decide whether to proceed without it or raise error + # For now, raising an error as it's required by the instructions + raise ValueError("'bar_imputed' column is missing from feature dataframes before sequence creation.") - agent_temp = SACTradingAgent(state_dim=state_dim, action_dim=action_dim, **self.sac_cfg) - loaded_meta_check = agent_temp.load(agent_path) # Load weights and metadata - if not loaded_meta_check: # Check if load method indicated failure - raise RuntimeError(f"Agent load method failed for {agent_path}") - all_state_dicts.append(agent_temp.get_state_dict()) # Get state dict - except Exception as e: - logger.warning(f" Failed to load or get state dict from agent at {agent_path}: {e}") - else: - logger.warning(f" Agent path not found: {agent_path}") - - if not all_state_dicts: - logger.error("Failed to load any valid SAC agents. 
Cannot aggregate.") - return + # Use the stage function for sequence creation + results_train = create_sequences_fold( + X_data=self.X_train_pruned, + y_data=self.y_train, + target_names=target_names, + lookback=lookback, + name="Train", + config=self.config, # Pass config for drop_imputed_sequences + io=self.io # Pass IOManager for artefact saving + ) + results_val = create_sequences_fold( + X_data=self.X_val_pruned, + y_data=self.y_val, + target_names=target_names, + lookback=lookback, + name="Validation", + config=self.config, + io=self.io + ) + results_test = create_sequences_fold( + X_data=self.X_test_pruned, + y_data=self.y_test, + target_names=target_names, + lookback=lookback, + name="Test", + config=self.config, + io=self.io + ) + + # Unpack results and store them + if results_train: + self.X_train_seq, self.y_train_seq_dict, self.train_seq_indices, _ = results_train + else: + logger.error("Failed to create training sequences.") + # Handle error appropriately - perhaps stop the pipeline + return + + if results_val: + self.X_val_seq, self.y_val_seq_dict, self.val_seq_indices, _ = results_val + else: + logger.error("Failed to create validation sequences.") + return - logger.info(f"Successfully loaded {len(all_state_dicts)} consistent SAC agent state dictionaries for aggregation.") + if results_test: + self.X_test_seq, self.y_test_seq_dict, self.test_seq_indices, _ = results_test + else: + logger.error("Failed to create test sequences.") + return + + logger.info("Sequence creation complete for all data splits.") + + def train_or_load_gru(self): + """Trains/loads GRU model for the fold via stage function & handles re-sequencing.""" + logger.info(f"--- Calling Stage: Training/Loading GRU for Fold {self.current_fold} ---") + + # Determine fold-specific models directory path + fold_models_dir = self.fold_dirs.get('models', self.current_run_models_dir) # Fallback to run models dir + if not fold_models_dir: + logger.error(f"Fold {self.current_fold}: Cannot train/load GRU, fold models directory not set.") + raise SystemExit(f"Fold {self.current_fold}: Missing models directory for GRU stage.") + + # Call the stage function + # Expects the stage function to handle re-pruning and re-sequencing internally + # if re-scaling occurs, and return the final sequences. 
+ ( + gru_model, gru_handler, + gru_model_run_id_loaded_from, + scaler_maybe_updated, + # Potentially updated sequences if re-scaling occurred inside stage fn + X_train_seq_new, y_train_seq_dict_new, train_indices_new, + X_val_seq_new, y_val_seq_dict_new, val_indices_new, + X_test_seq_new, y_test_seq_dict_new, test_indices_new + ) = train_or_load_gru_fold( + config=self.config, + run_id=self.run_id, # Pass current run_id (used for saving trained models) + current_fold=self.current_fold, # Pass current fold number + current_run_models_dir=fold_models_dir, # Pass fold-specific dir + base_models_dir_path=self.base_models_dir_path, + gru_handler=self.gru_handler, # Pass the handler instance + # Pass current sequences + X_train_seq=self.X_train_seq, + y_train_seq_dict=self.y_train_seq_dict, + X_val_seq=self.X_val_seq, + y_val_seq_dict=self.y_val_seq_dict, + X_test_seq=self.X_test_seq, # Pass test sequences too + y_test_seq_dict=self.y_test_seq_dict, + # Pass raw data needed for potential re-processing + X_train_raw=self.X_train_raw, + X_val_raw=self.X_val_raw, + X_test_raw=self.X_test_raw, + y_train=self.y_train, + y_val=self.y_val, + y_test=self.y_test, + # Remove pruned data - not expected by the stage function + # X_train_pruned=self.X_train_pruned, # Needed if re-sequencing happens + # X_val_pruned=self.X_val_pruned, + # X_test_pruned=self.X_test_pruned, + scaler=self.scaler, # Pass the current scaler + final_whitelist=self.final_whitelist, # Pass whitelist for potential re-pruning + feature_engineer=self.feature_engineer, # Pass FeatureEngineer instance + io=self.io + ) + + # Update self state with results from the stage function + self.gru_model = gru_model + self.gru_handler = gru_handler # Handler might be updated (e.g., after tuning) + self.gru_model_run_id_loaded_from = gru_model_run_id_loaded_from + self.scaler = scaler_maybe_updated # Scaler might have been loaded + + # Check if stage function returned updated sequences + # (Indicates re-scaling/re-pruning/re-sequencing occurred internally) + if X_train_seq_new is not None: + logger.info(f"Fold {self.current_fold}: GRU loading triggered internal re-processing. Updating sequence data.") + self.X_train_seq = X_train_seq_new + self.y_train_seq_dict = y_train_seq_dict_new + self.train_indices = train_indices_new + self.X_val_seq = X_val_seq_new + self.y_val_seq_dict = y_val_seq_dict_new + self.val_indices = val_indices_new + self.X_test_seq = X_test_seq_new + self.y_test_seq_dict = y_test_seq_dict_new + self.test_indices = test_indices_new + # else: sequences remain as they were before calling the stage function. + + logger.info(f"Fold {self.current_fold}: GRU Training/Loading stage complete. 
Model run ID: {self.gru_model_run_id_loaded_from}") + + def calibrate_probabilities(self): + """Calibrates GRU probs & optimizes edge threshold for the fold via stage function.""" + logger.info(f"--- Calling Stage: Calibrating Probabilities for Fold {self.current_fold} ---") + + # Call the stage function + ( + optimal_T, vector_cal_params, + optimized_edge_threshold, + p_cal_val_for_check, y_dir_val_for_check + ) = calibrate_probabilities_fold( + config=self.config, + current_fold=self.current_fold, + gru_model=self.gru_model, + gru_handler=self.gru_handler, + X_val_seq=self.X_val_seq, + y_val_seq_dict=self.y_val_seq_dict, + use_ternary=self.use_ternary, + calibrator=self.calibrator, + vector_calibrator=self.vector_calibrator, + fold_dirs=self.fold_dirs, + current_run_models_dir=self.current_run_models_dir, + run_id=self.run_id, # <<< PASS run_id + io=self.io + ) + + # Update self state with the results + self.optimal_T = optimal_T + self.vector_cal_params = vector_cal_params + self.optimized_edge_threshold = optimized_edge_threshold + # Store predictions/labels needed for the separate validation check stage + self.p_cal_val_for_check = p_cal_val_for_check + self.y_dir_val_for_check = y_dir_val_for_check + + # REMOVED: Internal call to _perform_gru_validation_checks. + # This check will be performed by a dedicated evaluation stage function later. + + logger.info(f"Fold {self.current_fold}: Probability calibration stage complete. Optimized Edge: {self.optimized_edge_threshold}") + + def train_or_load_sac(self): + """Trains/loads SAC agent for the fold via stage function.""" + logger.info(f"--- Calling Stage: Training/Loading SAC Agent for Fold {self.current_fold} ---") + # --- Corrected Config Lookup --- # + train_sac_flag = self.config.get('sac', {}).get('train_sac', False) # Correct path + # --- End Correction --- # + sac_trainer_instance = None # Initialize as None + + if train_sac_flag: + # --- Instantiate SACTrainer only if training --- # + # --- DEBUG: Check value of gru_model_run_id_loaded_from --- # + logger.info(f"DEBUG: Checking gru_model_run_id_loaded_from: {self.gru_model_run_id_loaded_from}") + # --- END DEBUG --- # + if self.gru_model_run_id_loaded_from is None: + logger.error(f"Fold {self.current_fold}: Cannot instantiate SACTrainer: GRU model run ID is not set. Aborting SAC stage.") + self.sac_agent_load_path = None # Ensure path is None + return # Skip stage + + logger.info(f"Fold {self.current_fold}: SAC training enabled. Instantiating SACTrainer...") + + # Determine Edge Threshold for SAC Trainer config + edge_threshold_for_sac = self.optimized_edge_threshold if self.optimized_edge_threshold is not None else \ + self.config.get('calibration', {}).get('edge_threshold', 0.1) + logger.info(f"Using edge threshold {edge_threshold_for_sac:.4f} for SAC Trainer (heuristic seeding / env info)...") + + # Prepare Config copy for SAC Trainer + sac_trainer_config = self.config.copy() + if 'calibration' not in sac_trainer_config: sac_trainer_config['calibration'] = {} + sac_trainer_config['calibration']['edge_threshold'] = edge_threshold_for_sac + if sac_trainer_config.get('calibration', {}).get('rolling_enabled', False): + logger.warning(f"Fold {self.current_fold}: SAC training enabled AND rolling calib enabled. 
Disabling rolling calib for SAC trainer config copy.") + sac_trainer_config['calibration']['rolling_enabled'] = False + + # Determine base dirs for SACTrainer + base_logs = self.dirs.get('logs') + if not base_logs or not os.path.isdir(base_logs): + # Use project root as fallback - assumes specific structure + project_root_guess = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) # ../.. from src/ + base_logs = os.path.join(project_root_guess, 'logs') + os.makedirs(base_logs, exist_ok=True) + logger.warning(f"Fold {self.current_fold}: Using fallback base logs dir for SACTrainer: {base_logs}") + + base_results = self.dirs.get('results') + if not base_results or not os.path.isdir(base_results): + project_root_guess = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + base_results = os.path.join(project_root_guess, 'results') + os.makedirs(base_results, exist_ok=True) + logger.warning(f"Fold {self.current_fold}: Using fallback base results dir for SACTrainer: {base_results}") - # --- Average Weights --- # - if method == 'average_weights': try: - avg_state_dict = OrderedDict() - keys = all_state_dicts[0].keys() - num_agents = len(all_state_dicts) - - for key in keys: - summed_tensor = torch.stack([sd[key] for sd in all_state_dicts], dim=0).sum(dim=0) - avg_state_dict[key] = summed_tensor / num_agents - - logger.info("Successfully averaged agent weights.") - - # --- Save Averaged Agent --- # - # Use the dimensions determined during loading - final_state_dim = loaded_agent_dims['state'] - final_action_dim = loaded_agent_dims['action'] - final_agent = SACTradingAgent(state_dim=final_state_dim, action_dim=final_action_dim, **self.sac_cfg) - final_agent.load_state_dict(avg_state_dict) - - save_dir_name = 'sac_agent_aggregated' - aggregated_agent_save_path = os.path.join(self.current_run_models_dir, save_dir_name) - os.makedirs(aggregated_agent_save_path, exist_ok=True) - - # Use the agent's save method - final_agent.save(aggregated_agent_save_path) - logger.info(f"Saved aggregated SAC agent to: {aggregated_agent_save_path}") - - agg_info = { - 'aggregation_method': method, - 'num_agents_aggregated': num_agents, - 'source_agent_paths': agent_paths, # Log the paths used - 'save_path': aggregated_agent_save_path - } - if self.io: - self.io.save_json(agg_info, 'sac_aggregation_info', section='results', use_txt=True) - + # Instantiate SACTrainer - ensure SACTrainer class is imported + from gru_sac_predictor.src.sac_trainer import SACTrainer # Ensure import + sac_trainer_instance = SACTrainer( + config=sac_trainer_config, + base_models_dir=self.base_models_dir_path, + base_logs_dir=base_logs, + base_results_dir=base_results + ) + self.sac_trainer = sac_trainer_instance # Store instance if needed later (e.g., aggregation) + except ImportError: + logger.error(f"Fold {self.current_fold}: Failed to import SACTrainer. 
Cannot instantiate trainer.") + self.sac_agent_load_path = None + return # Skip stage except Exception as e: - logger.error(f"Error during SAC agent weight averaging or saving: {e}", exc_info=True) + logger.error(f"Fold {self.current_fold}: Failed to instantiate SACTrainer: {e}", exc_info=True) + self.sac_agent_load_path = None + return # Skip stage + # --- End SACTrainer Instantiation --- + + # Call the stage function, passing the trainer instance (if created) or None + sac_agent_load_path_result = train_or_load_sac_fold( + config=self.config, # Pass original config to stage function + current_fold=self.current_fold, + gru_model_run_id_loaded_from=self.gru_model_run_id_loaded_from, + base_models_dir_path=self.base_models_dir_path, + sac_trainer=sac_trainer_instance, # Pass the instance or None + io=self.io + ) + + # Store the determined path on self + self.sac_agent_load_path = sac_agent_load_path_result + + logger.info(f"Fold {self.current_fold}: SAC Training/Loading stage completed in pipeline. Agent path set to: {self.sac_agent_load_path}") + + def run_backtest(self): + """Runs the backtest for the fold by calling the stage function.""" + logger.info(f"--- Calling Stage: Running Backtest for Fold {self.current_fold} ---") + + # Reset metrics for the fold + self.backtest_results_df = None + self.backtest_metrics = None + self.metrics_log_df = None + + # --- Gather necessary inputs from self state --- # + # Specifically handle raw predictions needed ONLY if rolling calibration is enabled + p_raw_test_input = None + logits_test_input = None + rolling_cal_enabled = self.config.get('calibration', {}).get('rolling_enabled', False) + + if rolling_cal_enabled: + logger.info(f"Fold {self.current_fold}: Rolling calibration enabled. Attempting to generate raw GRU predictions for test set.") + if self.gru_handler and self.X_test_seq is not None: + try: + if self.use_ternary: + logger.info("Generating raw logits for ternary case...") + logits_test_input = self.gru_handler.predict_logits(self.X_test_seq) + if logits_test_input is None: + logger.error("GRU handler failed to return logits.") + else: + logger.info(f"Generated logits_test with shape: {logits_test_input.shape}") + else: + logger.info("Generating raw probabilities P(up) for binary case...") + # Assuming predict returns tuple: (calibrated_probs, raw_probs_or_none, ...) 
+ # Or adjust based on actual gru_handler.predict signature if it only returns raw probs + preds_tuple = self.gru_handler.predict(self.X_test_seq) # This might need adjustment based on predict signature + # Check if predict returns raw probs directly or in a tuple + if isinstance(preds_tuple, tuple) and len(preds_tuple) >= 3 and preds_tuple[2] is not None: + # Assuming raw P(up) is the 3rd element as per previous logic + p_raw_test_input = preds_tuple[2].flatten() + logger.info(f"Generated p_raw_test with shape: {p_raw_test_input.shape}") + elif isinstance(preds_tuple, np.ndarray): # If predict *only* returns raw P(up) + p_raw_test_input = preds_tuple.flatten() + logger.info(f"Generated p_raw_test (direct return) with shape: {p_raw_test_input.shape}") + else: + # Fallback: Try predict_proba if available and predict doesn't give raw probs + if hasattr(self.gru_handler, 'predict_proba'): + logger.info("Using predict_proba as fallback for raw P(up)...") + p_raw_all_classes = self.gru_handler.predict_proba(self.X_test_seq) # Assumes returns (N, 2) + if p_raw_all_classes is not None and p_raw_all_classes.ndim == 2 and p_raw_all_classes.shape[1] == 2: + p_raw_test_input = p_raw_all_classes[:, 1] # Get P(up) + logger.info(f"Generated p_raw_test (from predict_proba) with shape: {p_raw_test_input.shape}") + else: + logger.error("GRU handler predict_proba did not return expected format.") + else: + logger.error("GRU handler predict method did not return expected raw probabilities, and predict_proba is not available.") + + except Exception as e: + logger.error(f"Error generating raw predictions via GRU handler: {e}", exc_info=True) + # Continue without raw predictions, but log the error + + # Final check if required inputs for rolling cal were obtained + if self.use_ternary and logits_test_input is None: + logger.error(f"Fold {self.current_fold}: Failed to get raw GRU logits needed for rolling calibration. Backtest cannot proceed with rolling cal.") + raise SystemExit(f"Fold {self.current_fold}: Missing raw GRU logits for rolling calibration.") + elif not self.use_ternary and p_raw_test_input is None: + logger.error(f"Fold {self.current_fold}: Failed to get raw GRU probabilities needed for rolling calibration. 
Backtest cannot proceed with rolling cal.") + raise SystemExit(f"Fold {self.current_fold}: Missing raw GRU probabilities P(up) for rolling calibration.") + # --- End raw prediction handling --- # + + try: + results_df, metrics_dict, metrics_log = run_backtest_fold( + config=self.config, + io=self.io, + current_fold=self.current_fold, + fold_dirs=self.fold_dirs, + sac_agent_load_path=self.sac_agent_load_path, + X_test_seq=self.X_test_seq, + y_test_seq_dict=self.y_test_seq_dict, + test_indices=self.test_indices, + df_test_original=self.df_test_original, + gru_handler=self.gru_handler, + calibrator=self.calibrator, + vector_calibrator=self.vector_calibrator, + initial_optimal_T=getattr(self, 'optimal_T', None), + initial_vector_params=getattr(self, 'vector_cal_params', None), + optimized_edge_threshold=self.optimized_edge_threshold, # Use the value stored from calibration + p_raw_test=p_raw_test_input, + logits_test=logits_test_input, + use_ternary=self.use_ternary + ) + + # Store results on self + self.backtest_results_df = results_df + self.backtest_metrics = metrics_dict + self.metrics_log_df = metrics_log + + logger.info(f"Fold {self.current_fold}: Backtest stage completed successfully.") + + except SystemExit as e: + logger.error(f"Fold {self.current_fold} failed backtest validation gates: {e}. Halting fold.") + self.backtest_metrics = None # Ensure metrics are None on failure + raise # Re-raise SystemExit to stop fold processing in execute() + except Exception as e: + logger.error(f"Fold {self.current_fold}: An unexpected error occurred during the backtest stage: {e}", exc_info=True) + self.backtest_metrics = None # Ensure metrics are None on failure + # Raise SystemExit to stop the fold + raise SystemExit(f"Fold {self.current_fold}: Unhandled exception in backtest stage.") from e + + # --- Walk-Forward Fold Generation --- # + def _generate_walk_forward_folds(self) -> Iterator[Tuple[pd.Timestamp, pd.Timestamp, pd.Timestamp, pd.Timestamp, pd.Timestamp, pd.Timestamp]]: + """Generates date ranges for walk-forward validation folds based on config. + + Yields: + tuple: (train_start, train_end, val_start, val_end, test_start, test_end) + Timestamps are timezone-naive initially. + """ + wf_config = self.config.get('walk_forward', {}) + if not wf_config.get('enabled', False): + logger.info("Walk-forward validation disabled. 
Performing single split based on ratios.") + # Yield None to signal single split mode to the caller + yield None + return + + # Ensure data is loaded to get the full date range + if self.df_raw is None or self.df_raw.empty: + logger.error("Cannot generate walk-forward folds: Raw data not loaded yet.") + raise SystemExit("Raw data must be loaded before generating walk-forward folds.") + if not isinstance(self.df_raw.index, pd.DatetimeIndex): + logger.error("Cannot generate walk-forward folds: Raw data index is not DatetimeIndex.") + raise SystemExit("Raw data index must be DatetimeIndex for walk-forward.") + + # Get parameters from config + train_days = wf_config.get('train_days', 365) + val_days = wf_config.get('val_days', 90) + test_days = wf_config.get('test_days', 30) + step_days = wf_config.get('step_days', 30) + initial_offset_days = wf_config.get('initial_offset_days', 0) # Days to skip at the start + + if not all([isinstance(d, int) and d > 0 for d in [train_days, val_days, test_days, step_days]]) or not isinstance(initial_offset_days, int) or initial_offset_days < 0: + logger.error("Invalid walk-forward parameters in config (days must be positive integers, offset non-negative). Exiting.") + raise SystemExit("Invalid walk_forward configuration.") + + train_delta = pd.Timedelta(days=train_days) + val_delta = pd.Timedelta(days=val_days) + test_delta = pd.Timedelta(days=test_days) + step_delta = pd.Timedelta(days=step_days) + offset_delta = pd.Timedelta(days=initial_offset_days) + + full_start_date = self.df_raw.index.min() + offset_delta + full_end_date = self.df_raw.index.max() + + # Calculate the end date of the first training period + first_train_end = full_start_date + train_delta + + current_val_start = first_train_end + fold_count = 0 + + logger.info("Generating Walk-Forward Folds:") + logger.info(f" Train={train_days}d, Val={val_days}d, Test={test_days}d, Step={step_days}d, Offset={initial_offset_days}d") + logger.info(f" Full Data Range Available: [{self.df_raw.index.min()}, {self.df_raw.index.max()}]" ) + logger.info(f" Starting after offset: {full_start_date}") + + while True: + # Define fold boundaries + train_start = current_val_start - train_delta + train_end = current_val_start # Train ends right before validation starts + val_start = current_val_start + val_end = val_start + val_delta + test_start = val_end + test_end = test_start + test_delta + + # Ensure we don't exceed the available data + if test_end > full_end_date: + # Adjust the last test period if it overshoots + test_end = full_end_date + # Optional: Check if the remaining test period is too short + min_test_period = pd.Timedelta(days=min(1, test_days // 2)) # Example minimum + if test_end - test_start < min_test_period: + logger.info(f"Remaining test period too short ({test_end - test_start}). Stopping fold generation.") + break # Stop if the last test period is too short + + # Check if validation period still has data + if val_start >= full_end_date: + logger.info("Validation start date reached end of data. 
Stopping fold generation.") + break + + fold_count += 1 + logger.info(f" Fold {fold_count}: Train=[{train_start}, {train_end}), Val=[{val_start}, {val_end}), Test=[{test_start}, {test_end}]") + yield train_start, train_end, val_start, val_end, test_start, test_end + + # Move to the next validation start + current_val_start += step_delta + + # Break condition if only training/validation is needed (test_days=0) + if test_days == 0 and val_end >= full_end_date: + logger.info("Reached end of data for training/validation folds (test_days=0).") + break + + if fold_count == 0: + logger.error("No valid walk-forward folds generated. Check data range and walk_forward config parameters.") + raise SystemExit("Failed to generate any walk-forward folds.") + + # --- End Walk-Forward Fold Generation --- # # --- Entry Point --- #