informer_model/scripts/prepare_btc_data.py
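
"""Prepare a BTC-USDT 5-minute feature dataset and log it to Weights & Biases.

Pipeline overview (see main() below): load 1-minute klines from SQLite,
resample to 5-minute bars, compute technical-indicator features, merge daily
external series (VIX, Fear & Greed, effective rates), then log the result as a
single parquet artifact.

Illustrative invocation (the flags exist below; the paths are placeholders,
not required values):

    python prepare_btc_data.py \
        --db-pattern "~/data/*.db" \
        --db-table combined_hist_1min \
        --vix-file data/vix_daily.csv \
        --fear-greed-file data/fear_greed_index.csv \
        --eff-rate-file data/DFF.csv \
        --wandb-project my-project
"""
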
import argparse
import glob
import logging
import os
import sqlite3
import pandas as pd
import pandas_ta as ta
import numpy as np
import wandb
import tempfile
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def load_data_from_db(db_path, table_name="klines"):
    """Loads data from a specific table in an SQLite database."""
    logging.info(f"Reading data from {db_path}, table '{table_name}'...")
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Check that the table exists before querying it
        cursor = conn.cursor()
        cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}';")
        if cursor.fetchone() is None:
            logging.warning(f"Table '{table_name}' not found in {db_path}. Skipping.")
            return None
        # Adjust column names if necessary based on your actual schema
        query = f"SELECT timestamp, open, high, low, close, volume FROM {table_name} WHERE instrument_id LIKE 'PAIR-BTC-%'"
        logging.info(f"Executing query: {query}")
        df = pd.read_sql_query(query, conn)
        logging.info(f"Read {len(df)} rows matching the criteria from {db_path}")
        if not df.empty:
            logging.info(f"Raw timestamp range: {df['timestamp'].min()} to {df['timestamp'].max()}")
    except sqlite3.Error as e:
        logging.error(f"Error reading database {db_path}: {e}")
        return None
    finally:
        if conn:
            conn.close()
    return df
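

# NOTE: the query above assumes a source table with at least these columns:
# timestamp (epoch seconds, per the unit='s' conversion in main), open, high,
# low, close, volume, and instrument_id (e.g. 'PAIR-BTC-USDT'). The exact
# schema of the input database is an assumption; adjust the query if yours
# differs. Illustrative shape only:
#   CREATE TABLE combined_hist_1min (
#       timestamp INTEGER, instrument_id TEXT,
#       open REAL, high REAL, low REAL, close REAL, volume REAL
#   );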


def calculate_features(df):
    """Calculates technical indicators and other features."""
    logging.info("Calculating base features...")
    # The DataFrame is already datetime-indexed and sorted in main().
    # Column names after the rename performed in main()
    open_col, high_col, low_col, close_col, vol_col = 'open_price', 'high_price', 'low_price', 'close_price', 'volume'
    # Drop rows with missing essential data before calculations
    df = df.dropna(subset=[open_col, high_col, low_col, close_col, vol_col])
    if df.empty:
        logging.warning("DataFrame is empty after dropping NaNs in essential columns.")
        return df
    # --- Basic Price Features ---
    df['open_to_close_price'] = df[close_col] / df[open_col] - 1
    df['high_to_close_price'] = df[high_col] / df[close_col] - 1
    df['low_to_close_price'] = df[low_col] / df[close_col] - 1
    df['high_to_low_price'] = df[high_col] / df[low_col] - 1
    # --- Returns ---
    # pct_change gives (close_t / close_{t-1}) - 1; the first row is NaN
    df['returns'] = df[close_col].pct_change()
    df['log_returns'] = np.log(df[close_col] / df[close_col].shift(1))
    # --- Time Features ---
    df['hour'] = df.index.hour.astype(str).astype("category")  # String/category as required by config
    df['weekday'] = df.index.weekday.astype(str).astype("category")
    # --- Technical Indicators using pandas_ta ---
    logging.info("Calculating technical indicators (this may take a while)...")
    custom_strategy = ta.Strategy(
        name="informer_features",
        description="Calculate features for Informer model based on config",
        ta=[
            # Volatility (length is a default choice; the config does not specify it)
            {"kind": "atr", "length": 14, "col_names": "atr"},
            # MACD
            {"kind": "macd", "fast": 12, "slow": 26, "signal": 9, "col_names": ("macd", "macd_hist", "macd_signal")},
            # RSI
            {"kind": "rsi", "length": 14, "col_names": "rsi"},
            # Bollinger Bands
            {"kind": "bbands", "length": 20, "std": 2, "col_names": ("low_bband", "mid_bband", "up_bband", "bandwidth", "percent")},
            # SMA (1h = 12 x 5min, 1d = 288 x 5min, 7d = 2016 x 5min)
            {"kind": "sma", "length": 12, "col_names": "sma_1h"},
            {"kind": "sma", "length": 288, "col_names": "sma_1d"},
            {"kind": "sma", "length": 2016, "col_names": "sma_7d"},
            # EMA (1h = 12 x 5min, 1d = 288 x 5min); config only lists ema_1h, ema_1d relative to close
            {"kind": "ema", "length": 12, "col_names": "ema_1h"},
            {"kind": "ema", "length": 288, "col_names": "ema_1d"},
        ]
    )
    df.ta.strategy(custom_strategy)
    # --- Volatility (using log returns, which is common for volatility calculation) ---
    df['vol_1h'] = df['log_returns'].rolling(window=12).std() * np.sqrt(12)      # Scaled 1h vol
    df['vol_1d'] = df['log_returns'].rolling(window=288).std() * np.sqrt(288)    # Scaled daily vol
    df['vol_7d'] = df['log_returns'].rolling(window=2016).std() * np.sqrt(2016)  # Scaled weekly vol
    # --- Relative Indicators (indicator / close_price - 1) ---
    logging.info("Calculating relative indicators...")
    for indicator in ['low_bband', 'mid_bband', 'up_bband', 'sma_1h', 'sma_1d', 'sma_7d', 'ema_1h', 'ema_1d']:
        if indicator in df.columns:
            df[f'{indicator}_to_close_price'] = df[indicator] / df[close_col] - 1
        else:
            logging.warning(f"Base indicator '{indicator}' not found for relative calculation.")
    # Intermediate columns (atr, macd_hist, bandwidth, percent, raw bband/sma/ema) could be dropped here if needed.
    # Initial NaNs from rolling windows/shifts are handled later: forward-fill
    # after merging external data, then dropna in main().
    return df
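

# NOTE: the longest lookback above is 2016 bars (sma_7d / vol_7d), i.e. roughly
# seven days of 5-minute data, so at least that many initial rows will contain
# NaNs and end up dropped by the final dropna() in main().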


def load_external_data(file_path, date_col, value_col, rename_to=None):
    """Loads external daily data like VIX or Fear/Greed Index."""
    logging.info(f"Loading external data from {file_path}...")
    try:
        df = pd.read_csv(file_path)
        df[date_col] = pd.to_datetime(df[date_col])
        # Keep only the date and value columns, renaming the value column
        df = df[[date_col, value_col]].rename(columns={value_col: rename_to or value_col})
        # Normalize the date index so every timestamp is midnight
        df = df.set_index(date_col).sort_index()
        df.index = df.index.normalize()
        logging.info(f"Loaded {len(df)} records from {file_path}. Index normalized.")
        return df
    except FileNotFoundError:
        logging.error(f"External data file not found: {file_path}")
        return None
    except Exception as e:
        logging.error(f"Error loading external data from {file_path}: {e}")
        return None
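

# The call sites in main() assume these CSV headers (an assumption about the
# source files; adjust if yours differ):
#   VIX:            date, close
#   Fear & Greed:   date, fng_value
#   Effective rate: observation_date, DFF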


def main(db_pattern, db_table, vix_file, fear_greed_file, eff_rate_file, args):
    """Main function to load, process, and save data."""
    db_files = glob.glob(os.path.expanduser(db_pattern), recursive=True)
    if not db_files:
        logging.error(f"No database files found matching pattern: {db_pattern}")
        return
    logging.info(f"Found {len(db_files)} database files.")
    all_data = []
    for db_file in db_files:
        df = load_data_from_db(db_file, table_name=db_table)
        if df is not None:
            all_data.append(df)
    if not all_data:
        logging.error("No data loaded from any database file.")
        return
    logging.info("Concatenating data from all databases...")
    btc_df = pd.concat(all_data, ignore_index=True)
    if not btc_df.empty:
        logging.info(f"Raw timestamp column info - dtype: {btc_df['timestamp'].dtype}, head:\n{btc_df['timestamp'].head()}")
    else:
        logging.warning("BTC DataFrame empty after concat, cannot check raw timestamp.")
    # --- Initial Processing ---
    # Convert timestamp (epoch seconds) to datetime and sort
    btc_df['datetime'] = pd.to_datetime(btc_df['timestamp'], unit='s')
    if not btc_df.empty:
        logging.info(f"Converted datetime range: {btc_df['datetime'].min()} to {btc_df['datetime'].max()}")
    else:
        logging.warning("BTC DataFrame is empty after concatenation, cannot check datetime range.")
    # Deduplicate on timestamp, keeping the first entry
    btc_df = btc_df.sort_values('datetime').drop_duplicates(subset=['timestamp'], keep='first')
    # Rename price columns to match the model config (done before feature calculation)
    rename_map = {'open': 'open_price', 'high': 'high_price', 'low': 'low_price', 'close': 'close_price'}
    btc_df = btc_df.rename(columns=rename_map)
    logging.info(f"Renamed columns: {rename_map}")
    # --- Set index and log info ---
    btc_df = btc_df.set_index('datetime').sort_index()
    if not btc_df.empty:
        logging.info(f"DataFrame index info - dtype: {btc_df.index.dtype}, timezone: {btc_df.index.tz}, range: {btc_df.index.min()} to {btc_df.index.max()}")
        logging.info(f"DataFrame head(1):\n{btc_df.head(1)}")
    else:
        logging.warning("BTC DataFrame empty after setting index.")
    logging.info(f"Total unique records after concatenation: {len(btc_df)}")
    # --- Resample to 5-minute intervals ---
    logging.info("Resampling 1-minute data to 5-minute intervals...")
    resampling_rules = {
        'open_price': 'first',
        'high_price': 'max',
        'low_price': 'min',
        'close_price': 'last',
        'volume': 'sum'
    }
    # Ensure the required columns exist before resampling
    missing_cols = [col for col in resampling_rules if col not in btc_df.columns]
    if missing_cols:
        logging.error(f"Cannot resample, required columns missing: {missing_cols}")
        return
    btc_df = btc_df[list(resampling_rules.keys())].resample('5T').agg(resampling_rules)
    # Drop rows where resampling produced all NaNs (e.g. gaps in the original data)
    btc_df.dropna(subset=['open_price', 'high_price', 'low_price', 'close_price'], inplace=True)
    logging.info(f"Resampled data shape: {btc_df.shape}")
    if not btc_df.empty:
        logging.info(f"Resampled index range: {btc_df.index.min()} to {btc_df.index.max()}")
        logging.info(f"Resampled head(1):\n{btc_df.head(1)}")
    else:
        logging.warning("DataFrame empty after resampling.")
        return  # Stop if empty after resampling
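    # By default resample('5T') labels each bar with the *start* of its
    # 5-minute interval; the close_time column added further down follows
    # that convention (index + 5 minutes).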
    # --- Feature Calculation ---
    # Operates on the 5-minute resampled data
    btc_df = calculate_features(btc_df)
    if btc_df.empty:
        logging.error("DataFrame became empty during feature calculation.")
        return
    # --- Load and Merge External Data ---
    # VIX data (daily)
    vix_df = load_external_data(vix_file, date_col='date', value_col='close', rename_to='vix_close_price')
    # Fear & Greed data (daily)
    fg_df = load_external_data(fear_greed_file, date_col='date', value_col='fng_value', rename_to='fear_greed_index')
    # Effective rates data (daily)
    eff_rates_df = load_external_data(eff_rate_file, date_col='observation_date', value_col='DFF', rename_to='effective_rates')
    # Log external data index info and timezones
    if vix_df is not None:
        logging.info(f"VIX index info - dtype: {vix_df.index.dtype}, timezone: {vix_df.index.tz}, range: {vix_df.index.min()} to {vix_df.index.max()}")
    if fg_df is not None:
        logging.info(f"F&G index info - dtype: {fg_df.index.dtype}, timezone: {fg_df.index.tz}, range: {fg_df.index.min()} to {fg_df.index.max()}")
    if eff_rates_df is not None:
        logging.info(f"EffRates index info - dtype: {eff_rates_df.index.dtype}, timezone: {eff_rates_df.index.tz}, range: {eff_rates_df.index.min()} to {eff_rates_df.index.max()}")
    # Log external data available at or before the first BTC timestamp
    if not btc_df.empty:
        first_btc_time = btc_df.index.min()
        logging.info(f"First BTC timestamp: {first_btc_time}")
        if vix_df is not None:
            logging.info(f"VIX data at/before start:\n{vix_df[vix_df.index <= first_btc_time].tail()}")
        if fg_df is not None:
            logging.info(f"F&G data at/before start:\n{fg_df[fg_df.index <= first_btc_time].tail()}")
        if eff_rates_df is not None:
            logging.info(f"EffRates data at/before start:\n{eff_rates_df[eff_rates_df.index <= first_btc_time].tail()}")
    # --- Merge external data with merge_asof on the DatetimeIndex ---
    logging.info("Performing merge_asof based on DatetimeIndex...")
    # Ensure DataFrames are sorted by index (should be already, but explicit is safer)
    btc_df = btc_df.sort_index()
    if vix_df is not None:
        vix_df = vix_df.sort_index()
    if fg_df is not None:
        fg_df = fg_df.sort_index()
    if eff_rates_df is not None:
        eff_rates_df = eff_rates_df.sort_index()
    if vix_df is not None:
        btc_df = pd.merge_asof(btc_df, vix_df, left_index=True, right_index=True, direction='backward')
        logging.info(f"Shape after VIX merge_asof: {btc_df.shape}, VIX NaNs: {btc_df['vix_close_price'].isna().sum()}")
    if fg_df is not None:
        btc_df = pd.merge_asof(btc_df, fg_df, left_index=True, right_index=True, direction='backward')
        logging.info(f"Shape after F&G merge_asof: {btc_df.shape}, F&G NaNs: {btc_df['fear_greed_index'].isna().sum()}")
    if eff_rates_df is not None:
        btc_df = pd.merge_asof(btc_df, eff_rates_df, left_index=True, right_index=True, direction='backward')
        logging.info(f"Shape after EffRates merge_asof: {btc_df.shape}, EffRates NaNs: {btc_df['effective_rates'].isna().sum()}")
    logging.info("Finished merge_asof operations.")
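    # direction='backward' assigns each 5-minute bar the most recent daily
    # value at or before its timestamp, so the merge never pulls in a later
    # external observation (no lookahead).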
    # Log state after merging external data
    logging.info(f"BTC data after merge - Shape: {btc_df.shape}, Null counts:\n{btc_df.isna().sum().sort_values(ascending=False).head()}")
    # --- Final Preparations ---
    logging.info("Performing final data preparation steps...")
    # Add required columns not generated yet
    btc_df['group_id'] = "BTC-USDT"  # Static group ID for this dataset
    btc_df['group_id'] = btc_df['group_id'].astype("category")
    # Create the sequential time index required by pytorch-forecasting
    btc_df = btc_df.sort_index()  # Ensure sorted before creating the index
    # Add close_time: the index marks the start of each 5-minute interval,
    # so the close time is 5 minutes later.
    btc_df['close_time'] = btc_df.index + pd.Timedelta(minutes=5)
    logging.info(f"Added 'close_time' column. Head:\n{btc_df['close_time'].head()}")
    btc_df = btc_df.reset_index()  # Bring datetime back as a column temporarily
    btc_df['time_index'] = btc_df.index  # Sequential integer index
    # Final columns based on the YAML config; names must match the generated features exactly
    final_columns = [
        "time_index", "group_id", "returns",  # Core fields
        "close_time",
        # dynamic_unknown_real
        "high_price", "low_price", "open_price", "close_price", "volume",
        "open_to_close_price", "high_to_close_price", "low_to_close_price", "high_to_low_price",
        "log_returns", "vol_1h", "macd", "macd_signal", "rsi",
        "low_bband_to_close_price", "up_bband_to_close_price", "mid_bband_to_close_price",
        "sma_1h_to_close_price", "sma_1d_to_close_price", "sma_7d_to_close_price",
        "ema_1h_to_close_price", "ema_1d_to_close_price",
        # dynamic_known_real (only present if the corresponding external merges succeeded)
        "vix_close_price", "fear_greed_index", "vol_1d", "vol_7d", "effective_rates",
        # dynamic_known_cat
        "hour", "weekday"
    ]
    # Select and reorder columns, handling potentially missing external columns
    cols_to_select = []
    for col in final_columns:
        if col in btc_df.columns:
            cols_to_select.append(col)
        else:
            logging.warning(f"Required column '{col}' not found in DataFrame. It will be excluded.")
    final_df = btc_df[cols_to_select]
    # --- Handle Missing Values ---
    # Forward fill is common for time series, especially after merges and indicator calculations.
    # Note: ffill is questionable for returns, but the initial NaNs in returns are expected.
    logging.info(f"Forward filling NaNs. Initial NaN count:\n{final_df.isna().sum().sort_values(ascending=False).head()}")
    final_df = final_df.ffill()
    # Drop any rows that *still* have NaNs (e.g. at the very beginning, before the
    # first external data point or before indicator windows are filled).
    initial_rows = len(final_df)
    # A less aggressive alternative would drop only rows missing critical columns
    # (OHLCV and returns); here rows with ANY remaining NaN value are dropped.
    logging.info("Dropping rows with any NaN values.")
    final_df = final_df.dropna()
    rows_dropped = initial_rows - len(final_df)
    if rows_dropped > 0:
        logging.warning(f"Dropped {rows_dropped} rows containing NaNs after forward filling.")
    # Final check
    if final_df.isna().any().any():
        logging.warning(f"NaN values still present after processing:\n{final_df.isna().sum()[final_df.isna().sum() > 0]}")
    else:
        logging.info("No remaining NaN values detected.")
    if final_df.empty:
        logging.error("Final DataFrame is empty after processing and NaN handling.")
        return
    # Data splitting and per-split (in-sample / out-of-sample) artifacts were
    # removed: the full dataset is logged as a single artifact and any split
    # is expected to happen downstream.
    # --- Log the full dataset as a single W&B artifact ---
    logging.info(f"Logging full dataset artifact to W&B project '{wandb.run.project}', run '{wandb.run.name}'...")
    try:
        with tempfile.TemporaryDirectory() as tempdir:
            # Save the entire final_df
            full_data_path = os.path.join(tempdir, 'full_data.parquet')
            final_df.to_parquet(full_data_path, index=False)
            logging.info(f"Temporary file saved to {tempdir}")
            # Create and log the single artifact
            full_artifact = wandb.Artifact(
                name=args.full_dataset_artifact_name,
                type='dataset',
                description=f'Full BTC 5min features data ({len(final_df)} rows). Prepared by run {wandb.run.id}.',
                metadata={'rows': len(final_df)}
            )
            full_artifact.add_file(full_data_path)
            wandb.log_artifact(full_artifact)
            logging.info(f"Logged full dataset artifact: {args.full_dataset_artifact_name}")
            logging.info("Artifact logged successfully.")
    except Exception as e:
        logging.error(f"Error logging artifacts to W&B: {e}")
        wandb.run.finish(exit_code=1)  # Finish run with error
        return
    wandb.run.finish()  # Finish run successfully
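

# Downstream consumption sketch (illustrative, not part of this script): a
# training run could pull the logged dataset roughly like this, assuming the
# same project and the default artifact name:
#
#     run = wandb.init(project="wne-masters-thesis-testing", job_type="training")
#     artifact_dir = run.use_artifact("btc-5m-features-full:latest").download()
#     df = pd.read_parquet(os.path.join(artifact_dir, "full_data.parquet"))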


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare BTC-USDT 5-minute data and log to W&B.")
    parser.add_argument(
        "--db-pattern",
        default="/home/yasha/develop/data/combined.coinbase_1min_hist.db",
        help="Pattern or exact path to find input SQLite database file(s)."
    )
    parser.add_argument(
        "--db-table",
        default="combined_hist_1min",
        help="Name of the table containing kline data within the SQLite files."
    )
    parser.add_argument(
        "--vix-file",
        default="data/vix_daily.csv",
        help="Path to the VIX index CSV file."
    )
    parser.add_argument(
        "--fear-greed-file",
        default="data/fear_greed_index.csv",
        help="Path to the Crypto Fear & Greed Index CSV file."
    )
    parser.add_argument(
        "--eff-rate-file",
        default="data/DFF.csv",
        help="Path to the effective federal funds rate (DFF) CSV file."
    )
    parser.add_argument(
        "--wandb-project",
        default="wne-masters-thesis-testing",
        help="W&B project name."
    )
    parser.add_argument(
        "--wandb-run-name",
        default="prepare-btc-data",
        help="W&B run name for this preparation job."
    )
    parser.add_argument(
        "--wandb-notes",
        default=None,
        help="Optional notes for the W&B run."
    )
    parser.add_argument(
        "--full-dataset-artifact-name",
        default="btc-5m-features-full",  # Match the YAML default
        help="Name for the single W&B artifact containing the full dataset."
    )
    args = parser.parse_args()
    # --- Initialize the W&B run ---
    run = wandb.init(
        project=args.wandb_project,
        name=args.wandb_run_name,
        notes=args.wandb_notes,
        job_type="data-preparation",
        config=vars(args)  # Log command line args
    )
    main(
        db_pattern=args.db_pattern,
        db_table=args.db_table,
        vix_file=args.vix_file,
        fear_greed_file=args.fear_greed_file,
        eff_rate_file=args.eff_rate_file,
        args=args  # Pass all args for artifact names etc.
    )