# market_predictor/market_predictor/analysis.py

"""
Market Prediction Analysis Module
=================================

This module provides tools for analyzing market prediction performance and generating visualizations.

Features
--------
- Prediction accuracy analysis
- VWAP and price comparison
- Performance metrics calculation
- Time-series visualizations
- Error analysis

Example Usage
-------------
```python
analyzer = PredictionAnalyzer("predictions.csv")
analyzer.plot_accuracy_over_time()
analyzer.plot_hourly_performance()
analyzer.plot_returns_distribution()
```
"""

import argparse
from datetime import datetime
from pathlib import Path
from typing import Union

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

class PredictionAnalyzer:
    """
    Analyze and visualize the quality of market predictions.

    Loads a CSV of per-prediction records, derives running metrics from it,
    and offers plotting helpers that compare predicted and actual VWAP and
    price values. Every ``plot_*`` method draws onto a new matplotlib figure
    and leaves it as the current figure; saving, showing and closing that
    figure is left to the caller.

    Attributes
    ----------
    df : pd.DataFrame
        Predictions data. ``timestamp_prediction`` is a datetime column (it
        is not the index). ``cumulative_return`` and ``rolling_accuracy`` are
        added on construction; ``vwap_rmse`` and ``price_rmse`` are added by
        ``plot_hourly_performance``.

    Methods
    -------
    plot_accuracy_over_time()
        Plots prediction accuracy trends
    plot_confusion_matrix()
        Plots prediction confusion matrix and VWAP change error histogram
    plot_returns_distribution()
        Plots return distribution analysis
    plot_hourly_performance()
        Plots hourly performance metrics
    plot_vwap_changes_comparison(window_size=20)
        Plots rolling predicted vs actual VWAP changes
    generate_report()
        Prints and returns a text performance report
    """

    def __init__(self, predictions_file: Union[str, Path]) -> None:
        """
        Initialize analyzer with predictions data.

        Parameters
        ----------
        predictions_file : str or Path
            Path to CSV file containing predictions data. It must provide at
            least the ``timestamp_prediction``, ``actual_return`` and
            ``prediction_correct`` columns; the plotting and report methods
            read further columns.
        """
        self.df = pd.read_csv(predictions_file)
        self.df['timestamp_prediction'] = pd.to_datetime(self.df['timestamp_prediction'])
        self._calculate_metrics()

    def _calculate_metrics(self) -> None:
        """
        Add derived metric columns to ``self.df``.

        Computes:
        - ``cumulative_return``: running sum of ``actual_return``
        - ``rolling_accuracy``: mean of ``prediction_correct`` over the last
          20 rows (fewer at the start, since ``min_periods=1``)
        """
        self.df['cumulative_return'] = self.df['actual_return'].cumsum()
        self.df['rolling_accuracy'] = (
            self.df['prediction_correct'].rolling(20, min_periods=1).mean()
        )

    @staticmethod
    def _rolling_rmse(actual: pd.Series, predicted: pd.Series, window: int = 20) -> pd.Series:
        """
        Return the rolling root-mean-square error between two series.

        The squared errors are averaged over the window *before* the square
        root is taken. Taking the root of each squared error first would
        yield a rolling mean absolute error instead.

        Parameters
        ----------
        actual, predicted : pd.Series
            Observed and predicted values, aligned on the same index.
        window : int
            Number of rows per rolling window; the first ``window - 1``
            results are NaN.
        """
        squared_error = (actual - predicted) ** 2
        return np.sqrt(squared_error.rolling(window=window).mean())

    def _hourly_metrics(self) -> pd.DataFrame:
        """
        Aggregate ``self.df`` into one row per hour of ``timestamp_prediction``.

        Named aggregation keeps the result's columns flat, so every column
        selects as a plain Series. Expects ``vwap_rmse`` and ``price_rmse``
        to already be present in ``self.df``.
        """
        # pd.offsets.Hour() avoids the deprecated 'H' frequency alias.
        by_hour = self.df.groupby(
            pd.Grouper(key='timestamp_prediction', freq=pd.offsets.Hour())
        )
        return by_hour.agg(
            prediction_correct=('prediction_correct', 'mean'),
            actual_return_mean=('actual_return', 'mean'),
            actual_return_sum=('actual_return', 'sum'),
            actual_vwap_change=('actual_vwap_change', 'mean'),
            predicted_vwap=('predicted_vwap', 'mean'),
            actual_next_vwap=('actual_next_vwap', 'mean'),
            predicted_price=('predicted_price', 'mean'),
            actual_next_price=('actual_next_price', 'mean'),
            vwap_rmse=('vwap_rmse', 'mean'),
            price_rmse=('price_rmse', 'mean'),
        ).reset_index()

    def plot_accuracy_over_time(self) -> None:
        """
        Plot prediction accuracy trends over time.

        Generates:
        - Direction accuracy plot (20-row rolling mean of
          ``prediction_correct``; the first 19 points are NaN)
        - VWAP change comparison plot (actual vs expected)
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

        # Direction accuracy
        ax1.plot(self.df['timestamp_prediction'],
                 self.df['prediction_correct'].rolling(20).mean(),
                 label='Direction Accuracy', color='blue')
        ax1.set_title('Prediction Accuracy Over Time')
        ax1.set_ylabel('Direction Accuracy')
        ax1.grid(True, alpha=0.3)
        ax1.legend()

        # Magnitude accuracy
        ax2.plot(self.df['timestamp_prediction'],
                 self.df['actual_vwap_change'],
                 label='Actual VWAP Change', alpha=0.6)
        ax2.plot(self.df['timestamp_prediction'],
                 self.df['expected_vwap_change'],
                 label='Predicted VWAP Change', alpha=0.6)
        ax2.set_title('VWAP Change Prediction vs Actual')
        ax2.set_ylabel('VWAP Change %')
        ax2.grid(True, alpha=0.3)
        ax2.legend()

        plt.tight_layout()

    def plot_confusion_matrix(self) -> None:
        """
        Plot the direction confusion matrix and the VWAP change error histogram.

        The matrix compares ``actual_movement`` (rows) against
        ``vwap_direction_next_5min`` (columns).
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Direction confusion matrix
        # NOTE(review): the tick labels assume exactly two classes whose
        # sorted order is down, up - confirm against the CSV's label values.
        cm_direction = confusion_matrix(
            self.df['actual_movement'],
            self.df['vwap_direction_next_5min']
        )
        sns.heatmap(cm_direction, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['down', 'up'],
                    yticklabels=['down', 'up'],
                    ax=ax1)
        ax1.set_title('Direction Prediction Matrix')
        # confusion_matrix puts true labels on rows and predictions on columns.
        ax1.set_xlabel('Predicted')
        ax1.set_ylabel('Actual')

        # Magnitude error distribution
        magnitude_error = self.df['actual_vwap_change'] - self.df['expected_vwap_change']
        sns.histplot(magnitude_error, bins=50, ax=ax2)
        ax2.set_title('VWAP Change Prediction Error')
        ax2.set_xlabel('Error (Actual - Predicted)')

        plt.tight_layout()

    def plot_returns_distribution(self) -> None:
        """
        Plot how returns and VWAP/price changes relate to prediction correctness.

        Generates:
        - Histogram of ``actual_return`` split by ``prediction_correct``
        - Scatter of ``actual_vwap_change`` vs ``actual_price_change``
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Returns by prediction accuracy
        sns.histplot(data=self.df, x='actual_return',
                     hue='prediction_correct', bins=50, ax=ax1)
        ax1.set_title('Return Distribution by Direction Accuracy')

        # VWAP vs Price changes
        sns.scatterplot(data=self.df,
                        x='actual_vwap_change',
                        y='actual_price_change',
                        hue='prediction_correct',
                        alpha=0.6,
                        ax=ax2)
        ax2.set_title('VWAP vs Price Changes')
        ax2.set_xlabel('VWAP Change')
        ax2.set_ylabel('Price Change')

        plt.tight_layout()

    def plot_hourly_performance(self) -> None:
        """
        Plot hourly direction accuracy, VWAP/price comparison and rolling RMSE.

        Side effect: stores the 20-row rolling RMSE of the VWAP and price
        predictions in ``self.df['vwap_rmse']`` and ``self.df['price_rmse']``.
        """
        # Calculate RMSE for both VWAP and price
        self.df['vwap_rmse'] = self._rolling_rmse(
            self.df['actual_next_vwap'], self.df['predicted_vwap'], window=20
        )
        self.df['price_rmse'] = self._rolling_rmse(
            self.df['actual_next_price'], self.df['predicted_price'], window=20
        )

        # Group metrics by hour
        hourly_metrics = self._hourly_metrics()
        hours = hourly_metrics['timestamp_prediction']

        fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(15, 20))

        # Plot 1: Direction Accuracy
        ax1.plot(hours, hourly_metrics['prediction_correct'],
                 marker='o', linestyle='-')
        ax1.set_title('Direction Prediction Accuracy')
        ax1.set_ylabel('Accuracy')
        ax1.grid(True, alpha=0.3)

        # Plot 2: VWAP Comparison
        ax2.plot(hours, hourly_metrics['predicted_vwap'],
                 label='Predicted VWAP', color='blue')
        ax2.plot(hours, hourly_metrics['actual_next_vwap'],
                 label='Actual VWAP', color='red')
        ax2.set_title('Predicted vs Actual VWAP')
        ax2.set_ylabel('VWAP Value')
        ax2.grid(True, alpha=0.3)
        ax2.legend()

        # Plot 3: Price Comparison
        ax3.plot(hours, hourly_metrics['predicted_price'],
                 label='Predicted Price', color='green')
        ax3.plot(hours, hourly_metrics['actual_next_price'],
                 label='Actual Price', color='orange')
        ax3.set_title('Predicted vs Actual Price')
        ax3.set_ylabel('Price Value')
        ax3.grid(True, alpha=0.3)
        ax3.legend()

        # Plot 4: Rolling RMSE Comparison
        ax4.plot(hours, hourly_metrics['vwap_rmse'],
                 label='VWAP RMSE', color='purple')
        ax4.plot(hours, hourly_metrics['price_rmse'],
                 label='Price RMSE', color='brown')
        ax4.set_title('Prediction RMSE (20-period rolling)')
        ax4.set_ylabel('RMSE')
        ax4.grid(True, alpha=0.3)
        ax4.legend()

        # Format datetime x-axis
        for ax in [ax1, ax2, ax3, ax4]:
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
            ax.xaxis.set_major_locator(mdates.HourLocator(interval=4))
            plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

        plt.tight_layout()

    def plot_vwap_changes_comparison(self, window_size: int = 20) -> None:
        """
        Plot rolling average of predicted vs actual VWAP changes.

        Parameters
        ----------
        window_size : int
            Number of rows in each rolling-mean window; the first
            ``window_size - 1`` points of both lines are NaN.
        """
        plt.figure(figsize=(15, 6))

        # Calculate rolling averages
        actual_rolling = self.df['actual_vwap_change'].rolling(window=window_size).mean()
        predicted_rolling = self.df['expected_vwap_change'].rolling(window=window_size).mean()

        # Plot both lines
        plt.plot(self.df['timestamp_prediction'], actual_rolling,
                 label='Actual VWAP Change', color='blue', alpha=0.7)
        plt.plot(self.df['timestamp_prediction'], predicted_rolling,
                 label='Predicted VWAP Change', color='red', alpha=0.7)

        plt.title(f'Predicted vs Actual VWAP Changes ({window_size}-period rolling average)')
        plt.xlabel('Time')
        plt.ylabel('VWAP Change %')
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.xticks(rotation=45)
        plt.tight_layout()

    def generate_report(self) -> str:
        """
        Build, print and return a plain-text performance report.

        The report lists the number of predictions, the mean of
        ``prediction_correct``, the mean and sum of ``actual_return``, and a
        scikit-learn classification report of ``vwap_direction_next_5min``
        against ``actual_movement``.

        Returns
        -------
        str
            The same text that is printed to stdout.
        """
        report = (
            f"\nPerformance Metrics:\n"
            f"Total Predictions: {len(self.df)}\n"
            f"Overall Accuracy: {self.df['prediction_correct'].mean():.2%}\n"
            f"Mean Return: {self.df['actual_return'].mean():.4f}\n"
            f"Cumulative Return: {self.df['actual_return'].sum():.4f}\n\n"
            f"Classification Report:\n"
            f"{classification_report(self.df['actual_movement'], self.df['vwap_direction_next_5min'])}"
        )
        print(report)
        return report

def main(argv=None):
    """
    Command-line entry point: print the report and save every plot as a PNG.

    Parameters
    ----------
    argv : list of str, optional
        Argument list to parse instead of ``sys.argv[1:]``. Defaults to
        ``None``, in which case argparse reads the process arguments.
    """
    parser = argparse.ArgumentParser(
        description='Analyze market prediction results and save diagnostic plots.'
    )
    parser.add_argument('predictions_file', help='Path to predictions CSV')
    parser.add_argument('--output-dir', default='analysis_output',
                        help='Directory for output plots')
    args = parser.parse_args(argv)

    # Create output directory; parents=True lets nested paths such as
    # "out/run1" work even when "out" does not exist yet.
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Analyze predictions
    analyzer = PredictionAnalyzer(args.predictions_file)
    analyzer.generate_report()

    # Each entry pairs a plotting call with the file name its figure is saved under.
    plots = (
        (analyzer.plot_accuracy_over_time, 'accuracy_over_time.png'),
        (analyzer.plot_confusion_matrix, 'confusion_matrix.png'),
        (analyzer.plot_returns_distribution, 'returns_distribution.png'),
        (analyzer.plot_hourly_performance, 'hourly_performance.png'),
        (lambda: analyzer.plot_vwap_changes_comparison(window_size=20),
         'vwap_changes_comparison.png'),
    )

    # Generate and save plots
    for draw, filename in plots:
        draw()
        try:
            plt.savefig(output_dir / filename)
        finally:
            # Close the figure even when saving fails so figures do not pile up.
            plt.close()

    # No plt.show() here: every figure has been closed by this point, so
    # there would be nothing to display.

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()