added new ft model
This commit is contained in:
parent
5dcab576c3
commit
7b1f8c6d9a
@ -9,7 +9,7 @@ if not OPENAI_API_KEY:
|
||||
raise ValueError("OpenAI API key not found in environment variables")
|
||||
|
||||
# Model Configuration
|
||||
MODEL_NAME = "ft:gpt-4o-mini-2024-07-18:yasha-sheynin::Awacdfg6"
|
||||
MODEL_NAME = 'ft:gpt-4o-mini-2024-07-18:yasha-sheynin::AwgWhL48' #"gpt-4o-2024-08-06" #"ft:gpt-4o-mini-2024-07-18:yasha-sheynin::Awacdfg6"
|
||||
|
||||
# RAG Configuration
|
||||
VECTOR_STORE_TYPE = "faiss"
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
import sys
|
||||
import os
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
@ -5,213 +7,149 @@ import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from openai import OpenAI
|
||||
from typing import List, Dict
|
||||
from .config import OPENAI_API_KEY
|
||||
from collections import Counter
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from main import analyze_market_data
|
||||
from .market_data_fetcher import MarketDataFetcher
|
||||
from .data_processor import MarketDataProcessor
|
||||
|
||||
from .config import OPENAI_API_KEY
|
||||
from .rag_engine import RAGEngine
|
||||
|
||||
class FineTuneDatasetGenerator:
|
||||
def __init__(self, symbols: List[str], lookback_days: int = 30):
|
||||
def __init__(self,
|
||||
symbols: List[str],
|
||||
lookback_days: int = 30,
|
||||
training_window_size: int = 60,
|
||||
inference_window_size: int = 12,
|
||||
inference_offset: int = 0,
|
||||
interval: str = '5m'):
|
||||
self.symbols = symbols
|
||||
self.lookback_days = lookback_days
|
||||
self.training_window_size = training_window_size
|
||||
self.inference_window_size = inference_window_size
|
||||
self.inference_offset = inference_offset
|
||||
self.interval = interval
|
||||
self.client = OpenAI(api_key=OPENAI_API_KEY)
|
||||
self.rag_engine = RAGEngine()
|
||||
|
||||
async def generate_dataset(self) -> List[Dict]:
|
||||
"""Generate labeled dataset for fine-tuning"""
|
||||
"""Generate labeled dataset using correct predictions only"""
|
||||
examples = []
|
||||
|
||||
for symbol in tqdm(self.symbols, desc="Processing symbols"):
|
||||
# Fetch historical data
|
||||
end_date = datetime.now()
|
||||
start_date = end_date - timedelta(days=self.lookback_days)
|
||||
|
||||
# Get predictions using analyze_market_data
|
||||
fetcher = MarketDataFetcher(symbol)
|
||||
market_data = fetcher.fetch_data(
|
||||
start_date=start_date.strftime('%Y-%m-%d'),
|
||||
end_date=end_date.strftime('%Y-%m-%d'),
|
||||
interval='5m'
|
||||
interval=self.interval
|
||||
)
|
||||
|
||||
# Process market data
|
||||
processor = MarketDataProcessor(market_data)
|
||||
processed_data = processor.df
|
||||
predictions_df = await analyze_market_data(
|
||||
market_data=market_data,
|
||||
training_window_size=self.training_window_size,
|
||||
inference_window_size=self.inference_window_size,
|
||||
inference_offset=self.inference_offset
|
||||
)
|
||||
|
||||
# Generate training examples
|
||||
examples.extend(self._generate_examples(processed_data))
|
||||
if not predictions_df.empty:
|
||||
correct_examples = self._convert_to_training_examples(predictions_df)
|
||||
examples.extend(correct_examples)
|
||||
print(f"Added {len(correct_examples)} examples from {symbol}")
|
||||
|
||||
return examples
|
||||
|
||||
def _generate_examples(self, data: pd.DataFrame) -> List[Dict]:
|
||||
"""Generate labeled examples from processed market data"""
|
||||
def _convert_to_training_examples(self, predictions_df: pd.DataFrame) -> List[Dict]:
|
||||
"""Convert correct predictions to training examples with validation"""
|
||||
examples = []
|
||||
window_size = 12 # 1-hour context
|
||||
|
||||
for i in range(len(data) - window_size):
|
||||
window = data.iloc[i:i+window_size]
|
||||
next_row = data.iloc[i+window_size] if i+window_size < len(data) else None
|
||||
# Print DataFrame info for debugging
|
||||
print("\nDataFrame Info:")
|
||||
print(predictions_df.info())
|
||||
print("\nSample row:")
|
||||
print(predictions_df.iloc[0])
|
||||
|
||||
if next_row is not None:
|
||||
# Create market state description
|
||||
context = self._create_context(window)
|
||||
# Filter for correct predictions
|
||||
correct_mask = predictions_df['vwap_direction_next_5min'] == predictions_df['actual_movement']
|
||||
correct_predictions = predictions_df[correct_mask].copy()
|
||||
|
||||
# Generate label
|
||||
label = self._create_label(window, next_row)
|
||||
print(f"Found {len(correct_predictions)} correct predictions out of {len(predictions_df)} total")
|
||||
|
||||
for _, pred in correct_predictions.iterrows():
|
||||
try:
|
||||
context = self._create_market_context(pred)
|
||||
label = self._create_prediction_label(pred)
|
||||
|
||||
examples.append({
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a market analysis AI that predicts short-term price movements."},
|
||||
{"role": "system", "content": self.rag_engine.system_prompt},
|
||||
{"role": "user", "content": context},
|
||||
{"role": "assistant", "content": json.dumps(label)}
|
||||
]
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error processing prediction: {str(e)}")
|
||||
continue
|
||||
|
||||
return examples
|
||||
|
||||
def _create_context(self, window: pd.DataFrame) -> str:
|
||||
"""Create market state description using DataProcessor format"""
|
||||
# Ensure window has required columns
|
||||
required_cols = ['MA5', 'MA20', 'Volume_MA5']
|
||||
missing = [col for col in required_cols if col not in window.columns]
|
||||
if missing:
|
||||
window = window.copy()
|
||||
if "Close" in window.columns:
|
||||
window["MA5"] = window["Close"].rolling(window=5, min_periods=1).mean().bfill()
|
||||
window["MA20"] = window["Close"].rolling(window=20, min_periods=1).mean().bfill()
|
||||
else:
|
||||
window["MA5"] = 0
|
||||
window["MA20"] = 0
|
||||
if "Volume" in window.columns:
|
||||
window["Volume_MA5"] = window["Volume"].rolling(window=5, min_periods=1).mean().bfill()
|
||||
else:
|
||||
window["Volume_MA5"] = 0
|
||||
|
||||
latest = window.iloc[-1]
|
||||
prev = window.iloc[-2] if len(window) > 1 else latest
|
||||
|
||||
# Calculate changes
|
||||
volume_change = ((latest['Volume'] - prev['Volume'])/prev['Volume']*100) if prev['Volume'] > 0 else 0
|
||||
vwap_change = (latest["VWAP"] - prev["VWAP"]) / prev["VWAP"] * 100
|
||||
vwap_direction = "up" if vwap_change > 0 else "down"
|
||||
def _create_market_context(self, row: pd.Series) -> str:
|
||||
"""Create market state description with column validation"""
|
||||
# Print available columns for debugging
|
||||
print(f"Available columns: {row.index.tolist()}")
|
||||
|
||||
# Use safe column access with defaults
|
||||
return f"""Current Market State:
|
||||
Current Price: {latest['Close']:.2f}
|
||||
VWAP: {latest['VWAP']:.2f}
|
||||
Volume: {latest['Volume']}
|
||||
MA5: {latest['MA5']:.2f}
|
||||
MA20: {latest['MA20']:.2f}
|
||||
Volume MA5: {latest['Volume_MA5']:.2f}
|
||||
Price Change: {((latest['Close'] - prev['Close'])/prev['Close']*100):.2f}%
|
||||
Volume Change: {volume_change:.2f}%
|
||||
Previous 5min VWAP Movement: {vwap_direction} ({vwap_change:.2f}%)
|
||||
Time: {latest.name}
|
||||
Current Price: {row.get('close', row.get('Close', 0.0)):.2f}
|
||||
VWAP: {row.get('vwap', row.get('VWAP', 0.0)):.2f}
|
||||
Volume: {row.get('volume', row.get('Volume', 0))}
|
||||
MA5: {row.get('ma5', row.get('MA5', 0.0)):.2f}
|
||||
MA20: {row.get('ma20', row.get('MA20', 0.0)):.2f}
|
||||
Volume MA5: {row.get('volume_ma5', row.get('Volume_MA5', 0.0)):.2f}
|
||||
Price Change: {row.get('price_change', 0.0):.2f}%
|
||||
Volume Change: {row.get('volume_change', 0.0):.2f}%
|
||||
Previous VWAP Movement: {row.get('prev_vwap_direction', 'none')}
|
||||
Time: {row.name}
|
||||
"""
|
||||
|
||||
def _create_label(self, window: pd.DataFrame, next_row: pd.Series) -> Dict:
|
||||
"""Create labeled output"""
|
||||
current_vwap = window.iloc[-1]['VWAP']
|
||||
next_vwap = next_row['VWAP']
|
||||
direction = 'up' if next_vwap > current_vwap else 'down'
|
||||
|
||||
def _create_prediction_label(self, row: pd.Series) -> Dict:
|
||||
"""Create prediction label from actual data"""
|
||||
return {
|
||||
"vwap_direction_next_5min": direction,
|
||||
"confidence_score": 0.8,
|
||||
"expected_vwap_change": ((next_vwap - current_vwap) / current_vwap) * 100,
|
||||
"volatility_estimate": window['VWAP'].std(),
|
||||
"suggested_entry": current_vwap,
|
||||
"suggested_stop_loss": current_vwap * 0.997 if direction == 'up' else current_vwap * 1.003,
|
||||
"suggested_take_profit": current_vwap * 1.003 if direction == 'up' else current_vwap * 0.997,
|
||||
"key_signals": self._identify_signals(window),
|
||||
"reasoning": self._generate_reasoning(window, direction)
|
||||
"vwap_direction_next_5min": row['vwap_direction_next_5min'],
|
||||
"confidence_score": row['confidence_score'],
|
||||
"expected_vwap_change": row['expected_vwap_change'],
|
||||
"volatility_estimate": row['volatility_estimate'],
|
||||
"suggested_entry": row['suggested_entry'],
|
||||
"suggested_stop_loss": row['suggested_stop_loss'],
|
||||
"suggested_take_profit": row['suggested_take_profit'],
|
||||
"key_signals": row['key_signals'],
|
||||
"reasoning": row['reasoning']
|
||||
}
|
||||
|
||||
def _identify_signals(self, window: pd.DataFrame) -> Dict:
|
||||
"""
|
||||
Identify technical signals from the market data window
|
||||
|
||||
Args:
|
||||
window (pd.DataFrame): DataFrame containing market data for analysis
|
||||
|
||||
Returns:
|
||||
Dict: Dictionary containing identified signals
|
||||
"""
|
||||
return {
|
||||
"trend": self._calculate_trend(window),
|
||||
"volume_trend": "increasing" if window['Volume'].iloc[-1] > window['Volume_MA5'].iloc[-1] else "decreasing"
|
||||
}
|
||||
|
||||
def _calculate_trend(self, window: pd.DataFrame) -> str:
|
||||
"""
|
||||
Calculate the price trend based on moving averages
|
||||
|
||||
Args:
|
||||
window (pd.DataFrame): Market data window with MA5 and MA20 columns
|
||||
|
||||
Returns:
|
||||
str: Trend direction ('upward', 'downward', or 'sideways')
|
||||
"""
|
||||
last_row = window.iloc[-1]
|
||||
ma5 = last_row['MA5']
|
||||
ma20 = last_row['MA20']
|
||||
|
||||
# Calculate trend based on MA crossover
|
||||
if ma5 > ma20 * 1.02: # 2% threshold
|
||||
return "upward"
|
||||
elif ma5 < ma20 * 0.98: # 2% threshold
|
||||
return "downward"
|
||||
else:
|
||||
return "sideways"
|
||||
|
||||
def _generate_reasoning(self, window: pd.DataFrame, direction: str) -> str:
|
||||
"""
|
||||
Generate reasoning for the market prediction
|
||||
|
||||
Args:
|
||||
window (pd.DataFrame): Market data window
|
||||
direction (str): Predicted price direction ('up' or 'down')
|
||||
|
||||
Returns:
|
||||
str: Generated reasoning for the prediction
|
||||
"""
|
||||
signals = self._identify_signals(window)
|
||||
last_row = window.iloc[-1]
|
||||
|
||||
reasoning_parts = []
|
||||
|
||||
# Analyze trend
|
||||
if signals['trend'] == direction:
|
||||
reasoning_parts.append(f"The {signals['trend']} trend supports this prediction")
|
||||
|
||||
# Analyze volume
|
||||
if signals['volume_trend'] == 'increasing':
|
||||
reasoning_parts.append("Increasing volume suggests strong momentum")
|
||||
else:
|
||||
reasoning_parts.append("Decreasing volume suggests potential trend weakness")
|
||||
|
||||
# VWAP analysis
|
||||
vwap = last_row['VWAP']
|
||||
close = last_row['Close']
|
||||
if close > vwap and direction == 'up':
|
||||
reasoning_parts.append("Price above VWAP supports bullish momentum")
|
||||
elif close < vwap and direction == 'down':
|
||||
reasoning_parts.append("Price below VWAP supports bearish momentum")
|
||||
|
||||
return ". ".join(reasoning_parts) + "."
|
||||
|
||||
async def create_fine_tuning_job(self, examples: List[Dict]):
|
||||
"""Create and monitor fine-tuning job"""
|
||||
if not examples:
|
||||
raise ValueError("No examples provided for fine-tuning")
|
||||
|
||||
# Save examples to JSONL file
|
||||
with open('training_data.jsonl', 'w') as f:
|
||||
output_path = 'training_data.jsonl'
|
||||
with open(output_path, 'w') as f:
|
||||
for example in examples:
|
||||
f.write(json.dumps(example) + '\n')
|
||||
|
||||
# Upload training file - remove await
|
||||
print(f"Saved {len(examples)} examples to {output_path}")
|
||||
|
||||
# Upload training file
|
||||
training_file = self.client.files.create(
|
||||
file=open('training_data.jsonl', 'rb'),
|
||||
file=open(output_path, 'rb'),
|
||||
purpose='fine-tune'
|
||||
)
|
||||
|
||||
# Create fine-tuning job - remove await
|
||||
# Create fine-tuning job
|
||||
job = self.client.fine_tuning.jobs.create(
|
||||
training_file=training_file.id,
|
||||
model="gpt-4o-mini-2024-07-18",
|
||||
@ -224,16 +162,23 @@ Time: {latest.name}
|
||||
return job.id
|
||||
|
||||
async def main():
|
||||
symbols = ['BTC-USD']
|
||||
generator = FineTuneDatasetGenerator(symbols)
|
||||
"""Run dataset generation and fine-tuning"""
|
||||
symbols = ['BTC-USD', 'NVDA', 'META', 'LTC-USD']
|
||||
generator = FineTuneDatasetGenerator(
|
||||
symbols=symbols,
|
||||
lookback_days=2,
|
||||
training_window_size=60,
|
||||
inference_window_size=12,
|
||||
inference_offset=0,
|
||||
interval='5m'
|
||||
)
|
||||
|
||||
# Generate dataset
|
||||
examples = await generator.generate_dataset()
|
||||
print(f"Generated {len(examples)} training examples")
|
||||
|
||||
# Create fine-tuning job
|
||||
job_id = await generator.create_fine_tuning_job(examples)
|
||||
print(f"Fine-tuning job started. Monitor progress using: openai api fine_tunes.follow -i {job_id}")
|
||||
if examples:
|
||||
job_id = await generator.create_fine_tuning_job(examples)
|
||||
print(f"Fine-tuning job started. Monitor progress using job ID: {job_id}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@ -8,10 +8,26 @@ class PerformanceMetrics:
|
||||
def __init__(self, predictions_df: pd.DataFrame, market_data: pd.DataFrame):
|
||||
self.predictions_df = predictions_df
|
||||
self.market_data = market_data
|
||||
self._calculate_actual_movements()
|
||||
self.metrics = self._calculate_metrics()
|
||||
|
||||
def _calculate_actual_movements(self):
|
||||
# Calculate actual movements first
|
||||
self.predictions_df['actual_movement'] = self._calculate_actual_movements()
|
||||
|
||||
# Map vwap_direction_next_5min to predicted_movement if it exists
|
||||
if 'vwap_direction_next_5min' in self.predictions_df.columns:
|
||||
self.predictions_df['predicted_movement'] = self.predictions_df['vwap_direction_next_5min']
|
||||
elif 'direction' in self.predictions_df.columns:
|
||||
self.predictions_df['predicted_movement'] = self.predictions_df['direction']
|
||||
else:
|
||||
raise ValueError("No prediction column found in DataFrame. Expected 'vwap_direction_next_5min' or 'direction'")
|
||||
|
||||
# Now extract y_true and y_pred
|
||||
y_true = self.predictions_df['actual_movement']
|
||||
y_pred = self.predictions_df['predicted_movement']
|
||||
|
||||
# Calculate metrics
|
||||
self.metrics = self._calculate_metrics(y_true, y_pred)
|
||||
|
||||
def _calculate_actual_movements(self) -> pd.Series:
|
||||
"""Calculate actual VWAP movements with detailed logging"""
|
||||
print("\nDebug Counts:")
|
||||
print(f"Initial DataFrame rows: {len(self.predictions_df)}")
|
||||
@ -37,79 +53,76 @@ class PerformanceMetrics:
|
||||
except KeyError:
|
||||
movements.append(None)
|
||||
skipped_timestamps += 1
|
||||
print(f"Skipped: Timestamp not found {timestamp}")
|
||||
print(f"Skipped: Timestamp {timestamp} not found in market data")
|
||||
|
||||
print(f"\nProcessing Summary:")
|
||||
print(f"Total rows initially: {len(self.predictions_df)}")
|
||||
print(f"Valid predictions: {valid_predictions}")
|
||||
print(f"Skipped timestamps: {skipped_timestamps}")
|
||||
|
||||
self.predictions_df['actual_movement'] = movements
|
||||
valid_mask = self.predictions_df['actual_movement'].notna()
|
||||
self.predictions_df = self.predictions_df[valid_mask].copy()
|
||||
return pd.Series(movements, index=self.predictions_df.index)
|
||||
|
||||
print(f"Final predictions count: {len(self.predictions_df)}\n")
|
||||
def _calculate_metrics(self, y_true: pd.Series, y_pred: pd.Series) -> Dict:
|
||||
"""Calculate performance metrics with None value handling"""
|
||||
# Filter out rows with None values
|
||||
valid_mask = y_true.notna() & y_pred.notna()
|
||||
y_true_clean = y_true[valid_mask]
|
||||
y_pred_clean = y_pred[valid_mask]
|
||||
|
||||
def _calculate_metrics(self) -> dict:
|
||||
if len(self.predictions_df) == 0:
|
||||
return self._empty_metrics()
|
||||
if len(y_true_clean) == 0:
|
||||
return self._get_empty_metrics()
|
||||
|
||||
y_true = self.predictions_df['actual_movement']
|
||||
y_pred = self.predictions_df['vwap_direction_next_5min']
|
||||
|
||||
print("\nClass distributions:")
|
||||
print("Actual:", y_true.value_counts().to_dict())
|
||||
print("Predicted:", y_pred.value_counts().to_dict())
|
||||
|
||||
acc = accuracy_score(y_true, y_pred)
|
||||
prec = precision_score(y_true, y_pred, pos_label='up', zero_division=0)
|
||||
rec = recall_score(y_true, y_pred, pos_label='up', zero_division=0)
|
||||
f1 = f1_score(y_true, y_pred, pos_label='up', zero_division=0)
|
||||
|
||||
# High confidence metrics
|
||||
high_conf_mask = self.predictions_df['confidence_score'] >= 0.7
|
||||
if high_conf_mask.any():
|
||||
high_conf_correct = ((y_pred == y_true) & high_conf_mask).sum()
|
||||
high_conf_acc = high_conf_correct / high_conf_mask.sum()
|
||||
else:
|
||||
high_conf_acc = 0.0
|
||||
|
||||
# Print confusion matrix for debugging
|
||||
cm = confusion_matrix(y_true, y_pred)
|
||||
print("\nConfusion Matrix:")
|
||||
print(pd.DataFrame(
|
||||
cm,
|
||||
columns=['Pred Down', 'Pred Up'],
|
||||
index=['True Down', 'True Up']
|
||||
))
|
||||
|
||||
# Keep existing metrics calculation
|
||||
# Calculate base metrics
|
||||
metrics = {
|
||||
'total_predictions': len(self.predictions_df),
|
||||
'class_distribution': y_pred.value_counts().to_dict(),
|
||||
'avg_confidence': self.predictions_df['confidence_score'].mean(),
|
||||
'accuracy': acc,
|
||||
'precision': prec,
|
||||
'recall': rec,
|
||||
'f1': f1,
|
||||
'high_confidence_accuracy': high_conf_acc
|
||||
'total_predictions': len(y_true),
|
||||
'valid_predictions': len(y_true_clean),
|
||||
'accuracy': accuracy_score(y_true_clean, y_pred_clean),
|
||||
'precision': precision_score(y_true_clean, y_pred_clean, pos_label='up', zero_division=0),
|
||||
'recall': recall_score(y_true_clean, y_pred_clean, pos_label='up', zero_division=0),
|
||||
'f1': f1_score(y_true_clean, y_pred_clean, pos_label='up', zero_division=0),
|
||||
'class_distribution': y_pred_clean.value_counts().to_dict()
|
||||
}
|
||||
|
||||
# Add trading metrics
|
||||
# Add confidence and VWAP change metrics
|
||||
metrics.update({
|
||||
'avg_confidence': self.predictions_df['confidence_score'].mean(),
|
||||
'high_confidence_accuracy': self._calculate_high_confidence_accuracy(y_true_clean, y_pred_clean),
|
||||
'avg_expected_vwap_change': self.predictions_df['expected_vwap_change'].mean(),
|
||||
'avg_volatility_estimate': self.predictions_df['volatility_estimate'].mean(),
|
||||
'price_targets': {
|
||||
'entry_success_rate': self._calculate_entry_success(),
|
||||
'stop_loss_hits': self._calculate_stop_loss_hits(),
|
||||
'take_profit_hits': self._calculate_take_profit_hits(),
|
||||
'avg_risk_reward': self._calculate_risk_reward_ratio()
|
||||
},
|
||||
'signals': self._analyze_signals()
|
||||
'avg_volatility_estimate': self.predictions_df['volatility_estimate'].mean()
|
||||
})
|
||||
|
||||
# Add price targets metrics
|
||||
metrics['price_targets'] = {
|
||||
'entry_success_rate': self._calculate_entry_success(),
|
||||
'stop_loss_hits': self._calculate_stop_loss_hits(),
|
||||
'take_profit_hits': self._calculate_take_profit_hits(),
|
||||
'avg_risk_reward': self._calculate_avg_risk_reward()
|
||||
}
|
||||
|
||||
return metrics
|
||||
|
||||
def _get_empty_metrics(self) -> Dict:
|
||||
"""Return empty metrics dictionary with zero values"""
|
||||
return {
|
||||
'total_predictions': 0,
|
||||
'valid_predictions': 0,
|
||||
'accuracy': 0.0,
|
||||
'precision': 0.0,
|
||||
'recall': 0.0,
|
||||
'f1': 0.0,
|
||||
'high_confidence_accuracy': 0.0,
|
||||
'class_distribution': {},
|
||||
'avg_confidence': 0.0,
|
||||
'avg_expected_vwap_change': 0.0,
|
||||
'avg_volatility_estimate': 0.0,
|
||||
'price_targets': {
|
||||
'entry_success_rate': 0.0,
|
||||
'stop_loss_hits': 0.0,
|
||||
'take_profit_hits': 0.0,
|
||||
'avg_risk_reward': 0.0
|
||||
}
|
||||
}
|
||||
|
||||
def _calculate_entry_success(self) -> float:
|
||||
"""Calculate rate of successful entries"""
|
||||
successes = 0
|
||||
@ -129,38 +142,67 @@ class PerformanceMetrics:
|
||||
return successes / total if total > 0 else 0.0
|
||||
|
||||
def _calculate_stop_loss_hits(self) -> float:
|
||||
"""Calculate stop loss hit rate"""
|
||||
"""Calculate stop loss hit rate with proper index bounds checking"""
|
||||
hits = 0
|
||||
total = len(self.predictions_df)
|
||||
total = 0
|
||||
|
||||
for _, row in self.predictions_df.iterrows():
|
||||
stop_loss = row.get('suggested_stop_loss')
|
||||
if stop_loss is None:
|
||||
continue
|
||||
|
||||
# Check if price hit stop loss
|
||||
next_vwap = self.market_data.loc[row['prediction_timestamp']:].iloc[1]['VWAP']
|
||||
if (row['vwap_direction_next_5min'] == 'up' and next_vwap <= stop_loss) or \
|
||||
(row['vwap_direction_next_5min'] == 'down' and next_vwap >= stop_loss):
|
||||
hits += 1
|
||||
# Get next VWAP value safely
|
||||
try:
|
||||
timestamp_idx = self.market_data.index.get_loc(row['prediction_timestamp'])
|
||||
if timestamp_idx + 1 >= len(self.market_data):
|
||||
continue
|
||||
|
||||
next_vwap = self.market_data.iloc[timestamp_idx + 1]['VWAP']
|
||||
total += 1
|
||||
|
||||
# Check if price hit stop loss
|
||||
if (row['predicted_movement'] == 'up' and next_vwap <= stop_loss) or \
|
||||
(row['predicted_movement'] == 'down' and next_vwap >= stop_loss):
|
||||
hits += 1
|
||||
|
||||
except (KeyError, IndexError):
|
||||
print(f"Warning: Could not find next VWAP for timestamp {row['prediction_timestamp']}")
|
||||
continue
|
||||
|
||||
return hits / total if total > 0 else 0.0
|
||||
|
||||
def _calculate_take_profit_hits(self) -> float:
|
||||
"""Calculate take profit hit rate"""
|
||||
"""
|
||||
Calculate take profit hit rate with proper index bounds checking
|
||||
|
||||
Returns:
|
||||
float: Ratio of take profit hits to total valid predictions
|
||||
"""
|
||||
hits = 0
|
||||
total = len(self.predictions_df)
|
||||
total = 0
|
||||
|
||||
for _, row in self.predictions_df.iterrows():
|
||||
take_profit = row.get('suggested_take_profit')
|
||||
if take_profit is None:
|
||||
continue
|
||||
|
||||
# Check if price hit take profit
|
||||
next_vwap = self.market_data.loc[row['prediction_timestamp']:].iloc[1]['VWAP']
|
||||
if (row['vwap_direction_next_5min'] == 'up' and next_vwap >= take_profit) or \
|
||||
(row['vwap_direction_next_5min'] == 'down' and next_vwap <= take_profit):
|
||||
hits += 1
|
||||
try:
|
||||
# Get next VWAP value safely using index location
|
||||
timestamp_idx = self.market_data.index.get_loc(row['prediction_timestamp'])
|
||||
if timestamp_idx + 1 >= len(self.market_data):
|
||||
continue
|
||||
|
||||
next_vwap = self.market_data.iloc[timestamp_idx + 1]['VWAP']
|
||||
total += 1
|
||||
|
||||
# Check if price hit take profit level
|
||||
if (row['predicted_movement'] == 'up' and next_vwap >= take_profit) or \
|
||||
(row['predicted_movement'] == 'down' and next_vwap <= take_profit):
|
||||
hits += 1
|
||||
|
||||
except (KeyError, IndexError):
|
||||
print(f"Warning: Could not find next VWAP for timestamp {row['prediction_timestamp']}")
|
||||
continue
|
||||
|
||||
return hits / total if total > 0 else 0.0
|
||||
|
||||
@ -183,6 +225,39 @@ class PerformanceMetrics:
|
||||
|
||||
return np.mean(ratios) if ratios else 0.0
|
||||
|
||||
def _calculate_avg_risk_reward(self) -> float:
|
||||
"""
|
||||
Calculate average risk/reward ratio across all trades
|
||||
|
||||
Returns:
|
||||
float: Average risk/reward ratio, or 0.0 if no valid trades
|
||||
"""
|
||||
risk_rewards = []
|
||||
|
||||
for _, row in self.predictions_df.iterrows():
|
||||
entry = row.get('suggested_entry')
|
||||
stop_loss = row.get('suggested_stop_loss')
|
||||
take_profit = row.get('suggested_take_profit')
|
||||
|
||||
if None in (entry, stop_loss, take_profit):
|
||||
continue
|
||||
|
||||
# Calculate risk and reward
|
||||
if row['predicted_movement'] == 'up':
|
||||
risk = entry - stop_loss
|
||||
reward = take_profit - entry
|
||||
else: # down
|
||||
risk = stop_loss - entry
|
||||
reward = entry - take_profit
|
||||
|
||||
# Avoid division by zero
|
||||
if risk <= 0 or reward <= 0:
|
||||
continue
|
||||
|
||||
risk_rewards.append(reward / risk)
|
||||
|
||||
return np.mean(risk_rewards) if risk_rewards else 0.0
|
||||
|
||||
def _format_top_signals(self) -> str:
|
||||
"""Format signal analysis for report"""
|
||||
all_signals = []
|
||||
@ -340,3 +415,27 @@ End: {self.predictions_df['prediction_timestamp'].max()}
|
||||
"""
|
||||
|
||||
return existing_report + trading_metrics
|
||||
|
||||
def _calculate_high_confidence_accuracy(self, y_true: pd.Series, y_pred: pd.Series) -> float:
|
||||
"""
|
||||
Calculate accuracy for high confidence predictions (confidence >= 0.8)
|
||||
|
||||
Args:
|
||||
y_true (pd.Series): True labels
|
||||
y_pred (pd.Series): Predicted labels
|
||||
|
||||
Returns:
|
||||
float: Accuracy score for high confidence predictions
|
||||
"""
|
||||
# Get high confidence mask
|
||||
high_conf_mask = self.predictions_df['confidence_score'] >= 0.8
|
||||
|
||||
if not high_conf_mask.any():
|
||||
return 0.0
|
||||
|
||||
# Filter predictions by confidence
|
||||
high_conf_true = y_true[high_conf_mask]
|
||||
high_conf_pred = y_pred[high_conf_mask]
|
||||
|
||||
# Calculate accuracy
|
||||
return accuracy_score(high_conf_true, high_conf_pred)
|
||||
@ -63,14 +63,143 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Processing: 58%|████████████████▉ | 142/244 [10:00<18:10, 10.69s/it]"
|
||||
"Processing: 100%|█████████████████████████████| 244/244 [16:34<00:00, 4.08s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Debug Counts:\n",
|
||||
"Initial DataFrame rows: 244\n",
|
||||
"Skipped: No next VWAP for timestamp 2025-01-31 16:00:00+00:00\n",
|
||||
"\n",
|
||||
"Processing Summary:\n",
|
||||
"Total rows initially: 244\n",
|
||||
"Valid predictions: 243\n",
|
||||
"Skipped timestamps: 1\n",
|
||||
"Final predictions count: 243\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Class distributions:\n",
|
||||
"Actual: {'down': 171, 'up': 72}\n",
|
||||
"Predicted: {'down': 164, 'up': 79}\n",
|
||||
"\n",
|
||||
"Confusion Matrix:\n",
|
||||
" Pred Down Pred Up\n",
|
||||
"True Down 137 34\n",
|
||||
"True Up 27 45\n",
|
||||
"\n",
|
||||
"Performance Report:\n",
|
||||
"\n",
|
||||
"Performance Report\n",
|
||||
"=================\n",
|
||||
"Total Predictions: 243\n",
|
||||
"Accuracy: 74.90%\n",
|
||||
"Precision: 56.96%\n",
|
||||
"Recall: 62.50%\n",
|
||||
"F1 Score: 59.60%\n",
|
||||
"\n",
|
||||
"Direction Distribution:\n",
|
||||
"-------------------\n",
|
||||
"Up: 79\n",
|
||||
"Down: 164\n",
|
||||
"\n",
|
||||
"Confidence Analysis:\n",
|
||||
"-----------------\n",
|
||||
"Average Confidence: 80.08%\n",
|
||||
"High Confidence Accuracy: 74.90%\n",
|
||||
"\n",
|
||||
"Trading Metrics:\n",
|
||||
"--------------\n",
|
||||
"Avg Expected VWAP Change: 0.12%\n",
|
||||
"Avg Volatility Estimate: 198.74%\n",
|
||||
"\n",
|
||||
"Price Target Analysis:\n",
|
||||
"-------------------\n",
|
||||
"Entry Success Rate: 74.90%\n",
|
||||
"Stop Loss Hits: 9.05%\n",
|
||||
"Take Profit Hits: 9.47%\n",
|
||||
"Avg Risk/Reward Ratio: 1.02\n",
|
||||
"\n",
|
||||
"Top Signals:\n",
|
||||
"----------\n",
|
||||
"Decreasing volume trend: 151\n",
|
||||
"Price below VWAP: 127\n",
|
||||
"Increasing volume suggests strong momentum.: 69\n",
|
||||
"Price above VWAP supports bullish momentum.: 68\n",
|
||||
"Decreasing volume suggests potential trend weakness.: 16\n",
|
||||
"\n",
|
||||
"Time Coverage:\n",
|
||||
"-----------\n",
|
||||
"Start: 2025-01-28 15:30:00+00:00\n",
|
||||
"End: 2025-01-31 15:55:00+00:00\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Predictions Summary:\n",
|
||||
" vwap_direction_next_5min confidence_score expected_vwap_change \\\n",
|
||||
"0 down 0.8 0.000000 \n",
|
||||
"1 up 0.8 0.000433 \n",
|
||||
"2 up 0.8 0.045066 \n",
|
||||
"3 down 0.8 0.000000 \n",
|
||||
"4 down 0.8 0.000000 \n",
|
||||
"\n",
|
||||
" volatility_estimate suggested_entry suggested_stop_loss \\\n",
|
||||
"0 0.000000 102547.785632 102849.709244 \n",
|
||||
"1 0.102457 102830.485685 102778.485685 \n",
|
||||
"2 10.792993 103057.394632 102757.894632 \n",
|
||||
"3 7.125226 103057.394967 103357.394967 \n",
|
||||
"4 6.469257 103057.394968 103357.394968 \n",
|
||||
"\n",
|
||||
" suggested_take_profit key_signals \\\n",
|
||||
"0 102245.862020 [Decreasing volume trend, VWAP below price] \n",
|
||||
"1 102882.485685 [Increasing volume suggests strong momentum., ... \n",
|
||||
"2 103356.894632 [Increasing volume suggests strong momentum., ... \n",
|
||||
"3 102757.394967 [Decreasing volume trend, Price below VWAP] \n",
|
||||
"4 102757.394968 [Decreasing volume trend, Price below VWAP, MA... \n",
|
||||
"\n",
|
||||
" reasoning \\\n",
|
||||
"0 The decreasing volume trend suggests potential... \n",
|
||||
"1 The increasing volume indicates strong momentu... \n",
|
||||
"2 The significant increase in volume indicates s... \n",
|
||||
"3 The decreasing volume trend suggests potential... \n",
|
||||
"4 The decreasing volume trend suggests potential... \n",
|
||||
"\n",
|
||||
" timestamp_prediction historical_start \\\n",
|
||||
"0 2025-01-28 15:30:00+00:00 2025-01-28 09:30:00+00:00 \n",
|
||||
"1 2025-01-28 15:35:00+00:00 2025-01-28 09:35:00+00:00 \n",
|
||||
"2 2025-01-28 15:40:00+00:00 2025-01-28 09:40:00+00:00 \n",
|
||||
"3 2025-01-28 15:45:00+00:00 2025-01-28 09:45:00+00:00 \n",
|
||||
"4 2025-01-28 15:50:00+00:00 2025-01-28 09:50:00+00:00 \n",
|
||||
"\n",
|
||||
" historical_end current_window_start \\\n",
|
||||
"0 2025-01-28 14:25:00+00:00 2025-01-28 14:30:00+00:00 \n",
|
||||
"1 2025-01-28 14:30:00+00:00 2025-01-28 14:35:00+00:00 \n",
|
||||
"2 2025-01-28 14:35:00+00:00 2025-01-28 14:40:00+00:00 \n",
|
||||
"3 2025-01-28 14:40:00+00:00 2025-01-28 14:45:00+00:00 \n",
|
||||
"4 2025-01-28 14:45:00+00:00 2025-01-28 14:50:00+00:00 \n",
|
||||
"\n",
|
||||
" current_window_end prediction_timestamp actual_movement \n",
|
||||
"0 2025-01-28 15:25:00+00:00 2025-01-28 15:30:00+00:00 up \n",
|
||||
"1 2025-01-28 15:30:00+00:00 2025-01-28 15:35:00+00:00 down \n",
|
||||
"2 2025-01-28 15:35:00+00:00 2025-01-28 15:40:00+00:00 down \n",
|
||||
"3 2025-01-28 15:40:00+00:00 2025-01-28 15:45:00+00:00 down \n",
|
||||
"4 2025-01-28 15:45:00+00:00 2025-01-28 15:50:00+00:00 down \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -136,6 +265,26 @@
|
||||
" print(f\"Analysis failed: {str(e)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-hggoauCsSRZHZkQyPoPRn05W', created_at=1738549484, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:yasha-sheynin::AwgWhL48', finished_at=1738549856, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-EfEdakLI3PxeXpcffWaFxdol', result_files=['file-Cr1t8xFGTdjpjQCXBGqE5L'], seed=288523345, status='succeeded', trained_tokens=58239, training_file='file-WwWvai4rxePmvifhN2KVmz', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=1, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix=None), FineTuningJob(id='ftjob-PS4EXlVz5SdInps2MU4f64B2', created_at=1738549280, error=Error(code='invalid_n_examples', message='Training file has 4 example(s), but must have at least 10 examples', param='training_file'), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-EfEdakLI3PxeXpcffWaFxdol', result_files=[], seed=912538967, status='failed', trained_tokens=None, training_file='file-2YxyNCNuFnLeoyCsmwRFh4', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=3)), type='supervised'), user_provided_suffix=None), FineTuningJob(id='ftjob-CdU86w4P5d5sAIeW2exVJJPo', created_at=1738524908, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:yasha-sheynin::Awacdfg6', finished_at=1738527160, hyperparameters=Hyperparameters(batch_size=3, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-EfEdakLI3PxeXpcffWaFxdol', result_files=['file-6DUBwAYAsFk94P8Qe8n8nL'], seed=223641031, status='succeeded', trained_tokens=1320606, training_file='file-BNa5KfcVuuSY9HmbwwXoWb', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=3, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix=None)], object='list', has_more=False)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from market_predictor.config import OPENAI_API_KEY\n",
|
||||
"from openai import OpenAI\n",
|
||||
"client = OpenAI(api_key = OPENAI_API_KEY)\n",
|
||||
"print(client.fine_tuning.jobs.list(limit=10))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
1667
training_data.jsonl
1667
training_data.jsonl
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user