# Source code for neurological_lrd_analysis.ml_baselines.benchmark_comparison

"""
Comprehensive Benchmark Comparison: Classical vs ML Models.

This module provides a comprehensive benchmarking framework for comparing
classical Hurst estimation methods with machine learning baseline models.

Author: Davian R. Chin (PhD Candidate in Biomedical Engineering, University of Reading, UK)
"""

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Optional, Tuple, Union, Any
from dataclasses import dataclass
import time
import warnings
from pathlib import Path
import json

# Import classical methods
from ..biomedical_hurst_factory import (
    BiomedicalHurstEstimatorFactory, EstimatorType, HurstResult
)

# Import ML methods
from .pretrained_models import PretrainedModelManager, ModelStatus
from .inference import PretrainedInference, quick_predict, quick_ensemble_predict

# Import data generation
from ..benchmark_core.generation import (
    generate_grid, fbm_davies_harte, TimeSeriesSample
)


@dataclass
class BenchmarkResult:
    """
    Outcome of running one estimation method on one test sample.

    Only ``method_name``, ``method_type`` and ``hurst_estimate`` are
    required; every other field defaults to "not available".
    """
    # Identifier of the estimator or model that produced the estimate.
    method_name: str
    # Family of the method: either 'classical' or 'ml'.
    method_type: str
    # Estimated Hurst exponent for the sample.
    hurst_estimate: float
    # (lower, upper) bounds around the estimate, when the method provides them.
    confidence_interval: Optional[Tuple[float, float]] = None
    # Time spent producing the estimate; the benchmark loops in this module
    # store a difference of two `time` readings here, i.e. seconds.
    computation_time: float = 0.0
    # Error of the estimate against the known true Hurst value, if computed.
    error: Optional[float] = None
    # Free-form per-sample context (the benchmark loops in this module store
    # 'true_hurst', 'data_length', 'generator', 'contamination' and
    # 'biomedical_scenario' here).
    metadata: Optional[Dict[str, Any]] = None


@dataclass
class BenchmarkSummary:
    """
    Aggregated performance figures for one method across a benchmark run.

    All fields are required and are filled in by
    ``ClassicalMLBenchmark.calculate_summaries``.
    """
    # Identifier of the summarised estimator or model.
    method_name: str
    # Family of the method: 'classical' or 'ml'.
    method_type: str
    # Number of individual results the summary was built from.
    n_tests: int
    # Mean and standard deviation of the per-result errors.
    mean_error: float
    std_error: float
    # Mean of the absolute per-result errors (MAE).
    mean_absolute_error: float
    # Square root of the mean squared per-result error (RMSE).
    root_mean_squared_error: float
    # Correlation coefficient between true and estimated Hurst values.
    correlation: float
    # Average of the per-result computation times.
    mean_computation_time: float
    # Fraction of tests that produced a result.
    success_rate: float
    # The individual results that were aggregated.
    results: List[BenchmarkResult]


class ClassicalMLBenchmark:
    """
    Comprehensive benchmark comparing classical and ML methods.

    Provides systematic comparison of classical Hurst estimation methods
    with machine learning baseline models across various test scenarios.

    Typical use is ``run_comprehensive_benchmark`` (create or accept test
    scenarios, run every configured method, summarise, optionally save)
    followed by ``create_visualizations``.
    """

    def __init__(self,
                 pretrained_models_dir: Union[str, Path] = "pretrained_models",
                 classical_estimators: Optional[List[EstimatorType]] = None,
                 ml_estimators: Optional[List[str]] = None):
        """
        Initialize the benchmark system.

        Parameters:
        -----------
        pretrained_models_dir : str or Path
            Directory containing pretrained ML models
        classical_estimators : List[EstimatorType], optional
            Classical estimators to include (defaults to ten built-in types)
        ml_estimators : List[str], optional
            ML model types to include (defaults to random forest, SVR and
            gradient boosting). The name 'ensemble' selects the ensemble
            predictor instead of a single model.
        """
        self.pretrained_models_dir = Path(pretrained_models_dir)

        # Initialize classical estimator factory
        self.classical_factory = BiomedicalHurstEstimatorFactory()

        # Initialize ML inference system
        self.ml_inference = PretrainedInference(pretrained_models_dir)

        # Default classical estimators
        if classical_estimators is None:
            self.classical_estimators = [
                EstimatorType.DFA,
                EstimatorType.RS_ANALYSIS,
                EstimatorType.HIGUCHI,
                EstimatorType.GENERALIZED_HURST,
                EstimatorType.PERIODOGRAM,
                EstimatorType.GPH,
                EstimatorType.WHITTLE_MLE,
                EstimatorType.DWT,
                EstimatorType.ABRY_VEITCH,
                EstimatorType.MFDFA,
            ]
        else:
            self.classical_estimators = classical_estimators

        # Default ML estimators
        if ml_estimators is None:
            self.ml_estimators = ['random_forest', 'svr', 'gradient_boosting']
        else:
            self.ml_estimators = ml_estimators

        # Results storage, populated by run_comprehensive_benchmark()
        self.benchmark_results = {}
        self.summary_results = {}

    def create_test_scenarios(self,
                              hurst_values: Optional[List[float]] = None,
                              lengths: Optional[List[int]] = None,
                              n_samples_per_config: int = 10,
                              include_contamination: bool = True,
                              include_biomedical: bool = True) -> List[TimeSeriesSample]:
        """
        Create comprehensive test scenarios.

        Parameters:
        -----------
        hurst_values : List[float], optional
            Hurst values to test (defaults to 0.1 ... 0.9 in steps of 0.1)
        lengths : List[int], optional
            Time series lengths (defaults to 500, 1000 and 2000)
        n_samples_per_config : int
            Number of samples per configuration
        include_contamination : bool
            Whether to include contaminated data
        include_biomedical : bool
            Whether to include biomedical scenarios

        Returns:
        --------
        List[TimeSeriesSample]
            Test scenarios
        """
        if hurst_values is None:
            hurst_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        if lengths is None:
            lengths = [500, 1000, 2000]

        # Base generators
        generators = ['fbm', 'fgn', 'arfima', 'mrw', 'fou']

        # Contamination types
        contaminations = ['none']
        if include_contamination:
            contaminations.extend(['noise', 'missing', 'artifacts'])

        # Biomedical scenarios
        biomedical_scenarios = None
        if include_biomedical:
            biomedical_scenarios = ['eeg', 'ecg', 'respiratory']

        print("Creating test scenarios...")
        print(f" - Hurst values: {hurst_values}")
        print(f" - Lengths: {lengths}")
        print(f" - Generators: {generators}")
        print(f" - Contaminations: {contaminations}")
        print(f" - Biomedical scenarios: {biomedical_scenarios}")
        print(f" - Samples per config: {n_samples_per_config}")

        # NOTE(review): n_samples_per_config is reported above but is not
        # forwarded to generate_grid(); confirm whether generate_grid accepts
        # a replication argument and wire it through if so.
        samples = generate_grid(
            hurst_values=hurst_values,
            lengths=lengths,
            contaminations=contaminations,
            generators=generators,
            biomedical_scenarios=biomedical_scenarios
        )

        print(f"Generated {len(samples)} test scenarios")
        return samples

    def _estimate_classical(self, data, estimator_type):
        """Run one classical estimator; return (hurst_estimate, confidence_interval)."""
        result = self.classical_factory.estimate(data, estimator_type)
        return result.hurst_estimate, result.confidence_interval

    def _predict_ml(self, data, ml_method):
        """Run one ML predictor; return (hurst_estimate, confidence_interval)."""
        # NOTE(review): both helpers receive the models directory on every
        # call; if they reload the model each time, the measured time
        # includes loading - confirm against their implementation.
        if ml_method == 'ensemble':
            mean_est, std_est = quick_ensemble_predict(data, self.pretrained_models_dir)
            # One standard deviation either side of the ensemble mean.
            return mean_est, (mean_est - std_est, mean_est + std_est)
        return quick_predict(data, self.pretrained_models_dir, ml_method), None

    def _benchmark_method(self, samples, method_name, method_type, estimate_fn, action):
        """
        Apply one method to every sample and collect the successful results.

        ``estimate_fn(data)`` must return ``(hurst_estimate,
        confidence_interval)``. ``action`` is the verb used in the warning
        emitted for a failed sample ('estimate' or 'predict'). A sample
        whose estimate raises, or is not a finite number, is skipped with a
        warning so that one bad sample cannot abort or distort the run.
        """
        collected = []
        n_total = len(samples)
        for i, sample in enumerate(samples):
            if i % 50 == 0 and i > 0:
                print(f" Processed {i}/{n_total} samples")

            try:
                # perf_counter is monotonic, so the measured duration cannot
                # be skewed by system clock adjustments.
                start_time = time.perf_counter()
                hurst_estimate, confidence_interval = estimate_fn(sample.data)
                computation_time = time.perf_counter() - start_time

                if not np.isfinite(hurst_estimate):
                    raise ValueError(f"non-finite Hurst estimate: {hurst_estimate}")

                collected.append(BenchmarkResult(
                    method_name=method_name,
                    method_type=method_type,
                    hurst_estimate=hurst_estimate,
                    confidence_interval=confidence_interval,
                    computation_time=computation_time,
                    error=abs(hurst_estimate - sample.true_hurst),
                    metadata={
                        'true_hurst': sample.true_hurst,
                        'data_length': len(sample.data),
                        'generator': sample.generator,
                        'contamination': sample.contamination,
                        'biomedical_scenario': sample.biomedical_scenario
                    }
                ))
            except Exception as e:
                # Deliberately broad: estimators are third-party to this loop
                # and any failure should only cost the affected sample.
                warnings.warn(f"Failed to {action} with {method_name} for sample {i}: {e}")

        return collected

    def benchmark_classical_methods(self, samples: List[TimeSeriesSample]) -> Dict[str, List[BenchmarkResult]]:
        """
        Benchmark classical Hurst estimation methods.

        Parameters:
        -----------
        samples : List[TimeSeriesSample]
            Test scenarios

        Returns:
        --------
        Dict[str, List[BenchmarkResult]]
            Results for each classical method, keyed by the estimator's
            ``value``. Samples that failed are absent from the lists.
        """
        print("\nBenchmarking classical methods...")
        print(f" - Methods: {[e.value for e in self.classical_estimators]}")
        print(f" - Test scenarios: {len(samples)}")

        results = {}
        for estimator_type in self.classical_estimators:
            print(f"\n Testing {estimator_type.value}...")
            # Bind the estimator as a default so the callable does not depend
            # on the loop variable's later values.
            method_results = self._benchmark_method(
                samples,
                estimator_type.value,
                'classical',
                lambda data, _et=estimator_type: self._estimate_classical(data, _et),
                'estimate',
            )
            results[estimator_type.value] = method_results
            print(f" Completed: {len(method_results)} successful estimates")

        return results

    def benchmark_ml_methods(self, samples: List[TimeSeriesSample]) -> Dict[str, List[BenchmarkResult]]:
        """
        Benchmark machine learning methods.

        Parameters:
        -----------
        samples : List[TimeSeriesSample]
            Test scenarios

        Returns:
        --------
        Dict[str, List[BenchmarkResult]]
            Results for each ML method, keyed by method name. Samples that
            failed are absent from the lists.
        """
        print("\nBenchmarking ML methods...")
        print(f" - Methods: {self.ml_estimators}")
        print(f" - Test scenarios: {len(samples)}")

        results = {}
        for ml_method in self.ml_estimators:
            print(f"\n Testing {ml_method}...")
            method_results = self._benchmark_method(
                samples,
                ml_method,
                'ml',
                lambda data, _name=ml_method: self._predict_ml(data, _name),
                'predict',
            )
            results[ml_method] = method_results
            print(f" Completed: {len(method_results)} successful predictions")

        return results

    def run_comprehensive_benchmark(self,
                                    test_scenarios: Optional[List[TimeSeriesSample]] = None,
                                    save_results: bool = True,
                                    results_dir: Union[str, Path] = "benchmark_results") -> Dict[str, Any]:
        """
        Run comprehensive benchmark comparison.

        Parameters:
        -----------
        test_scenarios : List[TimeSeriesSample], optional
            Test scenarios to use (generated with default settings if omitted)
        save_results : bool
            Whether to save results to disk
        results_dir : str or Path
            Directory to save results

        Returns:
        --------
        Dict[str, Any]
            Complete benchmark results with keys 'test_scenarios',
            'classical_results', 'ml_results', 'all_results', 'summaries'
            and 'benchmark_config'.
        """
        print("=" * 80)
        print("COMPREHENSIVE CLASSICAL vs ML BENCHMARK")
        print("=" * 80)
        print("Author: Davian R. Chin (PhD Candidate in Biomedical Engineering, University of Reading, UK)")
        print("=" * 80)

        # Create test scenarios if not provided
        if test_scenarios is None:
            test_scenarios = self.create_test_scenarios()

        print(f"\nBenchmarking {len(test_scenarios)} test scenarios")
        print(f"Classical methods: {len(self.classical_estimators)}")
        print(f"ML methods: {len(self.ml_estimators)}")

        # Benchmark classical methods
        print(f"\n{'='*60}")
        print("CLASSICAL METHODS BENCHMARK")
        print(f"{'='*60}")
        classical_results = self.benchmark_classical_methods(test_scenarios)

        # Benchmark ML methods
        print(f"\n{'='*60}")
        print("ML METHODS BENCHMARK")
        print(f"{'='*60}")
        ml_results = self.benchmark_ml_methods(test_scenarios)

        # Combine results (an ML method sharing a classical method's name
        # would overwrite it here)
        all_results = {**classical_results, **ml_results}

        # Calculate summaries; the number of scenarios is the number of
        # attempts per method, which makes the success rate meaningful.
        print(f"\n{'='*60}")
        print("CALCULATING PERFORMANCE SUMMARIES")
        print(f"{'='*60}")
        summaries = self.calculate_summaries(all_results, n_attempted=len(test_scenarios))

        # Keep the latest run available on the instance
        self.benchmark_results = all_results
        self.summary_results = summaries

        # Create comprehensive results
        benchmark_data = {
            'test_scenarios': test_scenarios,
            'classical_results': classical_results,
            'ml_results': ml_results,
            'all_results': all_results,
            'summaries': summaries,
            'benchmark_config': {
                'classical_estimators': [e.value for e in self.classical_estimators],
                'ml_estimators': self.ml_estimators,
                'n_test_scenarios': len(test_scenarios)
            }
        }

        # Save results if requested
        if save_results:
            self.save_benchmark_results(benchmark_data, results_dir)

        # Print summary
        self.print_benchmark_summary(summaries)

        return benchmark_data

    def calculate_summaries(self,
                            results: Dict[str, List[BenchmarkResult]],
                            n_attempted: Optional[int] = None) -> Dict[str, BenchmarkSummary]:
        """
        Calculate performance summaries for all methods.

        Parameters:
        -----------
        results : Dict[str, List[BenchmarkResult]]
            Successful results per method name
        n_attempted : int, optional
            Number of samples each method was run on. When given, the
            success rate is ``len(results) / n_attempted`` (capped at 1.0);
            when omitted it cannot be determined and is reported as 1.0.

        Returns:
        --------
        Dict[str, BenchmarkSummary]
            One summary per method that has at least one result with an
            error value; methods without any are omitted.

        Notes:
        ------
        ``mean_error`` / ``std_error`` describe the signed error
        (estimate - true) whenever the true value is present in the result
        metadata, so they expose bias rather than repeating the MAE. The
        correlation is reported as 0.0 when it is undefined (fewer than two
        pairs, or no variation in the true or estimated values).
        """
        classical_names = {e.value for e in self.classical_estimators}
        summaries = {}

        for method_name, method_results in results.items():
            if not method_results:
                continue

            errors = np.asarray(
                [r.error for r in method_results if r.error is not None], dtype=float
            )
            if errors.size == 0:
                continue

            computation_times = [r.computation_time for r in method_results]

            # Build truth/estimate pairs from the same results so both arrays
            # always have equal length.
            pairs = [
                (r.metadata['true_hurst'], r.hurst_estimate)
                for r in method_results
                if r.metadata and r.metadata.get('true_hurst') is not None
            ]

            correlation = 0.0
            if pairs:
                truths, estimates = (np.asarray(col, dtype=float) for col in zip(*pairs))
                signed_errors = estimates - truths
                # corrcoef is undefined (NaN) without variation in both inputs.
                if len(pairs) > 1 and np.std(truths) > 0 and np.std(estimates) > 0:
                    correlation = float(np.corrcoef(truths, estimates)[0, 1])
            else:
                # No ground truth available: fall back to the stored errors.
                signed_errors = errors

            if n_attempted is not None and n_attempted > 0:
                success_rate = min(1.0, len(method_results) / n_attempted)
            else:
                success_rate = 1.0

            summaries[method_name] = BenchmarkSummary(
                method_name=method_name,
                method_type='classical' if method_name in classical_names else 'ml',
                n_tests=len(method_results),
                mean_error=float(np.mean(signed_errors)),
                std_error=float(np.std(signed_errors)),
                mean_absolute_error=float(np.mean(np.abs(errors))),
                root_mean_squared_error=float(np.sqrt(np.mean(errors ** 2))),
                correlation=correlation,
                mean_computation_time=float(np.mean(computation_times)),
                success_rate=success_rate,
                results=method_results
            )

        return summaries

    def print_benchmark_summary(self, summaries: Dict[str, BenchmarkSummary]) -> None:
        """
        Print benchmark summary.

        Prints a table of all methods ordered by mean absolute error, the
        best method overall and per method type, and the fastest and
        slowest methods by mean computation time.
        """
        def by_mae(item):
            return item[1].mean_absolute_error

        print(f"\n{'='*80}")
        print("BENCHMARK SUMMARY")
        print(f"{'='*80}")

        print(f"\n{'Method':<20} {'Type':<10} {'MAE':<8} {'RMSE':<8} {'Corr':<8} {'Time(ms)':<10} {'Success':<8}")
        print(f"{'-'*80}")

        for method_name, summary in sorted(summaries.items(), key=by_mae):
            print(f"{method_name:<20} {summary.method_type:<10} "
                  f"{summary.mean_absolute_error:<8.4f} {summary.root_mean_squared_error:<8.4f} "
                  f"{summary.correlation:<8.4f} {summary.mean_computation_time*1000:<10.1f} "
                  f"{summary.success_rate:<8.2f}")

        # Best performers
        print(f"\n{'='*60}")
        print("BEST PERFORMERS")
        print(f"{'='*60}")

        if summaries:
            best_overall = min(summaries.items(), key=by_mae)
            print(f"Best Overall: {best_overall[0]} (MAE: {best_overall[1].mean_absolute_error:.4f})")

            classical_summaries = {k: v for k, v in summaries.items() if v.method_type == 'classical'}
            if classical_summaries:
                best_classical = min(classical_summaries.items(), key=by_mae)
                print(f"Best Classical: {best_classical[0]} (MAE: {best_classical[1].mean_absolute_error:.4f})")

            ml_summaries = {k: v for k, v in summaries.items() if v.method_type == 'ml'}
            if ml_summaries:
                best_ml = min(ml_summaries.items(), key=by_mae)
                print(f"Best ML: {best_ml[0]} (MAE: {best_ml[1].mean_absolute_error:.4f})")
        else:
            print("No successful benchmark results to summarize.")

        # Speed comparison
        print(f"\n{'='*60}")
        print("SPEED COMPARISON")
        print(f"{'='*60}")

        if summaries:
            speed_sorted = sorted(summaries.items(), key=lambda x: x[1].mean_computation_time)
            print(f"Fastest: {speed_sorted[0][0]} ({speed_sorted[0][1].mean_computation_time*1000:.1f}ms)")
            print(f"Slowest: {speed_sorted[-1][0]} ({speed_sorted[-1][1].mean_computation_time*1000:.1f}ms)")
        else:
            print("No successful benchmark results for speed comparison.")

    def save_benchmark_results(self, benchmark_data: Dict[str, Any], results_dir: Union[str, Path]) -> None:
        """
        Save benchmark results to disk.

        Writes ``benchmark_summary.json`` (one entry per summarised method)
        and ``benchmark_results.csv`` (one row per individual result) into
        ``results_dir``, creating the directory if necessary.

        Parameters:
        -----------
        benchmark_data : Dict[str, Any]
            Mapping with at least the keys 'summaries' and 'all_results',
            as returned by ``run_comprehensive_benchmark``
        results_dir : str or Path
            Output directory
        """
        results_dir = Path(results_dir)
        results_dir.mkdir(parents=True, exist_ok=True)

        # Save summary data (plain Python numbers so json can serialise them)
        summary_data = {}
        for method_name, summary in benchmark_data['summaries'].items():
            summary_data[method_name] = {
                'method_name': summary.method_name,
                'method_type': summary.method_type,
                'n_tests': int(summary.n_tests),
                'mean_error': float(summary.mean_error),
                'std_error': float(summary.std_error),
                'mean_absolute_error': float(summary.mean_absolute_error),
                'root_mean_squared_error': float(summary.root_mean_squared_error),
                'correlation': float(summary.correlation),
                'mean_computation_time': float(summary.mean_computation_time),
                'success_rate': float(summary.success_rate)
            }

        with open(results_dir / "benchmark_summary.json", 'w', encoding='utf-8') as f:
            json.dump(summary_data, f, indent=2)

        # Save detailed results as CSV
        columns = [
            'method_name', 'method_type', 'hurst_estimate', 'true_hurst', 'error',
            'computation_time', 'data_length', 'generator', 'contamination',
            'biomedical_scenario'
        ]
        all_results = []
        for method_results in benchmark_data['all_results'].values():
            for result in method_results:
                # metadata is optional on BenchmarkResult; missing values
                # become empty cells instead of raising.
                metadata = result.metadata or {}
                all_results.append({
                    'method_name': result.method_name,
                    'method_type': result.method_type,
                    'hurst_estimate': result.hurst_estimate,
                    'true_hurst': metadata.get('true_hurst'),
                    'error': result.error,
                    'computation_time': result.computation_time,
                    'data_length': metadata.get('data_length'),
                    'generator': metadata.get('generator'),
                    'contamination': metadata.get('contamination'),
                    'biomedical_scenario': metadata.get('biomedical_scenario')
                })

        # Explicit columns keep the header row even when there are no results.
        df = pd.DataFrame(all_results, columns=columns)
        df.to_csv(results_dir / "benchmark_results.csv", index=False)

        print(f"\nBenchmark results saved to: {results_dir}")

    def create_visualizations(self, benchmark_data: Dict[str, Any], save_path: Optional[Path] = None) -> None:
        """
        Create comprehensive visualizations of benchmark results.

        Draws a 2x3 figure: MAE, computation time and correlation per
        method, the error distributions of classical and ML methods, MAE
        against computation time, and success rate per method.

        Parameters:
        -----------
        benchmark_data : Dict[str, Any]
            Mapping with at least the key 'summaries', as returned by
            ``run_comprehensive_benchmark``
        save_path : Path, optional
            File to write the figure to; parent directories are created.
            Nothing is written when omitted.
        """
        print("\nCreating benchmark visualizations...")

        summaries = benchmark_data['summaries']
        if not summaries:
            print("No successful benchmark results to visualize.")
            return

        # The seaborn style sheet was renamed across Matplotlib releases;
        # use whichever name this installation provides.
        style_name = next(
            (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
            None
        )
        if style_name is not None:
            plt.style.use(style_name)
        sns.set_palette("husl")

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Classical vs ML Methods: Comprehensive Benchmark', fontsize=16, fontweight='bold')

        methods = list(summaries.keys())
        colors = ['red' if summaries[m].method_type == 'classical' else 'blue' for m in methods]
        mae_values = [summaries[m].mean_absolute_error for m in methods]
        speed_values = [summaries[m].mean_computation_time * 1000 for m in methods]  # Convert to ms

        # 1. Performance comparison (MAE)
        ax1 = axes[0, 0]
        ax1.barh(methods, mae_values, color=colors, alpha=0.7)
        ax1.set_xlabel('Mean Absolute Error')
        ax1.set_title('Performance Comparison (MAE)')
        ax1.grid(True, alpha=0.3)

        # Add legend
        from matplotlib.patches import Patch
        legend_elements = [Patch(facecolor='red', alpha=0.7, label='Classical'),
                           Patch(facecolor='blue', alpha=0.7, label='ML')]
        ax1.legend(handles=legend_elements)

        # 2. Speed comparison
        ax2 = axes[0, 1]
        ax2.barh(methods, speed_values, color=colors, alpha=0.7)
        ax2.set_xlabel('Computation Time (ms)')
        ax2.set_title('Speed Comparison')
        ax2.grid(True, alpha=0.3)

        # 3. Correlation comparison
        ax3 = axes[0, 2]
        corr_values = [summaries[m].correlation for m in methods]
        ax3.barh(methods, corr_values, color=colors, alpha=0.7)
        ax3.set_xlabel('Correlation with True Values')
        ax3.set_title('Accuracy (Correlation)')
        ax3.grid(True, alpha=0.3)

        # 4. Error distribution
        ax4 = axes[1, 0]
        classical_errors = []
        ml_errors = []
        for summary in summaries.values():
            target = classical_errors if summary.method_type == 'classical' else ml_errors
            # Histograms cannot bin non-finite values.
            target.extend(
                r.error for r in summary.results
                if r.error is not None and np.isfinite(r.error)
            )

        # Plot whichever families have data, so a classical-only or ML-only
        # run still gets its distribution.
        for label, family_errors, color in (('Classical', classical_errors, 'red'),
                                            ('ML', ml_errors, 'blue')):
            if family_errors:
                ax4.hist(family_errors, bins=30, alpha=0.7, label=label, color=color, density=True)
        ax4.set_xlabel('Absolute Error')
        ax4.set_ylabel('Density')
        ax4.set_title('Error Distribution')
        if classical_errors or ml_errors:
            ax4.legend()
        ax4.grid(True, alpha=0.3)

        # 5. Performance vs Speed scatter
        ax5 = axes[1, 1]
        for method, color, speed, mae in zip(methods, colors, speed_values, mae_values):
            ax5.scatter(speed, mae, color=color, alpha=0.7, s=100)
            ax5.annotate(method, (speed, mae),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)
        ax5.set_xlabel('Computation Time (ms)')
        ax5.set_ylabel('Mean Absolute Error')
        ax5.set_title('Performance vs Speed')
        ax5.grid(True, alpha=0.3)

        # 6. Success rate comparison
        ax6 = axes[1, 2]
        success_rates = [summaries[m].success_rate for m in methods]
        ax6.barh(methods, success_rates, color=colors, alpha=0.7)
        ax6.set_xlabel('Success Rate')
        ax6.set_title('Reliability (Success Rate)')
        ax6.grid(True, alpha=0.3)

        fig.tight_layout()

        if save_path:
            save_path = Path(save_path)
            save_path.parent.mkdir(parents=True, exist_ok=True)
            fig.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Visualizations saved to: {save_path}")

        # This module selects the non-interactive Agg backend at import, under
        # which show() cannot display anything; only show if the backend was
        # changed afterwards. Always close so repeated calls do not
        # accumulate open figures.
        if matplotlib.get_backend().lower() != 'agg':
            plt.show()
        plt.close(fig)
def run_comprehensive_benchmark(pretrained_models_dir: Union[str, Path] = "pretrained_models",
                                results_dir: Union[str, Path] = "benchmark_results",
                                test_scenarios: Optional[List[TimeSeriesSample]] = None) -> Dict[str, Any]:
    """
    Run the full classical-vs-ML benchmark and plot its results.

    Convenience wrapper that builds a ``ClassicalMLBenchmark`` with default
    estimator selections, runs it with result saving enabled, and then
    renders the visualization into the results directory.

    Parameters:
    -----------
    pretrained_models_dir : str or Path
        Directory containing pretrained models
    results_dir : str or Path
        Directory to save results; the figure is written there as
        ``benchmark_visualization.png``
    test_scenarios : List[TimeSeriesSample], optional
        Test scenarios to use

    Returns:
    --------
    Dict[str, Any]
        Complete benchmark results, as returned by
        ``ClassicalMLBenchmark.run_comprehensive_benchmark``
    """
    runner = ClassicalMLBenchmark(pretrained_models_dir)

    # Saving is always on here; the benchmark writes into results_dir.
    benchmark_data = runner.run_comprehensive_benchmark(
        test_scenarios=test_scenarios,
        save_results=True,
        results_dir=results_dir,
    )

    # The figure goes next to the saved result files.
    figure_file = Path(results_dir) / "benchmark_visualization.png"
    runner.create_visualizations(benchmark_data, figure_file)

    return benchmark_data
if __name__ == "__main__":
    # Script entry point: run the benchmark with all default arguments.
    results = run_comprehensive_benchmark()
    print("\nBenchmark completed successfully!")