# Source code for neurological_lrd_analysis.ml_baselines.benchmark_comparison

"""
Comprehensive Benchmark Comparison: Classical vs ML Models.

This module provides a comprehensive benchmarking framework for comparing
classical Hurst estimation methods with machine learning baseline models.

Author: Davian R. Chin (PhD Candidate in Biomedical Engineering, University of Reading, UK)
"""

import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Optional, Tuple, Union, Any
from dataclasses import dataclass
import time
import warnings
from pathlib import Path
import json

# Import classical methods
from ..biomedical_hurst_factory import (
    BiomedicalHurstEstimatorFactory, EstimatorType, HurstResult
)

# Import ML methods
from .pretrained_models import PretrainedModelManager, ModelStatus
from .inference import PretrainedInference, quick_predict, quick_ensemble_predict

# Import data generation
from ..benchmark_core.generation import (
    generate_grid, fbm_davies_harte, TimeSeriesSample
)


@dataclass
class BenchmarkResult:
    """Result from a single benchmark test.

    One instance records what one method produced for one time series.
    """
    # Name of the estimator or model that produced the estimate.
    method_name: str
    method_type: str  # 'classical' or 'ml'
    # Estimated Hurst exponent.
    hurst_estimate: float
    # Optional (lower, upper) bounds around the estimate.
    confidence_interval: Optional[Tuple[float, float]] = None
    # Elapsed time of the estimation call; the benchmark methods in this
    # module store it in seconds.
    computation_time: float = 0.0
    # Estimation error; the benchmark methods in this module store
    # abs(hurst_estimate - true_hurst).
    error: Optional[float] = None
    # Context about the test sample. The benchmark methods in this module
    # fill in 'true_hurst', 'data_length', 'generator', 'contamination'
    # and 'biomedical_scenario'.
    metadata: Optional[Dict[str, Any]] = None


@dataclass
class BenchmarkSummary:
    """Summary of benchmark results for a method.

    Aggregates the per-sample ``BenchmarkResult`` objects of one method into
    scalar performance metrics.
    """
    # Name of the summarised estimator or model.
    method_name: str
    # 'classical' or 'ml'.
    method_type: str
    # Number of results the summary covers.
    n_tests: int
    # Mean and standard deviation of the stored per-result ``error`` values.
    mean_error: float
    std_error: float
    # Mean of the absolute per-result errors.
    mean_absolute_error: float
    # Square root of the mean squared per-result error.
    root_mean_squared_error: float
    # Correlation between true and estimated Hurst exponents.
    correlation: float
    # Mean of the per-result ``computation_time`` values.
    mean_computation_time: float
    # Success rate as computed by ClassicalMLBenchmark.calculate_summaries.
    success_rate: float
    # The individual results the summary was computed from.
    results: List[BenchmarkResult]



[docs]
class ClassicalMLBenchmark:
    """
    Comprehensive benchmark comparing classical and ML methods.
    
    Provides systematic comparison of classical Hurst estimation methods
    with machine learning baseline models across various test scenarios.

    Typical workflow: build test samples with ``create_test_scenarios``,
    run ``benchmark_classical_methods`` and ``benchmark_ml_methods`` on
    them, aggregate with ``calculate_summaries``, then report through
    ``print_benchmark_summary``, ``save_benchmark_results`` and
    ``create_visualizations``. ``run_comprehensive_benchmark`` chains the
    steps up to and including saving and printing.
    """
    

[docs]
    def __init__(self, 
                 pretrained_models_dir: Union[str, Path] = "pretrained_models",
                 classical_estimators: Optional[List[EstimatorType]] = None,
                 ml_estimators: Optional[List[str]] = None):
        """
        Set up the estimation back ends and select the methods to compare.

        Parameters:
        -----------
        pretrained_models_dir : str or Path
            Directory containing pretrained ML models
        classical_estimators : List[EstimatorType], optional
            Classical estimators to benchmark; a built-in selection of ten
            estimators is used when omitted
        ml_estimators : List[str], optional
            ML model types to benchmark; defaults to random forest, SVR and
            gradient boosting when omitted
        """
        self.pretrained_models_dir = Path(pretrained_models_dir)

        # Back ends: the classical estimator factory and the ML inference
        # system (the latter receives the directory exactly as passed in).
        self.classical_factory = BiomedicalHurstEstimatorFactory()
        self.ml_inference = PretrainedInference(pretrained_models_dir)

        # Methods under test; an explicit selection is stored as given.
        self.classical_estimators = (
            classical_estimators
            if classical_estimators is not None
            else [
                EstimatorType.DFA,
                EstimatorType.RS_ANALYSIS,
                EstimatorType.HIGUCHI,
                EstimatorType.GENERALIZED_HURST,
                EstimatorType.PERIODOGRAM,
                EstimatorType.GPH,
                EstimatorType.WHITTLE_MLE,
                EstimatorType.DWT,
                EstimatorType.ABRY_VEITCH,
                EstimatorType.MFDFA,
            ]
        )
        self.ml_estimators = (
            ml_estimators
            if ml_estimators is not None
            else ['random_forest', 'svr', 'gradient_boosting']
        )

        # Containers for results.
        self.benchmark_results = {}
        self.summary_results = {}

    

[docs]
    def create_test_scenarios(self, 
                            hurst_values: List[float] = None,
                            lengths: List[int] = None,
                            n_samples_per_config: int = 10,
                            include_contamination: bool = True,
                            include_biomedical: bool = True) -> List[TimeSeriesSample]:
        """
        Build the grid of test time series used by the benchmark.

        Parameters:
        -----------
        hurst_values : List[float], optional
            Hurst values to test; defaults to 0.1 through 0.9 in steps of 0.1
        lengths : List[int], optional
            Time series lengths; defaults to 500, 1000 and 2000
        n_samples_per_config : int
            Number of samples per configuration (currently only reported in
            the console output, see the note in the body)
        include_contamination : bool
            Whether to add the 'noise', 'missing' and 'artifacts'
            contaminations next to the clean ('none') case
        include_biomedical : bool
            Whether to request the 'eeg', 'ecg' and 'respiratory' scenarios

        Returns:
        --------
        List[TimeSeriesSample]
            Samples returned by ``generate_grid``
        """
        hurst_values = (
            [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
            if hurst_values is None else hurst_values
        )
        lengths = [500, 1000, 2000] if lengths is None else lengths

        generators = ['fbm', 'fgn', 'arfima', 'mrw', 'fou']

        # The clean case is always present; contaminated variants are opt-out.
        contaminations = ['none']
        if include_contamination:
            contaminations += ['noise', 'missing', 'artifacts']

        biomedical_scenarios = ['eeg', 'ecg', 'respiratory'] if include_biomedical else None

        # NOTE(review): n_samples_per_config is reported below but is not
        # forwarded to generate_grid - confirm whether generate_grid accepts
        # a per-configuration sample count.
        report_lines = [
            f"Creating test scenarios...",
            f"  - Hurst values: {hurst_values}",
            f"  - Lengths: {lengths}",
            f"  - Generators: {generators}",
            f"  - Contaminations: {contaminations}",
            f"  - Biomedical scenarios: {biomedical_scenarios}",
            f"  - Samples per config: {n_samples_per_config}",
        ]
        for report_line in report_lines:
            print(report_line)

        grid_options = {
            'hurst_values': hurst_values,
            'lengths': lengths,
            'contaminations': contaminations,
            'generators': generators,
            'biomedical_scenarios': biomedical_scenarios,
        }
        samples = generate_grid(**grid_options)

        print(f"Generated {len(samples)} test scenarios")
        return samples

    

[docs]
    def benchmark_classical_methods(self, 
                                  samples: List[TimeSeriesSample]) -> Dict[str, List[BenchmarkResult]]:
        """
        Benchmark classical Hurst estimation methods.

        Every estimator in ``self.classical_estimators`` is run on every
        sample through ``self.classical_factory.estimate``. A sample whose
        estimation raises is reported with ``warnings.warn`` and skipped, so
        the returned lists contain successful estimates only.

        Parameters:
        -----------
        samples : List[TimeSeriesSample]
            Test scenarios

        Returns:
        --------
        Dict[str, List[BenchmarkResult]]
            Results for each classical method, keyed by the estimator's
            ``value``; each result stores the absolute error against the
            sample's true Hurst exponent and the elapsed time in seconds
        """
        n_samples = len(samples)

        print(f"\nBenchmarking classical methods...")
        print(f"  - Methods: {[e.value for e in self.classical_estimators]}")
        print(f"  - Test scenarios: {n_samples}")

        results = {}

        for estimator_type in self.classical_estimators:
            method_name = estimator_type.value
            print(f"\n  Testing {method_name}...")
            method_results = []

            try:
                for i, sample in enumerate(samples):
                    if i % 50 == 0 and i > 0:
                        print(f"    Processed {i}/{n_samples} samples")

                    try:
                        # perf_counter is monotonic and high-resolution, so
                        # short timings are not distorted by wall-clock
                        # adjustments the way time.time() differences are.
                        start_time = time.perf_counter()

                        # Estimate Hurst exponent using factory directly
                        result = self.classical_factory.estimate(sample.data, estimator_type)

                        computation_time = time.perf_counter() - start_time

                        # Absolute deviation from the ground-truth exponent
                        error = abs(result.hurst_estimate - sample.true_hurst)

                        method_results.append(BenchmarkResult(
                            method_name=method_name,
                            method_type='classical',
                            hurst_estimate=result.hurst_estimate,
                            confidence_interval=result.confidence_interval,
                            computation_time=computation_time,
                            error=error,
                            metadata={
                                'true_hurst': sample.true_hurst,
                                'data_length': len(sample.data),
                                'generator': sample.generator,
                                'contamination': sample.contamination,
                                'biomedical_scenario': sample.biomedical_scenario
                            }
                        ))

                    except Exception as e:
                        # Best effort: one failing sample must not abort the
                        # whole method, so report it and move on.
                        warnings.warn(f"Failed to estimate with {method_name} for sample {i}: {e}")
                        continue

                results[method_name] = method_results
                print(f"    Completed: {len(method_results)} successful estimates")

            except Exception as e:
                # Safety net for failures outside the per-sample handler
                # (e.g. while iterating over ``samples``).
                print(f"    Failed to create estimator {method_name}: {e}")
                results[method_name] = []

        return results

    

[docs]
    def benchmark_ml_methods(self, 
                           samples: List[TimeSeriesSample]) -> Dict[str, List[BenchmarkResult]]:
        """
        Benchmark machine learning methods.

        Every entry of ``self.ml_estimators`` is run on every sample. The
        special name ``'ensemble'`` is routed to ``quick_ensemble_predict``
        and gets a (mean - std, mean + std) interval; any other name is
        passed to ``quick_predict`` as the model type and has no interval.
        A sample whose prediction raises is reported with ``warnings.warn``
        and skipped, so the returned lists contain successful predictions
        only.

        Parameters:
        -----------
        samples : List[TimeSeriesSample]
            Test scenarios

        Returns:
        --------
        Dict[str, List[BenchmarkResult]]
            Results for each ML method, keyed by method name; each result
            stores the absolute error against the sample's true Hurst
            exponent and the elapsed time in seconds
        """
        n_samples = len(samples)

        print(f"\nBenchmarking ML methods...")
        print(f"  - Methods: {self.ml_estimators}")
        print(f"  - Test scenarios: {n_samples}")

        results = {}

        for ml_method in self.ml_estimators:
            print(f"\n  Testing {ml_method}...")
            method_results = []

            try:
                for i, sample in enumerate(samples):
                    if i % 50 == 0 and i > 0:
                        print(f"    Processed {i}/{n_samples} samples")

                    try:
                        # perf_counter is monotonic and high-resolution, so
                        # short timings are not distorted by wall-clock
                        # adjustments the way time.time() differences are.
                        start_time = time.perf_counter()

                        if ml_method == 'ensemble':
                            # Ensemble prediction: mean and spread
                            mean_est, std_est = quick_ensemble_predict(
                                sample.data, self.pretrained_models_dir
                            )
                            hurst_estimate = mean_est
                            confidence_interval = (mean_est - std_est, mean_est + std_est)
                        else:
                            # Single model prediction
                            hurst_estimate = quick_predict(
                                sample.data, self.pretrained_models_dir, ml_method
                            )
                            confidence_interval = None

                        computation_time = time.perf_counter() - start_time

                        # Absolute deviation from the ground-truth exponent
                        error = abs(hurst_estimate - sample.true_hurst)

                        method_results.append(BenchmarkResult(
                            method_name=ml_method,
                            method_type='ml',
                            hurst_estimate=hurst_estimate,
                            confidence_interval=confidence_interval,
                            computation_time=computation_time,
                            error=error,
                            metadata={
                                'true_hurst': sample.true_hurst,
                                'data_length': len(sample.data),
                                'generator': sample.generator,
                                'contamination': sample.contamination,
                                'biomedical_scenario': sample.biomedical_scenario
                            }
                        ))

                    except Exception as e:
                        # Best effort: one failing sample must not abort the
                        # whole method, so report it and move on.
                        warnings.warn(f"Failed to predict with {ml_method} for sample {i}: {e}")
                        continue

                results[ml_method] = method_results
                print(f"    Completed: {len(method_results)} successful predictions")

            except Exception as e:
                # Safety net for failures outside the per-sample handler
                # (e.g. while iterating over ``samples``).
                print(f"    Failed to benchmark {ml_method}: {e}")
                results[ml_method] = []

        return results

    

[docs]
    def run_comprehensive_benchmark(self, 
                                  test_scenarios: Optional[List[TimeSeriesSample]] = None,
                                  save_results: bool = True,
                                  results_dir: Union[str, Path] = "benchmark_results") -> Dict[str, Any]:
        """
        Run the full classical-vs-ML comparison from start to finish.

        Benchmarks the classical methods, then the ML methods, aggregates
        both into per-method summaries, optionally saves everything and
        prints the summary report.

        Parameters:
        -----------
        test_scenarios : List[TimeSeriesSample], optional
            Test scenarios to use; generated with ``create_test_scenarios``
            when omitted
        save_results : bool
            Whether to save results to disk
        results_dir : str or Path
            Directory to save results

        Returns:
        --------
        Dict[str, Any]
            Dictionary with the keys 'test_scenarios', 'classical_results',
            'ml_results', 'all_results', 'summaries' and 'benchmark_config'
        """
        title_rule = "=" * 80
        section_rule = '=' * 60

        def announce(section_title):
            # Section banner: blank line, rule, title, rule.
            print(f"\n{section_rule}")
            print(section_title)
            print(f"{section_rule}")

        print(title_rule)
        print("COMPREHENSIVE CLASSICAL vs ML BENCHMARK")
        print(title_rule)
        print("Author: Davian R. Chin (PhD Candidate in Biomedical Engineering, University of Reading, UK)")
        print(title_rule)

        # Fall back to the default scenario grid.
        if test_scenarios is None:
            test_scenarios = self.create_test_scenarios()

        print(f"\nBenchmarking {len(test_scenarios)} test scenarios")
        print(f"Classical methods: {len(self.classical_estimators)}")
        print(f"ML methods: {len(self.ml_estimators)}")

        announce("CLASSICAL METHODS BENCHMARK")
        classical_results = self.benchmark_classical_methods(test_scenarios)

        announce("ML METHODS BENCHMARK")
        ml_results = self.benchmark_ml_methods(test_scenarios)

        # ML entries win if a name occurs in both result sets.
        all_results = dict(classical_results)
        all_results.update(ml_results)

        announce("CALCULATING PERFORMANCE SUMMARIES")
        summaries = self.calculate_summaries(all_results)

        benchmark_config = {
            'classical_estimators': [e.value for e in self.classical_estimators],
            'ml_estimators': self.ml_estimators,
            'n_test_scenarios': len(test_scenarios)
        }
        benchmark_data = {
            'test_scenarios': test_scenarios,
            'classical_results': classical_results,
            'ml_results': ml_results,
            'all_results': all_results,
            'summaries': summaries,
            'benchmark_config': benchmark_config
        }

        if save_results:
            self.save_benchmark_results(benchmark_data, results_dir)

        self.print_benchmark_summary(summaries)

        return benchmark_data

    

[docs]
    def calculate_summaries(self,
                            results: Dict[str, List[BenchmarkResult]],
                            n_attempted: Optional[int] = None) -> Dict[str, BenchmarkSummary]:
        """
        Calculate performance summaries for all methods.

        Methods without any result, or without any result carrying an
        ``error`` value, are left out of the returned dictionary.

        Parameters:
        -----------
        results : Dict[str, List[BenchmarkResult]]
            Per-method benchmark results
        n_attempted : int, optional
            Number of samples each method was attempted on. When given, the
            success rate is measured against it; otherwise it is measured
            against the number of results recorded for the method.

        Returns:
        --------
        Dict[str, BenchmarkSummary]
            One summary per method that has at least one scored result
        """
        # Built once instead of once per method.
        classical_names = {e.value for e in self.classical_estimators}

        summaries = {}

        for method_name, method_results in results.items():
            if not method_results:
                continue

            # Only results that carry an error value can be scored.
            scored = [r for r in method_results if r.error is not None]
            if not scored:
                continue

            errors = np.asarray([r.error for r in scored], dtype=float)
            computation_times = [r.computation_time for r in method_results]

            # Take the true value and the estimate from the same result so
            # the two series always have equal length.
            pairs = [
                (r.metadata['true_hurst'], r.hurst_estimate)
                for r in method_results
                if r.metadata and r.metadata.get('true_hurst') is not None
            ]

            # The correlation is undefined for fewer than two pairs or for a
            # constant series (np.corrcoef would return NaN); report 0.0.
            correlation = 0.0
            if len(pairs) > 1:
                true_hurst = np.asarray([p[0] for p in pairs], dtype=float)
                estimates = np.asarray([p[1] for p in pairs], dtype=float)
                if np.std(true_hurst) > 0 and np.std(estimates) > 0:
                    correlation = float(np.corrcoef(true_hurst, estimates)[0, 1])

            # Share of attempts that produced a scored result (the previous
            # len(x) / len(x) expression was always 1.0).
            attempts = n_attempted if n_attempted else len(method_results)
            success_rate = len(scored) / attempts

            method_type = 'classical' if method_name in classical_names else 'ml'

            summaries[method_name] = BenchmarkSummary(
                method_name=method_name,
                method_type=method_type,
                n_tests=len(method_results),
                mean_error=float(np.mean(errors)),
                std_error=float(np.std(errors)),
                mean_absolute_error=float(np.mean(np.abs(errors))),
                root_mean_squared_error=float(np.sqrt(np.mean(errors ** 2))),
                correlation=correlation,
                mean_computation_time=float(np.mean(computation_times)),
                success_rate=success_rate,
                results=method_results
            )

        return summaries

    

[docs]
    def print_benchmark_summary(self, summaries: Dict[str, BenchmarkSummary]) -> None:
        """
        Print the benchmark report to standard output.

        The report consists of a table of all methods ordered by mean
        absolute error, the best method overall and per method type, and the
        fastest and slowest method.
        """
        def mae_of(entry):
            return entry[1].mean_absolute_error

        def time_of(entry):
            return entry[1].mean_computation_time

        def best_of_type(wanted_type):
            # Lowest-MAE entry of the given type, or None if there is none.
            candidates = (entry for entry in summaries.items()
                          if entry[1].method_type == wanted_type)
            return min(candidates, key=mae_of, default=None)

        wide_rule = '=' * 80
        narrow_rule = '=' * 60

        print(f"\n{wide_rule}")
        print("BENCHMARK SUMMARY")
        print(wide_rule)

        # Table, most accurate method first.
        print(f"\n{'Method':<20} {'Type':<10} {'MAE':<8} {'RMSE':<8} {'Corr':<8} {'Time(ms)':<10} {'Success':<8}")
        print('-' * 80)

        for name, entry in sorted(summaries.items(), key=mae_of):
            cells = [
                f"{name:<20}",
                f"{entry.method_type:<10}",
                f"{entry.mean_absolute_error:<8.4f}",
                f"{entry.root_mean_squared_error:<8.4f}",
                f"{entry.correlation:<8.4f}",
                f"{entry.mean_computation_time*1000:<10.1f}",
                f"{entry.success_rate:<8.2f}",
            ]
            print(" ".join(cells))

        print(f"\n{narrow_rule}")
        print("BEST PERFORMERS")
        print(narrow_rule)

        if not summaries:
            print("No successful benchmark results to summarize.")
        else:
            winner = min(summaries.items(), key=mae_of)
            print(f"Best Overall: {winner[0]} (MAE: {winner[1].mean_absolute_error:.4f})")

            top_classical = best_of_type('classical')
            if top_classical is not None:
                print(f"Best Classical: {top_classical[0]} (MAE: {top_classical[1].mean_absolute_error:.4f})")

            top_ml = best_of_type('ml')
            if top_ml is not None:
                print(f"Best ML: {top_ml[0]} (MAE: {top_ml[1].mean_absolute_error:.4f})")

        print(f"\n{narrow_rule}")
        print("SPEED COMPARISON")
        print(narrow_rule)

        if not summaries:
            print("No successful benchmark results for speed comparison.")
        else:
            by_speed = sorted(summaries.items(), key=time_of)
            quickest = by_speed[0]
            slowest = by_speed[-1]
            print(f"Fastest: {quickest[0]} ({quickest[1].mean_computation_time*1000:.1f}ms)")
            print(f"Slowest: {slowest[0]} ({slowest[1].mean_computation_time*1000:.1f}ms)")

    

[docs]
    def save_benchmark_results(self, 
                             benchmark_data: Dict[str, Any], 
                             results_dir: Union[str, Path]) -> None:
        """
        Save benchmark results to disk.

        Writes two files into ``results_dir`` (created if missing):
        ``benchmark_summary.json`` with the per-method summary metrics and
        ``benchmark_results.csv`` with one row per individual result.

        Parameters:
        -----------
        benchmark_data : Dict[str, Any]
            Benchmark output; the 'summaries' and 'all_results' entries are
            read
        results_dir : str or Path
            Directory to save results
        """
        results_dir = Path(results_dir)
        results_dir.mkdir(parents=True, exist_ok=True)

        # Coerce metrics to built-in numbers so json.dump does not depend on
        # which numeric scalar type the metrics were computed with.
        summary_data = {
            method_name: {
                'method_name': summary.method_name,
                'method_type': summary.method_type,
                'n_tests': int(summary.n_tests),
                'mean_error': float(summary.mean_error),
                'std_error': float(summary.std_error),
                'mean_absolute_error': float(summary.mean_absolute_error),
                'root_mean_squared_error': float(summary.root_mean_squared_error),
                'correlation': float(summary.correlation),
                'mean_computation_time': float(summary.mean_computation_time),
                'success_rate': float(summary.success_rate)
            }
            for method_name, summary in benchmark_data['summaries'].items()
        }

        # Explicit encoding keeps the file independent of the platform locale.
        with open(results_dir / "benchmark_summary.json", 'w', encoding='utf-8') as f:
            json.dump(summary_data, f, indent=2)

        # Detailed per-result table; fixing the column list means the header
        # is written even when there are no rows.
        columns = [
            'method_name', 'method_type', 'hurst_estimate', 'true_hurst',
            'error', 'computation_time', 'data_length', 'generator',
            'contamination', 'biomedical_scenario'
        ]
        rows = []
        for method_results in benchmark_data['all_results'].values():
            for result in method_results:
                # metadata is optional on BenchmarkResult; missing entries
                # become empty cells instead of aborting the export.
                metadata = result.metadata or {}
                rows.append({
                    'method_name': result.method_name,
                    'method_type': result.method_type,
                    'hurst_estimate': result.hurst_estimate,
                    'true_hurst': metadata.get('true_hurst'),
                    'error': result.error,
                    'computation_time': result.computation_time,
                    'data_length': metadata.get('data_length'),
                    'generator': metadata.get('generator'),
                    'contamination': metadata.get('contamination'),
                    'biomedical_scenario': metadata.get('biomedical_scenario')
                })

        df = pd.DataFrame(rows, columns=columns)
        df.to_csv(results_dir / "benchmark_results.csv", index=False)

        print(f"\nBenchmark results saved to: {results_dir}")

    

[docs]
    def create_visualizations(self, 
                            benchmark_data: Dict[str, Any],
                            save_path: Optional[Path] = None) -> None:
        """
        Create comprehensive visualizations of benchmark results.

        Draws a 2x3 panel figure: MAE, computation time and correlation per
        method, the error distribution per method type, accuracy against
        speed, and the success rate per method. Classical methods are drawn
        in red, ML methods in blue.

        Parameters:
        -----------
        benchmark_data : Dict[str, Any]
            Benchmark output; only the 'summaries' entry is read
        save_path : Path, optional
            File to save the figure to; nothing is saved when omitted
        """
        from matplotlib.patches import Patch

        print(f"\nCreating benchmark visualizations...")

        # Set up plotting
        plt.style.use('seaborn-v0_8')
        sns.set_palette("husl")

        summaries = benchmark_data['summaries']
        methods = list(summaries.keys())
        colors = ['red' if summaries[m].method_type == 'classical' else 'blue' for m in methods]
        mae_values = [summaries[m].mean_absolute_error for m in methods]
        speed_values = [summaries[m].mean_computation_time * 1000 for m in methods]  # Convert to ms

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        try:
            fig.suptitle('Classical vs ML Methods: Comprehensive Benchmark', fontsize=16, fontweight='bold')

            # 1. Performance comparison (MAE)
            ax1 = axes[0, 0]
            ax1.barh(methods, mae_values, color=colors, alpha=0.7)
            ax1.set_xlabel('Mean Absolute Error')
            ax1.set_title('Performance Comparison (MAE)')
            ax1.grid(True, alpha=0.3)
            legend_elements = [Patch(facecolor='red', alpha=0.7, label='Classical'),
                              Patch(facecolor='blue', alpha=0.7, label='ML')]
            ax1.legend(handles=legend_elements)

            # 2. Speed comparison
            ax2 = axes[0, 1]
            ax2.barh(methods, speed_values, color=colors, alpha=0.7)
            ax2.set_xlabel('Computation Time (ms)')
            ax2.set_title('Speed Comparison')
            ax2.grid(True, alpha=0.3)

            # 3. Correlation comparison
            ax3 = axes[0, 2]
            corr_values = [summaries[m].correlation for m in methods]
            ax3.barh(methods, corr_values, color=colors, alpha=0.7)
            ax3.set_xlabel('Correlation with True Values')
            ax3.set_title('Accuracy (Correlation)')
            ax3.grid(True, alpha=0.3)

            # 4. Error distribution, pooled per method type
            ax4 = axes[1, 0]
            classical_errors = []
            ml_errors = []
            for summary in summaries.values():
                target = classical_errors if summary.method_type == 'classical' else ml_errors
                target.extend(r.error for r in summary.results if r.error is not None)

            # Draw whichever group has data; previously the panel stayed
            # empty unless both groups were present.
            if classical_errors:
                ax4.hist(classical_errors, bins=30, alpha=0.7, label='Classical', color='red', density=True)
            if ml_errors:
                ax4.hist(ml_errors, bins=30, alpha=0.7, label='ML', color='blue', density=True)
            if classical_errors or ml_errors:
                ax4.set_xlabel('Absolute Error')
                ax4.set_ylabel('Density')
                ax4.set_title('Error Distribution')
                ax4.legend()
                ax4.grid(True, alpha=0.3)

            # 5. Performance vs Speed scatter
            ax5 = axes[1, 1]
            for method, color, speed, mae in zip(methods, colors, speed_values, mae_values):
                ax5.scatter(speed, mae, color=color, alpha=0.7, s=100)
                ax5.annotate(method, (speed, mae),
                            xytext=(5, 5), textcoords='offset points', fontsize=8)
            ax5.set_xlabel('Computation Time (ms)')
            ax5.set_ylabel('Mean Absolute Error')
            ax5.set_title('Performance vs Speed')
            ax5.grid(True, alpha=0.3)

            # 6. Success rate comparison
            ax6 = axes[1, 2]
            success_rates = [summaries[m].success_rate for m in methods]
            ax6.barh(methods, success_rates, color=colors, alpha=0.7)
            ax6.set_xlabel('Success Rate')
            ax6.set_title('Reliability (Success Rate)')
            ax6.grid(True, alpha=0.3)

            plt.tight_layout()

            if save_path:
                plt.savefig(save_path, dpi=300, bbox_inches='tight')
                print(f"Visualizations saved to: {save_path}")

            plt.show()
        finally:
            # pyplot keeps every figure alive until it is closed; release it
            # so repeated calls do not accumulate figures in memory.
            plt.close(fig)





[docs]
def run_comprehensive_benchmark(pretrained_models_dir: Union[str, Path] = "pretrained_models",
                               results_dir: Union[str, Path] = "benchmark_results",
                               test_scenarios: Optional[List[TimeSeriesSample]] = None) -> Dict[str, Any]:
    """
    Convenience entry point: benchmark, save and visualise in one call.

    Builds a ``ClassicalMLBenchmark`` with its default method selection,
    runs it with saving enabled and writes the overview figure to
    ``benchmark_visualization.png`` inside ``results_dir``.

    Parameters:
    -----------
    pretrained_models_dir : str or Path
        Directory containing pretrained models
    results_dir : str or Path
        Directory to save results
    test_scenarios : List[TimeSeriesSample], optional
        Test scenarios to use

    Returns:
    --------
    Dict[str, Any]
        Complete benchmark results
    """
    runner = ClassicalMLBenchmark(pretrained_models_dir)

    benchmark_data = runner.run_comprehensive_benchmark(
        test_scenarios=test_scenarios,
        save_results=True,
        results_dir=results_dir
    )

    # The figure is stored next to the saved result files.
    figure_file = Path(results_dir) / "benchmark_visualization.png"
    runner.create_visualizations(benchmark_data, figure_file)

    return benchmark_data



if __name__ == "__main__":
    # Script entry point: run the benchmark with the default model and
    # results directories, then confirm completion on the console.
    results = run_comprehensive_benchmark()
    print("\nBenchmark completed successfully!")