Source code for neurological_lrd_analysis.ml_baselines.inference

"""
Inference System for Pretrained ML Models.

This module provides a high-level interface for using pretrained models
for Hurst exponent estimation, including batch processing, ensemble predictions,
and uncertainty quantification.

Author: Davian R. Chin (PhD Candidate in Biomedical Engineering, University of Reading, UK)
"""

import numpy as np
from typing import Dict, List, Optional, Tuple, Union, Any
from dataclasses import dataclass
import warnings
from pathlib import Path

from .pretrained_models import PretrainedModelManager, ModelMetadata, ModelStatus
from .feature_extraction import TimeSeriesFeatureExtractor
from .ml_estimators import MLBaselineType, BaseMLEstimator


@dataclass
class PredictionResult:
    """
    Result of a single-model Hurst exponent prediction.

    Built by ``PretrainedInference.predict_single``. The batch and comparison
    helpers also create placeholder instances with ``hurst_estimate=np.nan``
    (and the remaining fields left at their defaults) when a prediction fails.
    """
    # Point estimate of the Hurst exponent; NaN marks a failed prediction.
    hurst_estimate: float
    # Interval copied from the estimator's prediction object, if it supplies one.
    # presumably ordered (lower, upper) -- confirm against the estimator.
    confidence_interval: Optional[Tuple[float, float]] = None
    # Copied from the estimator's ``prediction_uncertainty`` attribute.
    uncertainty: Optional[float] = None
    # Identifier of the model that produced the estimate.
    model_id: Optional[str] = None
    # String value of the model's MLBaselineType member.
    model_type: Optional[str] = None
    # Elapsed seconds for the whole predict_single call (model selection/loading,
    # feature extraction and inference).
    prediction_time: Optional[float] = None
    # Per-feature contributions copied from the estimator's prediction object.
    feature_contributions: Optional[Dict[str, float]] = None


@dataclass
class EnsembleResult:
    """
    Result of an ensemble prediction.

    Built by ``PretrainedInference.predict_ensemble`` from the estimates of
    every model that predicted successfully.
    """
    # Weighted average of the individual model estimates.
    mean_estimate: float
    # Weighted standard deviation of the individual estimates around the mean.
    std_estimate: float
    # Hurst estimates of the models that predicted successfully.
    individual_predictions: List[float]
    # Weight recorded per model_id, as assigned before normalisation.
    model_weights: Optional[Dict[str, float]] = None
    # (lower, upper) 95% t-based interval around the mean; None when uncertainty
    # was not requested or fewer than two predictions were available.
    confidence_interval: Optional[Tuple[float, float]] = None



[docs]
class PretrainedInference:
    """
    High-level interface for pretrained model inference.

    Wraps a ``PretrainedModelManager`` and offers Hurst exponent estimation
    from pretrained ML models: single predictions, batch processing,
    weighted ensembles, per-model-type comparison and benchmarking against
    known Hurst values.
    """
    

[docs]
    def __init__(self, models_dir: Union[str, Path] = "pretrained_models"):
        """
        Initialize the inference system.
        
        Parameters:
        -----------
        models_dir : str or Path
            Directory containing pretrained models; passed unchanged to
            ``PretrainedModelManager``.
        """
        # Used by every prediction method to list and load models.
        self.manager = PretrainedModelManager(models_dir)
        # NOTE(review): not read by any method of this class visible in this file.
        self._feature_extractor = None
        # Despite the name, this caches TimeSeriesFeatureExtractor instances,
        # keyed by the string form of a model's feature_extractor_config
        # (see _get_feature_extractor).
        self._model_cache = {}

    
    def _get_feature_extractor(self, model_metadata: ModelMetadata) -> TimeSeriesFeatureExtractor:
        """
        Return the feature extractor matching a model's configuration.

        Extractors are memoised in ``self._model_cache`` under the string
        representation of ``model_metadata.feature_extractor_config``, so
        models whose configuration has the same string form share a single
        extractor instance.
        """
        extractor_config = model_metadata.feature_extractor_config
        cache_key = str(extractor_config)

        # Fast path: an extractor for this configuration already exists.
        if cache_key in self._model_cache:
            return self._model_cache[cache_key]

        extractor = TimeSeriesFeatureExtractor(**extractor_config)
        self._model_cache[cache_key] = extractor
        return extractor
    

[docs]
    def predict_single(self, 
                      data: np.ndarray,
                      model_id: Optional[str] = None,
                      model_type: Optional[MLBaselineType] = None,
                      use_best: bool = True) -> PredictionResult:
        """
        Predict Hurst exponent for a single time series.
        
        Model selection precedence: an explicit ``model_id`` wins; otherwise
        ``model_type`` restricts the choice to that type; otherwise the
        trained model with the highest ``'validation_score'`` is used.
        
        Parameters:
        -----------
        data : np.ndarray
            Time series data
        model_id : str, optional
            Specific model ID to use
        model_type : MLBaselineType, optional
            Type of model to use (will select best if multiple available)
        use_best : bool
            Only consulted when ``model_type`` is given without ``model_id``:
            if True the manager's best model of that type is used, otherwise
            the first trained model of that type.
            
        Returns:
        --------
        PredictionResult
            Prediction result with metadata. ``prediction_time`` is the
            elapsed time in seconds for model selection/loading, feature
            extraction and inference together.
            
        Raises:
        -------
        ValueError
            If no trained model matching the request is available.
        """
        import time

        # perf_counter() is monotonic and high resolution, so the measured
        # duration cannot be distorted by system clock adjustments the way a
        # time.time() difference can.
        start_time = time.perf_counter()
        
        estimator, metadata = self._select_model(model_id, model_type, use_best)
        
        # Extract features with the configuration the model was trained with
        feature_extractor = self._get_feature_extractor(metadata)
        features = feature_extractor.extract_features(data)
        
        # The estimator is given a single-row 2-D array (one sample)
        prediction = estimator.predict(features.combined.reshape(1, -1))
        
        prediction_time = time.perf_counter() - start_time
        
        return PredictionResult(
            hurst_estimate=prediction.hurst_estimate,
            confidence_interval=prediction.confidence_interval,
            uncertainty=prediction.prediction_uncertainty,
            model_id=metadata.model_id,
            model_type=metadata.model_type.value,
            prediction_time=prediction_time,
            feature_contributions=prediction.feature_contributions
        )

    def _select_model(self,
                      model_id: Optional[str],
                      model_type: Optional[MLBaselineType],
                      use_best: bool) -> Tuple[Any, Any]:
        """Resolve the selection arguments of predict_single to (estimator, metadata)."""
        if model_id is not None:
            return self.manager.load_model(model_id)

        if model_type is not None:
            if use_best:
                return self.manager.get_best_model(model_type)

            # Get first available model of this type
            candidates = self.manager.list_models(model_type=model_type, status=ModelStatus.TRAINED)
            if not candidates:
                raise ValueError(f"No trained models of type {model_type.value} found")
            return self.manager.load_model(candidates[0].model_id)

        # No constraint given: use best overall model
        trained_models = self.manager.list_models(status=ModelStatus.TRAINED)
        if not trained_models:
            raise ValueError("No trained models available")

        # Models without a recorded validation score rank last (-inf)
        best_model = max(
            trained_models,
            key=lambda m: m.performance_metrics.get('validation_score', -np.inf)
        )
        return self.manager.load_model(best_model.model_id)

    

[docs]
    def predict_batch(self, 
                     data_list: List[np.ndarray],
                     model_id: Optional[str] = None,
                     model_type: Optional[MLBaselineType] = None,
                     use_best: bool = True,
                     show_progress: bool = True) -> List[PredictionResult]:
        """
        Predict Hurst exponents for multiple time series.
        
        Each series is passed to ``predict_single`` independently. A series
        whose prediction raises does not abort the batch: a warning is issued
        and a placeholder result with a NaN estimate takes its position, so
        the output always lines up with ``data_list``.
        
        Parameters:
        -----------
        data_list : List[np.ndarray]
            List of time series data
        model_id : str, optional
            Specific model ID to use
        model_type : MLBaselineType, optional
            Type of model to use
        use_best : bool
            Whether to use the best performing model
        show_progress : bool
            Whether to print a progress line every tenth series and a
            completion line at the end
            
        Returns:
        --------
        List[PredictionResult]
            One result per input series, in input order
        """
        outcomes = []
        
        for index, series in enumerate(data_list):
            if show_progress and index % 10 == 0:
                print(f"Processing {index+1}/{len(data_list)} time series...")
            
            try:
                outcome = self.predict_single(series, model_id, model_type, use_best)
            except Exception as exc:
                warnings.warn(f"Failed to predict for time series {index}: {exc}")
                # Placeholder keeps the output aligned with the input list
                if model_type:
                    type_label = model_type.value
                else:
                    type_label = None
                outcome = PredictionResult(
                    hurst_estimate=np.nan,
                    model_id=model_id,
                    model_type=type_label
                )
            
            outcomes.append(outcome)
        
        if show_progress:
            print(f"Completed batch prediction: {len(outcomes)} results")
        
        return outcomes

    

[docs]
    def predict_ensemble(self, 
                        data: np.ndarray,
                        model_types: Optional[List[MLBaselineType]] = None,
                        weights: Optional[Dict[str, float]] = None,
                        include_uncertainty: bool = True) -> EnsembleResult:
        """
        Predict using ensemble of models.
        
        Every trained model of the requested types is asked for a prediction
        via ``predict_single``; models that fail are skipped with a warning.
        The successful estimates are combined into a weighted mean and a
        weighted standard deviation.
        
        Parameters:
        -----------
        data : np.ndarray
            Time series data
        model_types : List[MLBaselineType], optional
            Types of models to include in ensemble (all types if None)
        weights : Dict[str, float], optional
            Weights keyed by model type value. Models whose type is not
            listed are weighted by their ``'validation_score'`` metric
            (0.0 if absent).
        include_uncertainty : bool
            Whether to compute a 95% confidence interval for the mean
            
        Returns:
        --------
        EnsembleResult
            Ensemble prediction result. ``model_weights`` holds the weight
            assigned to each successful model before normalisation;
            ``confidence_interval`` is None unless uncertainty was requested
            and at least two models predicted successfully.
            
        Raises:
        -------
        ValueError
            If no trained model is available or every model failed.
        """
        if model_types is None:
            model_types = list(MLBaselineType)
        
        # Get available models
        available_models = []
        for model_type in model_types:
            models = self.manager.list_models(model_type=model_type, status=ModelStatus.TRAINED)
            if models:
                available_models.extend(models)
        
        if not available_models:
            raise ValueError("No trained models available for ensemble")
        
        # Predictions and their weights are collected in lockstep so that a
        # failing model can never shift the weights of the models after it.
        predictions = []
        raw_weights = []
        model_weights = {}
        
        for model in available_models:
            try:
                result = self.predict_single(data, model_id=model.model_id)
                
                if weights and model.model_type.value in weights:
                    weight = weights[model.model_type.value]
                else:
                    # Use validation score as weight
                    weight = model.performance_metrics.get('validation_score', 0.0)
            except Exception as e:
                warnings.warn(f"Failed to predict with model {model.model_id}: {e}")
                continue
            
            predictions.append(result.hurst_estimate)
            raw_weights.append(weight)
            model_weights[model.model_id] = weight
        
        if not predictions:
            raise ValueError("No successful predictions for ensemble")
        
        predictions = np.array(predictions, dtype=float)
        weights_array = np.array(raw_weights, dtype=float)
        
        # Negative or non-finite weights would allow a negative weighted
        # variance (NaN std) or a NaN mean, so such models get zero weight.
        usable = np.isfinite(weights_array) & (weights_array > 0)
        weights_array[~usable] = 0.0
        
        # Normalize weights; fall back to a uniform average if none is usable
        total_weight = np.sum(weights_array)
        if total_weight > 0:
            weights_array = weights_array / total_weight
        else:
            weights_array = np.ones(len(predictions)) / len(predictions)
        
        # Calculate ensemble statistics
        mean_estimate = float(np.average(predictions, weights=weights_array))
        std_estimate = float(
            np.sqrt(np.average((predictions - mean_estimate) ** 2, weights=weights_array))
        )
        
        # Calculate confidence interval
        confidence_interval = None
        if include_uncertainty and len(predictions) > 1:
            # Use t-distribution for confidence interval
            from scipy import stats
            alpha = 0.05
            n = len(predictions)
            t_val = stats.t.ppf(1 - alpha/2, n-1)
            margin = float(t_val * std_estimate / np.sqrt(n))
            confidence_interval = (mean_estimate - margin, mean_estimate + margin)
        
        return EnsembleResult(
            mean_estimate=mean_estimate,
            std_estimate=std_estimate,
            individual_predictions=predictions.tolist(),
            model_weights=model_weights,
            confidence_interval=confidence_interval
        )

    

[docs]
    def compare_models(self, 
                      data: np.ndarray,
                      model_types: Optional[List[MLBaselineType]] = None) -> Dict[str, PredictionResult]:
        """
        Compare predictions from different model types.
        
        For each model type the best available model predicts on ``data``.
        A type whose prediction raises is reported with a warning and gets a
        placeholder result carrying a NaN estimate.
        
        Parameters:
        -----------
        data : np.ndarray
            Time series data
        model_types : List[MLBaselineType], optional
            Types of models to compare (all types if None)
            
        Returns:
        --------
        Dict[str, PredictionResult]
            Predictions keyed by model type value
        """
        selected_types = list(MLBaselineType) if model_types is None else model_types
        
        comparison = {}
        
        for kind in selected_types:
            try:
                outcome = self.predict_single(data, model_type=kind, use_best=True)
            except Exception as exc:
                warnings.warn(f"Failed to predict with {kind.value}: {exc}")
                outcome = PredictionResult(
                    hurst_estimate=np.nan,
                    model_type=kind.value
                )
            comparison[kind.value] = outcome
        
        return comparison

    

[docs]
    def get_model_info(self, model_id: Optional[str] = None) -> Union[ModelMetadata, List[ModelMetadata]]:
        """
        Get information about available models.
        
        Parameters:
        -----------
        model_id : str, optional
            Specific model ID, or None for all models
            
        Returns:
        --------
        Union[ModelMetadata, List[ModelMetadata]]
            Metadata of the requested model (looked up in the manager's
            registry regardless of status), or the list of all models with
            status TRAINED when no ID is given
            
        Raises:
        -------
        ValueError
            If ``model_id`` is given but not present in the registry
        """
        # No ID requested: report every trained model
        if model_id is None:
            return self.manager.list_models(status=ModelStatus.TRAINED)
        
        if model_id in self.manager._metadata_registry:
            return self.manager._metadata_registry[model_id]
        
        raise ValueError(f"Model {model_id} not found")

    

[docs]
    def benchmark_models(self, 
                        test_data: List[np.ndarray],
                        true_hurst: List[float],
                        model_types: Optional[List[MLBaselineType]] = None) -> Dict[str, Dict[str, float]]:
        """
        Benchmark model performance on test data.
        
        For each model type the best available model predicts every test
        series and the predictions are scored against ``true_hurst``.
        Benchmarking is best effort: any failure for a model type (including
        invalid inputs) is reported as a warning and that type's metrics are
        all NaN.
        
        Parameters:
        -----------
        test_data : List[np.ndarray]
            Test time series data
        true_hurst : List[float]
            True Hurst exponents, one per entry of ``test_data``
        model_types : List[MLBaselineType], optional
            Types of models to benchmark (all types if None)
            
        Returns:
        --------
        Dict[str, Dict[str, float]]
            Per model type value, a dict with keys ``'mse'``, ``'mae'``,
            ``'r2'``, ``'correlation'`` and ``'rmse'``. ``'r2'`` is NaN when
            the true values have zero variance; ``'correlation'`` is NaN when
            fewer than two samples are given or either series is constant.
        """
        if model_types is None:
            model_types = list(MLBaselineType)
        
        results = {}
        
        for model_type in model_types:
            try:
                # Validate before predicting: a length-1 true_hurst would
                # otherwise broadcast silently against all predictions.
                if len(test_data) != len(true_hurst):
                    raise ValueError(
                        f"test_data has {len(test_data)} series but "
                        f"true_hurst has {len(true_hurst)} values"
                    )
                if len(test_data) == 0:
                    raise ValueError("test_data is empty")
                
                # Get predictions
                predictions = np.array(
                    [
                        self.predict_single(data, model_type=model_type, use_best=True).hurst_estimate
                        for data in test_data
                    ],
                    dtype=float
                )
                true_values = np.asarray(true_hurst, dtype=float)
                
                # Calculate metrics
                errors = predictions - true_values
                mse = float(np.mean(errors ** 2))
                mae = float(np.mean(np.abs(errors)))
                
                # R^2 is undefined when the targets do not vary
                true_variance = float(np.var(true_values))
                r2 = 1.0 - mse / true_variance if true_variance > 0 else np.nan
                
                # Pearson correlation needs two or more points and variation
                # in both series; otherwise corrcoef would divide by zero.
                if len(predictions) > 1 and true_variance > 0 and np.std(predictions) > 0:
                    correlation = float(np.corrcoef(predictions, true_values)[0, 1])
                else:
                    correlation = np.nan
                
                results[model_type.value] = {
                    'mse': mse,
                    'mae': mae,
                    'r2': float(r2),
                    'correlation': float(correlation),
                    'rmse': float(np.sqrt(mse))
                }
                
            except Exception as e:
                warnings.warn(f"Failed to benchmark {model_type.value}: {e}")
                results[model_type.value] = {
                    'mse': np.nan,
                    'mae': np.nan,
                    'r2': np.nan,
                    'correlation': np.nan,
                    'rmse': np.nan
                }
        
        return results





[docs]
def quick_predict(data: np.ndarray, 
                 models_dir: Union[str, Path] = "pretrained_models",
                 model_type: Optional[MLBaselineType] = None) -> float:
    """
    Quick prediction function for single time series.
    
    Convenience wrapper that creates a fresh ``PretrainedInference`` for
    ``models_dir``, runs one ``predict_single`` call and returns only the
    point estimate.
    
    Parameters:
    -----------
    data : np.ndarray
        Time series data
    models_dir : str or Path
        Directory containing pretrained models
    model_type : MLBaselineType, optional
        Type of model to use
        
    Returns:
    --------
    float
        Predicted Hurst exponent
    """
    prediction = PretrainedInference(models_dir).predict_single(data, model_type=model_type)
    return prediction.hurst_estimate




[docs]
def quick_ensemble_predict(data: np.ndarray,
                          models_dir: Union[str, Path] = "pretrained_models",
                          model_types: Optional[List[MLBaselineType]] = None) -> Tuple[float, float]:
    """
    Quick ensemble prediction function.
    
    Convenience wrapper that creates a fresh ``PretrainedInference`` for
    ``models_dir``, runs one ``predict_ensemble`` call and returns only the
    ensemble mean and standard deviation.
    
    Parameters:
    -----------
    data : np.ndarray
        Time series data
    models_dir : str or Path
        Directory containing pretrained models
    model_types : List[MLBaselineType], optional
        Types of models to include in ensemble
        
    Returns:
    --------
    Tuple[float, float]
        (mean_estimate, std_estimate)
    """
    ensemble = PretrainedInference(models_dir).predict_ensemble(data, model_types=model_types)
    return (ensemble.mean_estimate, ensemble.std_estimate)