Source code for scripts.calibration.tb_calibration_sweep

"""
TB Model Calibration Parameter Sweep for South Africa

This script performs a systematic parameter sweep to find optimal calibration parameters
that best match South Africa TB data including case notifications and age-stratified prevalence.
"""

import numpy as np
import pandas as pd
import datetime
import time
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from tb_calibration_south_africa import (
    create_south_africa_data, 
    run_calibration_simulation,
    compute_case_notifications,
    compute_age_stratified_prevalence,
    create_calibration_report
)


[docs] def calculate_calibration_score(sim, sa_data): """ Calculate a composite calibration score based on multiple metrics Args: sim: Simulation object sa_data: South Africa data dictionary Returns: dict: Calibration metrics and composite score """ # Compute model outputs notifications = compute_case_notifications(sim) age_prevalence = compute_age_stratified_prevalence(sim) # Case notification fit years = list(notifications.keys()) model_rates = np.array([notifications[year]['rate_per_100k'] for year in years]) data_rates = sa_data['case_notifications']['rate_per_100k'].values notification_rmse = np.sqrt(np.mean((model_rates - data_rates)**2)) notification_mape = np.mean(np.abs((model_rates - data_rates) / data_rates)) * 100 # Age prevalence fit age_groups = list(age_prevalence.keys()) model_age_prev = np.array([age_prevalence[group]['prevalence_per_100k'] for group in age_groups]) data_age_prev = sa_data['age_prevalence']['prevalence_per_100k'].values age_prev_rmse = np.sqrt(np.mean((model_age_prev - data_age_prev)**2)) age_prev_mape = np.mean(np.abs((model_age_prev - data_age_prev) / data_age_prev)) * 100 # Overall prevalence fit time_years = np.array([d.year for d in sim.results['timevec']]) active_prev = sim.results['tb']['prevalence_active'] target_idx = np.argmin(np.abs(time_years - 2018)) model_overall_prev = active_prev[target_idx] * 100 target_overall_prev = sa_data['targets']['overall_prevalence_2018'] overall_prev_error = abs(model_overall_prev - target_overall_prev) # Composite score (lower is better) # Weight different components based on importance composite_score = ( 0.4 * notification_mape + # Case notifications (40% weight) 0.4 * age_prev_mape + # Age prevalence (40% weight) 0.2 * (overall_prev_error * 100) # Overall prevalence (20% weight) ) return { 'notification_rmse': notification_rmse, 'notification_mape': notification_mape, 'age_prev_rmse': age_prev_rmse, 'age_prev_mape': age_prev_mape, 'overall_prev_error': overall_prev_error, 'model_overall_prev': model_overall_prev, 'target_overall_prev': target_overall_prev, 'composite_score': composite_score }
[docs] def run_calibration_sweep(beta_range, rel_sus_range, tb_mortality_range, n_agents=500, years=150, max_simulations=50): """ Run a systematic parameter sweep for calibration Args: beta_range: Array of beta values to test rel_sus_range: Array of relative susceptibility values to test tb_mortality_range: Array of TB mortality values to test n_agents: Number of agents per simulation years: Simulation duration max_simulations: Maximum number of simulations to run Returns: dict: Sweep results with best parameters """ print("Starting TB Model Calibration Parameter Sweep...") print(f"Parameter ranges:") print(f" Beta: {beta_range}") print(f" Rel Sus: {rel_sus_range}") print(f" TB Mortality: {tb_mortality_range}") print(f" Max simulations: {max_simulations}") # Create South Africa data sa_data = create_south_africa_data() # Initialize results storage results = [] best_score = float('inf') best_params = None best_sim = None # Calculate total combinations total_combinations = len(beta_range) * len(rel_sus_range) * len(tb_mortality_range) simulations_run = 0 print(f"\nTotal parameter combinations: {total_combinations}") print("Running simulations...") start_time = time.time() for beta in beta_range: for rel_sus in rel_sus_range: for tb_mortality in tb_mortality_range: if simulations_run >= max_simulations: print(f"\nReached maximum simulations ({max_simulations})") break simulations_run += 1 print(f"Simulation {simulations_run}/{min(total_combinations, max_simulations)}: " f"β={beta:.4f}, rel_sus={rel_sus:.3f}, mort={tb_mortality:.1e}") try: # Run simulation sim = run_calibration_simulation( beta=beta, rel_sus_latentslow=rel_sus, tb_mortality=tb_mortality, n_agents=n_agents, years=years ) # Calculate calibration score score_metrics = calculate_calibration_score(sim, sa_data) # Store results result = { 'beta': beta, 'rel_sus_latentslow': rel_sus, 'tb_mortality': tb_mortality, 'simulation_number': simulations_run, 'score_metrics': score_metrics, 'composite_score': score_metrics['composite_score'] } results.append(result) # Update best if better if score_metrics['composite_score'] < best_score: best_score = score_metrics['composite_score'] best_params = (beta, rel_sus, tb_mortality) best_sim = sim print(f" ✓ New best score: {best_score:.2f}") except Exception as e: print(f" ✗ Simulation failed: {e}") continue if simulations_run >= max_simulations: break if simulations_run >= max_simulations: break end_time = time.time() elapsed_time = end_time - start_time # Create results summary results_df = pd.DataFrame(results) # Sort by composite score results_df = results_df.sort_values('composite_score') # Create sweep summary sweep_summary = { 'timestamp': datetime.datetime.now().strftime("%Y_%m_%d_%H%M"), 'parameter_ranges': { 'beta_range': beta_range.tolist(), 'rel_sus_range': rel_sus_range.tolist(), 'tb_mortality_range': tb_mortality_range.tolist() }, 'simulation_settings': { 'n_agents': n_agents, 'years': years, 'max_simulations': max_simulations, 'simulations_run': simulations_run, 'elapsed_time_minutes': elapsed_time / 60 }, 'best_parameters': { 'beta': best_params[0] if best_params else None, 'rel_sus_latentslow': best_params[1] if best_params else None, 'tb_mortality': best_params[2] if best_params else None, 'composite_score': best_score }, 'top_10_results': results_df.head(10).to_dict('records') } return sweep_summary, results_df, best_sim, sa_data
[docs] def plot_sweep_results(results_df, timestamp): """ Create plots showing sweep results Args: results_df: DataFrame with sweep results timestamp: Timestamp for file naming """ import matplotlib.pyplot as plt fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12)) # 1. Score distribution ax1.hist(results_df['composite_score'], bins=20, alpha=0.7, color='skyblue', edgecolor='black') ax1.set_xlabel('Composite Calibration Score') ax1.set_ylabel('Number of Simulations') ax1.set_title('Distribution of Calibration Scores') ax1.grid(True, alpha=0.3) # 2. Beta vs Score ax2.scatter(results_df['beta'], results_df['composite_score'], alpha=0.6, s=30) ax2.set_xlabel('Beta (Transmission Rate)') ax2.set_ylabel('Composite Score') ax2.set_title('Beta vs Calibration Score') ax2.grid(True, alpha=0.3) # 3. Rel Sus vs Score ax3.scatter(results_df['rel_sus_latentslow'], results_df['composite_score'], alpha=0.6, s=30) ax3.set_xlabel('Relative Susceptibility (Latent)') ax3.set_ylabel('Composite Score') ax3.set_title('Relative Susceptibility vs Calibration Score') ax3.grid(True, alpha=0.3) # 4. TB Mortality vs Score ax4.scatter(results_df['tb_mortality'], results_df['composite_score'], alpha=0.6, s=30) ax4.set_xlabel('TB Mortality Rate') ax4.set_ylabel('Composite Score') ax4.set_title('TB Mortality vs Calibration Score') ax4.grid(True, alpha=0.3) plt.tight_layout() plt.suptitle('TB Model Calibration Parameter Sweep Results', fontsize=16, y=1.02) filename = f"calibration_sweep_results_{timestamp}.pdf" plt.savefig(filename, dpi=300, bbox_inches='tight') plt.show() return fig
[docs] def main(): """ Main function to run the calibration sweep """ print("=== TB Model Calibration Parameter Sweep ===") # Define parameter ranges for sweep # Start with broader ranges and can refine later beta_range = np.array([0.010, 0.015, 0.020, 0.025, 0.030]) rel_sus_range = np.array([0.05, 0.10, 0.15, 0.20, 0.25]) tb_mortality_range = np.array([1e-4, 2e-4, 3e-4, 4e-4, 5e-4]) # Run sweep sweep_summary, results_df, best_sim, sa_data = run_calibration_sweep( beta_range=beta_range, rel_sus_range=rel_sus_range, tb_mortality_range=tb_mortality_range, n_agents=500, # Smaller population for faster sweep years=150, # Shorter simulation for faster sweep max_simulations=50 # Limit total simulations ) # Generate timestamp timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H%M") # Save results results_df.to_csv(f"calibration_sweep_results_{timestamp}.csv", index=False) # Save sweep summary with open(f"calibration_sweep_summary_{timestamp}.json", 'w') as f: json.dump(sweep_summary, f, indent=2, default=str) # Create plots plot_sweep_results(results_df, timestamp) # Create detailed report for best simulation if best_sim is not None: print(f"\nCreating detailed report for best parameters...") report = create_calibration_report(best_sim, sa_data, f"{timestamp}_best") # Print best results print(f"\n=== BEST CALIBRATION PARAMETERS ===") print(f"Beta: {sweep_summary['best_parameters']['beta']:.4f}") print(f"Relative Susceptibility: {sweep_summary['best_parameters']['rel_sus_latentslow']:.3f}") print(f"TB Mortality: {sweep_summary['best_parameters']['tb_mortality']:.1e}") print(f"Composite Score: {sweep_summary['best_parameters']['composite_score']:.2f}") print(f"================================") # Print top 5 results print(f"\n=== TOP 5 CALIBRATION RESULTS ===") for i, result in enumerate(sweep_summary['top_10_results'][:5]): print(f"{i+1}. β={result['beta']:.4f}, rel_sus={result['rel_sus_latentslow']:.3f}, " f"mort={result['tb_mortality']:.1e}, score={result['composite_score']:.2f}") print(f"\nCalibration sweep completed!") print(f"Files created with timestamp: {timestamp}") return sweep_summary, results_df, best_sim
if __name__ == '__main__': sweep_summary, results_df, best_sim = main()