"""
TB Model Calibration Parameter Sweep for South Africa
This script performs a systematic parameter sweep to find optimal calibration parameters
that best match South Africa TB data including case notifications and age-stratified prevalence.
"""
import numpy as np
import pandas as pd
import datetime
import time
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from tb_calibration_south_africa import (
create_south_africa_data,
run_calibration_simulation,
compute_case_notifications,
compute_age_stratified_prevalence,
create_calibration_report
)
[docs]
def calculate_calibration_score(sim, sa_data):
"""
Calculate a composite calibration score based on multiple metrics
Args:
sim: Simulation object
sa_data: South Africa data dictionary
Returns:
dict: Calibration metrics and composite score
"""
# Compute model outputs
notifications = compute_case_notifications(sim)
age_prevalence = compute_age_stratified_prevalence(sim)
# Case notification fit
years = list(notifications.keys())
model_rates = np.array([notifications[year]['rate_per_100k'] for year in years])
data_rates = sa_data['case_notifications']['rate_per_100k'].values
notification_rmse = np.sqrt(np.mean((model_rates - data_rates)**2))
notification_mape = np.mean(np.abs((model_rates - data_rates) / data_rates)) * 100
# Age prevalence fit
age_groups = list(age_prevalence.keys())
model_age_prev = np.array([age_prevalence[group]['prevalence_per_100k'] for group in age_groups])
data_age_prev = sa_data['age_prevalence']['prevalence_per_100k'].values
age_prev_rmse = np.sqrt(np.mean((model_age_prev - data_age_prev)**2))
age_prev_mape = np.mean(np.abs((model_age_prev - data_age_prev) / data_age_prev)) * 100
# Overall prevalence fit
time_years = np.array([d.year for d in sim.results['timevec']])
active_prev = sim.results['tb']['prevalence_active']
target_idx = np.argmin(np.abs(time_years - 2018))
model_overall_prev = active_prev[target_idx] * 100
target_overall_prev = sa_data['targets']['overall_prevalence_2018']
overall_prev_error = abs(model_overall_prev - target_overall_prev)
# Composite score (lower is better)
# Weight different components based on importance
composite_score = (
0.4 * notification_mape + # Case notifications (40% weight)
0.4 * age_prev_mape + # Age prevalence (40% weight)
0.2 * (overall_prev_error * 100) # Overall prevalence (20% weight)
)
return {
'notification_rmse': notification_rmse,
'notification_mape': notification_mape,
'age_prev_rmse': age_prev_rmse,
'age_prev_mape': age_prev_mape,
'overall_prev_error': overall_prev_error,
'model_overall_prev': model_overall_prev,
'target_overall_prev': target_overall_prev,
'composite_score': composite_score
}
[docs]
def run_calibration_sweep(beta_range, rel_sus_range, tb_mortality_range,
n_agents=500, years=150, max_simulations=50):
"""
Run a systematic parameter sweep for calibration
Args:
beta_range: Array of beta values to test
rel_sus_range: Array of relative susceptibility values to test
tb_mortality_range: Array of TB mortality values to test
n_agents: Number of agents per simulation
years: Simulation duration
max_simulations: Maximum number of simulations to run
Returns:
dict: Sweep results with best parameters
"""
print("Starting TB Model Calibration Parameter Sweep...")
print(f"Parameter ranges:")
print(f" Beta: {beta_range}")
print(f" Rel Sus: {rel_sus_range}")
print(f" TB Mortality: {tb_mortality_range}")
print(f" Max simulations: {max_simulations}")
# Create South Africa data
sa_data = create_south_africa_data()
# Initialize results storage
results = []
best_score = float('inf')
best_params = None
best_sim = None
# Calculate total combinations
total_combinations = len(beta_range) * len(rel_sus_range) * len(tb_mortality_range)
simulations_run = 0
print(f"\nTotal parameter combinations: {total_combinations}")
print("Running simulations...")
start_time = time.time()
for beta in beta_range:
for rel_sus in rel_sus_range:
for tb_mortality in tb_mortality_range:
if simulations_run >= max_simulations:
print(f"\nReached maximum simulations ({max_simulations})")
break
simulations_run += 1
print(f"Simulation {simulations_run}/{min(total_combinations, max_simulations)}: "
f"β={beta:.4f}, rel_sus={rel_sus:.3f}, mort={tb_mortality:.1e}")
try:
# Run simulation
sim = run_calibration_simulation(
beta=beta,
rel_sus_latentslow=rel_sus,
tb_mortality=tb_mortality,
n_agents=n_agents,
years=years
)
# Calculate calibration score
score_metrics = calculate_calibration_score(sim, sa_data)
# Store results
result = {
'beta': beta,
'rel_sus_latentslow': rel_sus,
'tb_mortality': tb_mortality,
'simulation_number': simulations_run,
'score_metrics': score_metrics,
'composite_score': score_metrics['composite_score']
}
results.append(result)
# Update best if better
if score_metrics['composite_score'] < best_score:
best_score = score_metrics['composite_score']
best_params = (beta, rel_sus, tb_mortality)
best_sim = sim
print(f" ✓ New best score: {best_score:.2f}")
except Exception as e:
print(f" ✗ Simulation failed: {e}")
continue
if simulations_run >= max_simulations:
break
if simulations_run >= max_simulations:
break
end_time = time.time()
elapsed_time = end_time - start_time
# Create results summary
results_df = pd.DataFrame(results)
# Sort by composite score
results_df = results_df.sort_values('composite_score')
# Create sweep summary
sweep_summary = {
'timestamp': datetime.datetime.now().strftime("%Y_%m_%d_%H%M"),
'parameter_ranges': {
'beta_range': beta_range.tolist(),
'rel_sus_range': rel_sus_range.tolist(),
'tb_mortality_range': tb_mortality_range.tolist()
},
'simulation_settings': {
'n_agents': n_agents,
'years': years,
'max_simulations': max_simulations,
'simulations_run': simulations_run,
'elapsed_time_minutes': elapsed_time / 60
},
'best_parameters': {
'beta': best_params[0] if best_params else None,
'rel_sus_latentslow': best_params[1] if best_params else None,
'tb_mortality': best_params[2] if best_params else None,
'composite_score': best_score
},
'top_10_results': results_df.head(10).to_dict('records')
}
return sweep_summary, results_df, best_sim, sa_data
[docs]
def plot_sweep_results(results_df, timestamp):
"""
Create plots showing sweep results
Args:
results_df: DataFrame with sweep results
timestamp: Timestamp for file naming
"""
import matplotlib.pyplot as plt
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
# 1. Score distribution
ax1.hist(results_df['composite_score'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
ax1.set_xlabel('Composite Calibration Score')
ax1.set_ylabel('Number of Simulations')
ax1.set_title('Distribution of Calibration Scores')
ax1.grid(True, alpha=0.3)
# 2. Beta vs Score
ax2.scatter(results_df['beta'], results_df['composite_score'], alpha=0.6, s=30)
ax2.set_xlabel('Beta (Transmission Rate)')
ax2.set_ylabel('Composite Score')
ax2.set_title('Beta vs Calibration Score')
ax2.grid(True, alpha=0.3)
# 3. Rel Sus vs Score
ax3.scatter(results_df['rel_sus_latentslow'], results_df['composite_score'], alpha=0.6, s=30)
ax3.set_xlabel('Relative Susceptibility (Latent)')
ax3.set_ylabel('Composite Score')
ax3.set_title('Relative Susceptibility vs Calibration Score')
ax3.grid(True, alpha=0.3)
# 4. TB Mortality vs Score
ax4.scatter(results_df['tb_mortality'], results_df['composite_score'], alpha=0.6, s=30)
ax4.set_xlabel('TB Mortality Rate')
ax4.set_ylabel('Composite Score')
ax4.set_title('TB Mortality vs Calibration Score')
ax4.grid(True, alpha=0.3)
plt.tight_layout()
plt.suptitle('TB Model Calibration Parameter Sweep Results', fontsize=16, y=1.02)
filename = f"calibration_sweep_results_{timestamp}.pdf"
plt.savefig(filename, dpi=300, bbox_inches='tight')
plt.show()
return fig
[docs]
def main():
"""
Main function to run the calibration sweep
"""
print("=== TB Model Calibration Parameter Sweep ===")
# Define parameter ranges for sweep
# Start with broader ranges and can refine later
beta_range = np.array([0.010, 0.015, 0.020, 0.025, 0.030])
rel_sus_range = np.array([0.05, 0.10, 0.15, 0.20, 0.25])
tb_mortality_range = np.array([1e-4, 2e-4, 3e-4, 4e-4, 5e-4])
# Run sweep
sweep_summary, results_df, best_sim, sa_data = run_calibration_sweep(
beta_range=beta_range,
rel_sus_range=rel_sus_range,
tb_mortality_range=tb_mortality_range,
n_agents=500, # Smaller population for faster sweep
years=150, # Shorter simulation for faster sweep
max_simulations=50 # Limit total simulations
)
# Generate timestamp
timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H%M")
# Save results
results_df.to_csv(f"calibration_sweep_results_{timestamp}.csv", index=False)
# Save sweep summary
with open(f"calibration_sweep_summary_{timestamp}.json", 'w') as f:
json.dump(sweep_summary, f, indent=2, default=str)
# Create plots
plot_sweep_results(results_df, timestamp)
# Create detailed report for best simulation
if best_sim is not None:
print(f"\nCreating detailed report for best parameters...")
report = create_calibration_report(best_sim, sa_data, f"{timestamp}_best")
# Print best results
print(f"\n=== BEST CALIBRATION PARAMETERS ===")
print(f"Beta: {sweep_summary['best_parameters']['beta']:.4f}")
print(f"Relative Susceptibility: {sweep_summary['best_parameters']['rel_sus_latentslow']:.3f}")
print(f"TB Mortality: {sweep_summary['best_parameters']['tb_mortality']:.1e}")
print(f"Composite Score: {sweep_summary['best_parameters']['composite_score']:.2f}")
print(f"================================")
# Print top 5 results
print(f"\n=== TOP 5 CALIBRATION RESULTS ===")
for i, result in enumerate(sweep_summary['top_10_results'][:5]):
print(f"{i+1}. β={result['beta']:.4f}, rel_sus={result['rel_sus_latentslow']:.3f}, "
f"mort={result['tb_mortality']:.1e}, score={result['composite_score']:.2f}")
print(f"\nCalibration sweep completed!")
print(f"Files created with timestamp: {timestamp}")
return sweep_summary, results_df, best_sim
if __name__ == '__main__':
sweep_summary, results_df, best_sim = main()