# Source code for causalkit.design.mde

"""
Utility functions for calculating Minimum Detectable Effect (MDE) for experimental design.
"""

import numpy as np
from typing import Dict, Union, Tuple, Optional, List, Any


def calculate_mde(
    sample_size: Union[int, Tuple[int, int]],
    baseline_rate: Optional[float] = None,
    variance: Optional[Union[float, Tuple[float, float]]] = None,
    alpha: float = 0.05,
    power: float = 0.8,
    data_type: str = 'conversion',
    ratio: float = 0.5
) -> Dict[str, Any]:
    """
    Calculate the Minimum Detectable Effect (MDE) for conversion or continuous data.

    Parameters
    ----------
    sample_size : int or tuple of int
        Total sample size or a tuple of (control_size, treatment_size).
        If a single integer is provided, the sample is split according to
        the ``ratio`` parameter: the control group receives
        ``int(sample_size * ratio)`` units and the treatment group the rest.
    baseline_rate : float, optional
        Baseline conversion rate (for conversion data) or baseline mean
        (for continuous data). Required for conversion data, where it must
        lie in [0, 1].
    variance : float or tuple of float, optional
        Variance of the data. For conversion data, this is calculated from
        the baseline rate as ``p * (1 - p)`` if not provided. For continuous
        data, this parameter is required. Can be a single float (assumed
        same for both groups) or a tuple of
        (control_variance, treatment_variance).
    alpha : float, default 0.05
        Significance level (Type I error rate) of the two-sided test.
        Must be strictly between 0 and 1.
    power : float, default 0.8
        Statistical power (1 - Type II error rate).
        Must be strictly between 0 and 1.
    data_type : str, default 'conversion'
        Type of data. Either 'conversion' for binary/conversion data or
        'continuous' for continuous data.
    ratio : float, default 0.5
        Ratio of the sample allocated to the control group if sample_size
        is a single integer. Must be strictly between 0 and 1 in that case;
        it is ignored when sample_size is a tuple.

    Returns
    -------
    Dict[str, Any]
        A dictionary containing:
        - 'mde': The minimum detectable effect (absolute)
        - 'mde_relative': The minimum detectable effect divided by the
          baseline (relative), or None when no non-zero baseline is available
        - 'parameters': The parameters used for the calculation

    Raises
    ------
    ValueError
        If ``data_type`` is unknown, a required argument is missing,
        ``alpha``/``power``/``ratio`` are outside (0, 1), the conversion
        baseline is outside [0, 1], a variance is negative, or either group
        ends up with a non-positive sample size.

    Examples
    --------
    >>> # Calculate MDE for conversion data with 1000 total sample size and 10% baseline conversion rate
    >>> calculate_mde(1000, baseline_rate=0.1, data_type='conversion')
    {'mde': 0.0531..., 'mde_relative': 0.5315..., 'parameters': {...}}

    >>> # Calculate MDE for continuous data with 500 samples in each group and variance of 4
    >>> calculate_mde((500, 500), variance=4, data_type='continuous')
    {'mde': 0.3543..., 'mde_relative': None, 'parameters': {...}}

    Notes
    -----
    For both data types the MDE is calculated using the formula:

        MDE = (z_α/2 + z_β) * sqrt((σ1²/n1) + (σ2²/n2))

    where:
    - z_α/2 is the standard normal quantile at 1 - α/2
    - z_β is the standard normal quantile at the requested power
    - σ1² and σ2² are the variances in the control and treatment groups
    - n1 and n2 are the sample sizes in the control and treatment groups

    For conversion data without an explicit variance, both σ1² and σ2² are
    set to p1 * (1 - p1), where p1 is the baseline rate; i.e. the treatment
    variance is approximated by the control variance.
    """
    # Function-scope import keeps this block self-contained; NormalDist is
    # part of the standard library (Python 3.8+).
    from statistics import NormalDist

    # Validate inputs
    if data_type not in ('conversion', 'continuous'):
        raise ValueError("data_type must be either 'conversion' or 'continuous'")
    if data_type == 'conversion' and baseline_rate is None:
        raise ValueError("baseline_rate is required for conversion data")
    if data_type == 'continuous' and variance is None:
        raise ValueError("variance is required for continuous data")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")
    if data_type == 'conversion' and not 0 <= baseline_rate <= 1:
        raise ValueError("baseline_rate must be between 0 and 1 for conversion data")

    # Exact critical values from the inverse normal CDF. The power quantile
    # is deliberately NOT wrapped in abs(): for power < 0.5 it is negative,
    # which correctly yields a smaller MDE than at power = 0.5.
    standard_normal = NormalDist()
    z_alpha = standard_normal.inv_cdf(1 - alpha / 2)
    z_beta = standard_normal.inv_cdf(power)

    # Determine sample sizes for control and treatment groups
    if isinstance(sample_size, tuple):
        if len(sample_size) != 2:
            raise ValueError("sample_size tuple must be (control_size, treatment_size)")
        n_control, n_treatment = sample_size
    else:
        if not 0 < ratio < 1:
            raise ValueError("ratio must be strictly between 0 and 1")
        n_control = int(sample_size * ratio)
        n_treatment = sample_size - n_control

    # Guards the divisions below (e.g. a tiny total sample truncating to 0).
    if n_control <= 0 or n_treatment <= 0:
        raise ValueError("control and treatment sample sizes must both be positive")

    # Resolve per-group variances. `variance is None` is only reachable for
    # conversion data, where the Bernoulli variance of the baseline is used
    # for both groups.
    if variance is None:
        var_control = var_treatment = baseline_rate * (1 - baseline_rate)
    elif isinstance(variance, tuple):
        var_control, var_treatment = variance
    else:
        var_control = var_treatment = variance

    if var_control < 0 or var_treatment < 0:
        raise ValueError("variance must be non-negative")

    # Calculate MDE (same standard-error formula for both data types)
    standard_error = np.sqrt((var_control / n_control) + (var_treatment / n_treatment))
    mde = (z_alpha + z_beta) * standard_error

    # Relative MDE is only defined against a non-zero baseline
    if baseline_rate is not None and baseline_rate != 0:
        mde_relative = mde / baseline_rate
    else:
        mde_relative = None

    # Prepare the result dictionary
    result = {
        'mde': mde,
        'mde_relative': mde_relative,
        'parameters': {
            'sample_size': {
                'total': n_control + n_treatment,
                'control': n_control,
                'treatment': n_treatment
            },
            'alpha': alpha,
            'power': power,
            'data_type': data_type
        }
    }

    # Add data-type specific parameters
    if data_type == 'conversion':
        result['parameters']['baseline_rate'] = baseline_rate
    else:  # data_type == 'continuous'
        result['parameters']['variance'] = {
            'control': var_control,
            'treatment': var_treatment
        }
        if baseline_rate is not None:
            result['parameters']['baseline_mean'] = baseline_rate

    return result