# Source code for causalkit.inference.att.conversion_z_test

"""
Two-proportion z-test for conversion data in CausalData (ATT context).

Compares conversion rates between treated (T=1) and control (T=0) groups.
Returns p-value, absolute/relative differences, and their confidence intervals
(similar structure to inference.att.ttest).
"""

from typing import Dict, Any

import numpy as np
import pandas as pd
from scipy import stats

from causalkit.data.causaldata import CausalData


def conversion_z_test(data: CausalData, confidence_level: float = 0.95) -> Dict[str, Any]:
    """
    Perform a two-proportion z-test on a CausalData object with a binary outcome (conversion).

    Parameters
    ----------
    data : CausalData
        The CausalData object containing treatment and outcome variables.
        Only its ``treatment`` and ``target`` attributes are read; both must
        be non-empty ``pandas.Series``.
    confidence_level : float, default 0.95
        The confidence level for calculating confidence intervals
        (strictly between 0 and 1).

    Returns
    -------
    Dict[str, Any]
        A dictionary containing:

        - p_value: Two-sided p-value from the z-test (pooled standard error)
        - absolute_difference: Difference in conversion rates (treated - control)
        - absolute_ci: Tuple (lower, upper) for the absolute difference CI
          (unpooled Wald standard error)
        - relative_difference: Percentage change relative to control rate
        - relative_ci: Tuple (lower, upper) for the relative difference CI

    Raises
    ------
    ValueError
        If treatment/outcome are missing, treatment does not have exactly two
        unique values, outcome is not binary, either group (treatment == 0 or
        treatment == 1) has no observed outcomes, or confidence_level is
        outside (0, 1).

    Notes
    -----
    - Rows whose outcome is missing (NaN) are excluded from both the
      conversion counts and the group sizes, so they do not dilute the rates.
    - When the pooled standard error is zero (all outcomes identical), the
      p-value is reported as 1.0.
    - When the control conversion rate is zero, the relative difference is
      ``inf`` (or ``0.0`` if the treated rate is also zero) and the relative
      CI is ``(nan, nan)``.
    """
    # Fail fast on a bad argument before touching the data.
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be between 0 and 1 (exclusive)")

    # Basic validation
    treatment_var = data.treatment
    target_var = data.target

    if not isinstance(treatment_var, pd.Series) or treatment_var.empty:
        raise ValueError("causaldata object must have a treatment variable defined")
    if not isinstance(target_var, pd.Series) or target_var.empty:
        raise ValueError("causaldata object must have a outcome variable defined")

    # Treatment must take exactly two distinct values
    tr_unique = treatment_var.unique()
    if len(tr_unique) != 2:
        raise ValueError("Treatment variable must be binary (have exactly 2 unique values)")

    # Target must be binary 0/1 for conversion test (missing values are ignored here)
    tg_unique = set(pd.Series(target_var.unique()).dropna().tolist())
    if not tg_unique.issubset({0, 1}):
        raise ValueError("Target must be binary (0/1) for conversion_z_test")

    # Build groups using 0/1 coding on treatment. Missing outcomes are dropped
    # so that the denominators match what ``.sum()`` (which skips NaN) counts.
    control = target_var[treatment_var == 0].dropna()
    treat = target_var[treatment_var == 1].dropna()

    n0 = int(control.shape[0])
    n1 = int(treat.shape[0])
    if n0 < 1 or n1 < 1:
        raise ValueError(
            "Not enough observations in one of the groups for z-test (need at least 1 per group)"
        )

    # Conversion counts and rates (outcome is 0/1 coded, so the sum is the count)
    x0 = float(control.sum())
    x1 = float(treat.sum())
    p0 = x0 / n0
    p1 = x1 / n1
    absolute_diff = float(p1 - p0)

    # Two-sided test statistic using the pooled SE under H0
    p_pool = (x0 + x1) / (n0 + n1)
    se_pooled = float(np.sqrt(p_pool * (1 - p_pool) * (1 / n0 + 1 / n1)))

    if se_pooled == 0:
        # No variance at all (every outcome identical): no evidence of a difference.
        p_value = 1.0
    else:
        z_stat = absolute_diff / se_pooled
        # Survival function keeps precision in the far tail, where
        # ``1 - cdf`` would round to exactly 0.0.
        p_value = float(2 * stats.norm.sf(abs(z_stat)))

    # Absolute difference CI using the unpooled (Wald) SE
    alpha = 1 - confidence_level
    z_crit = float(stats.norm.ppf(1 - alpha / 2))
    se_unpooled = float(np.sqrt(p1 * (1 - p1) / n1 + p0 * (1 - p0) / n0))
    margin = z_crit * se_unpooled
    absolute_ci = (absolute_diff - margin, absolute_diff + margin)

    # Relative difference (%) and CI via scaling by the control rate
    if p0 == 0:
        # With p0 == 0 the difference equals p1 >= 0, so it is never negative.
        relative_diff = np.inf if absolute_diff > 0 else 0.0
        relative_ci = (np.nan, np.nan)
    else:
        relative_diff = (absolute_diff / p0) * 100.0
        relative_margin = (margin / p0) * 100.0
        relative_ci = (relative_diff - relative_margin, relative_diff + relative_margin)

    return {
        "p_value": float(p_value),
        "absolute_difference": float(absolute_diff),
        "absolute_ci": (float(absolute_ci[0]), float(absolute_ci[1])),
        "relative_difference": float(relative_diff),
        "relative_ci": (float(relative_ci[0]), float(relative_ci[1])),
    }