Source code for causalkit.inference.ate.causalforestdml

"""
CausalForestDML implementation for estimating average treatment effects.

This module provides a function to estimate average treatment effects using EconML's CausalForestDML.
"""

import numpy as np
import pandas as pd
from typing import Dict, Any, Optional, Union, List, Tuple

from econml.dml import CausalForestDML
from catboost import CatBoostRegressor, CatBoostClassifier

from causalkit.data.causaldata import CausalData



[docs]
def causalforestdml(
    data: CausalData,
    model_y: Any = None,
    model_t: Any = None,
    n_estimators: int = 100,
    max_depth: Optional[int] = None,
    min_samples_leaf: int = 5,
    cv: int = 5,
    n_jobs: int = -1,
    random_state: Optional[int] = None,
    confidence_level: float = 0.95,
) -> Dict[str, Any]:
    """
    Estimate average treatment effects using EconML's CausalForestDML.
    
    Parameters
    ----------
    data : CausalData
        The causaldata object containing treatment, target, and confounders variables.
    model_y : estimator, optional
        The model for fitting the outcome variable. If None, a CatBoostRegressor configured to use all CPU cores is used.
    model_t : estimator, optional
        The model for fitting the treatment variable. If None, a CatBoostRegressor configured to use all CPU cores is used.
    n_estimators : int, default 100
        Number of trees in the forest.
    max_depth : int, optional
        Maximum depth of the trees. If None, nodes are expanded until all leaves are pure or
        contain less than min_samples_leaf samples.
    min_samples_leaf : int, default 5
        Minimum number of samples required to be at a leaf node.
    cv : int, default 5
        Number of folds for cross-fitting.
    n_jobs : int, default -1
        Number of jobs to run in parallel. -1 means using all processors.
    random_state : int, optional
        Controls the randomness of the estimator.
    confidence_level : float, default 0.95
        The confidence level for calculating confidence intervals (between 0 and 1).
        
    Returns
    -------
    Dict[str, Any]
        A dictionary containing:
        - coefficient: The estimated average treatment effect
        - std_error: The standard error of the estimate
        - p_value: The p-value for the null hypothesis that the effect is zero
        - confidence_interval: Tuple of (lower, upper) bounds for the confidence interval
        - model: The fitted CausalForestDML object
        
    Raises
    ------
    ValueError
        If the causaldata object doesn't have treatment, target, and confounders variables defined,
        or if the treatment variable is not binary.
        
    Examples
    --------
    >>> from causalkit.data import generate_rct_data
    >>> from causalkit.data import CausalData
    >>> from causalkit.inference.ate import causalforestdml
    >>> 
    >>> # Generate data
    >>> df = generate_rct_data()
    >>> 
    >>> # Create causaldata object
    >>> ck = CausalData(
    ...     df=df,
    ...     outcome='outcome',
    ...     treatment='treatment',
    ...     confounders=['age', 'invited_friend']
    ... )
    >>> 
    >>> # Estimate ATE using CausalForestDML
    >>> results = causalforestdml(ck)
    >>> print(f"ATE: {results['coefficient']:.4f}")
    >>> print(f"Standard Error: {results['std_error']:.4f}")
    >>> print(f"P-value: {results['p_value']:.4f}")
    >>> print(f"Confidence Interval: {results['confidence_interval']}")
    """
    # Validate inputs
    if data.treatment is None:
        raise ValueError("CausalData object must have a treatment variable defined")
    if data.target is None:
        raise ValueError("CausalData object must have a outcome variable defined")
    if data.confounders is None:
        raise ValueError("CausalData object must have confounders variables defined")
    
    # # Check if treatment is binary
    # unique_treatments = data.treatment.unique()
    # if len(unique_treatments) != 2:
    #     raise ValueError("Treatment variable must be binary (have exactly 2 unique values)")
    #
    # # Check if treatment values are 0 and 1
    # if not set(unique_treatments) == {0, 1}:
    #     raise ValueError("Treatment variable must have values 0 and 1")
    
    # Check confidence level
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be between 0 and 1 (exclusive)")
    
    # Set default ML models if not provided
    if model_y is None:
        model_y = CatBoostRegressor(iterations=100, depth=5, min_data_in_leaf=2, thread_count=-1, verbose=False, allow_writing_files=False)
    if model_t is None:
        # For binary treatments, a classifier is more appropriate
        model_t = CatBoostClassifier(iterations=100, depth=5, thread_count=-1, verbose=False, allow_writing_files=False)
    
    # Get data from CausalData object
    Y = data.target.values
    T = data.treatment.values
    conf_list = data.confounders
    if conf_list:
        X = data.get_df(include_treatment=False, include_target=False, include_confounders=True).values
    else:
        X = None
    
    # Create and fit CausalForestDML model
    model = CausalForestDML(
        model_y=model_y,
        model_t=model_t,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        cv=cv,
        discrete_treatment=True,
        random_state=random_state,
    )
    
    model.fit(Y, T, X=X)
    
    # Compute ATE and its confidence interval using EconML's built-in methods
    alpha = 1 - confidence_level
    ate = float(model.ate(X=X))
    ci_lower, ci_upper = model.ate_interval(X=X, alpha=alpha)
    ci_lower = float(ci_lower)
    ci_upper = float(ci_upper)
    
    # Derive standard error from CI width assuming normal approximation
    from scipy import stats
    z_score = stats.norm.ppf(1 - alpha/2)
    std_error = (ci_upper - ci_lower) / (2 * z_score) if z_score > 0 else 0.0
    
    # Compute two-sided p-value for H0: ate = 0
    z_value = abs(ate) / std_error if std_error > 0 else np.inf
    p_value = 2 * (1 - stats.norm.cdf(z_value)) if np.isfinite(z_value) else 0.0
    
    return {
        "coefficient": ate,
        "std_error": float(std_error),
        "p_value": float(p_value),
        "confidence_interval": (ci_lower, ci_upper),
        "model": model
    }