API Reference

Matching

Weighted exact matching using coarsened predictor variables

`match(data, treatment)`

Weight observations based on global and local (strata) treatment level populations

Only observations from strata with examples from each treatment level will receive a non-zero weight. If the treatment column contains continuous values, it is highly likely that all examples will receive a weight of zero.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	The data on which we shall perform coarsened exact matching	required
`treatment`	`str`	The name of the column in data containing the treatment variable	required

Returns:

Type	Description
`Series`	The weight to use for each observation of the provided data given the coarsening provided

Source code in cem/match.py

def match(data: pd.DataFrame, treatment: str) -> pd.Series:
    """
    Weight observations based on global and local (strata) treatment level populations

    Only observations from strata with examples from each treatment level will receive a non-zero weight.
    If the treatment column contains continuous values, it is highly likely that all examples will receive a weight of zero.

    Parameters
    ----------
    data : pandas.DataFrame
        The data on which we shall perform coarsened exact matching
    treatment : str
        The name of the column in data containing the treatment variable

    Returns
    -------
    pandas.Series
        The weight to use for each observation of the provided data given the coarsening provided.
        The series is named "weights" and covers every index label of ``data``.

    Warns
    -----
    UserWarning
        If no stratum contains every treatment level; all returned weights are then zero.
    """
    # Every column except the treatment takes part in defining a stratum.
    gb = list(data.drop(columns=treatment).columns)
    # Baseline of zero for every observation; unmatched rows keep this value.
    prematched_weights = pd.Series([0] * len(data), index=data.index)

    # The global number of treatment levels does not depend on the stratum,
    # so compute it once instead of once per group inside the filter.
    n_levels = data[treatment].nunique()
    matched = data.groupby(gb).filter(lambda x: x[treatment].nunique() == n_levels)

    if matched.empty:
        warnings.warn(
            "No strata had all levels of the treatment variable. All weights will be zero. This usually happens when a continuous variable (including the treatment variable) is not coarsened.",
            stacklevel=2,  # attribute the warning to the caller, not to this module
        )
        # Keep the result shape consistent with the matched path below.
        prematched_weights.name = "weights"
        return prematched_weights

    # Level counts are taken over matched observations only.
    global_level_counts = matched[treatment].value_counts()
    weights = pd.concat([_weight_stratum(stratum[treatment], global_level_counts) for _, stratum in matched.groupby(gb)])
    # Align on the union of indices so unmatched observations appear with weight 0.
    weights = weights.add(prematched_weights, fill_value=0)
    weights.name = "weights"
    return weights

Imbalance

Multidimensional histogram imbalance between two or more collections of observations

`L1(data, treatment, weights=None)`

(Weighted) Multidimensional L1 imbalance between groups of observations of differing treatment levels

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Observations	required
`treatment`	`str`	Name of column containing the treatment level	required
`weights`	`Series`	Example weights	`None`

Source code in cem/imbalance.py

def L1(data: pd.DataFrame, treatment: str, weights: Optional[pd.Series] = None) -> Union[pd.DataFrame, float]:
    """
    (Weighted) Multidimensional L1 imbalance between groups of observations of differing treatment levels

    Each pair of histogram tensors is normalised to unit total mass, and the
    imbalance is half the sum of the absolute cell-wise differences.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations
    treatment : str
        Name of column containing the treatment level
    weights : pandas.Series
        Example weights

    Returns
    -------
    pandas.DataFrame or float
        Whatever ``_L`` produces for this distance function
    """

    def half_abs_difference(tensor_a: np.ndarray, tensor_b: np.ndarray) -> float:
        # Convert raw counts into relative frequencies.
        share_a = tensor_a / np.sum(tensor_a)
        share_b = tensor_b / np.sum(tensor_b)
        gap = np.abs(share_a - share_b)
        return np.sum(gap) / 2

    return _L(data, treatment, half_abs_difference, weights)

`L2(data, treatment, weights=None)`

(Weighted) Multidimensional L2 imbalance between groups of observations of differing treatment levels

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Observations	required
`treatment`	`str`	Name of column containing the treatment level	required
`weights`	`Series`	Example weights	`None`

Source code in cem/imbalance.py

def L2(data: pd.DataFrame, treatment: str, weights: Optional[pd.Series] = None) -> Union[pd.DataFrame, float]:
    """
    (Weighted) Multidimensional L2 imbalance between groups of observations of differing treatment levels

    Each pair of histogram tensors is normalised to unit total mass, and the
    imbalance is half the Euclidean norm of the cell-wise differences.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations
    treatment : str
        Name of column containing the treatment level
    weights : pandas.Series
        Example weights

    Returns
    -------
    pandas.DataFrame or float
        Whatever ``_L`` produces for this distance function
    """

    def func(tensor_a: np.ndarray, tensor_b: np.ndarray) -> float:
        # Convert raw counts into relative frequencies.
        diff = tensor_a / np.sum(tensor_a) - tensor_b / np.sum(tensor_b)
        # The root must be taken AFTER summing the squares; taking it
        # element-wise reduces to |diff| and makes L2 identical to L1.
        # NOTE(review): the "/ 2" scaling is retained from the previous
        # implementation — confirm it is the intended normalisation for L2.
        return np.sqrt(np.sum(diff ** 2)) / 2

    return _L(data, treatment, func, weights)

Automatic Coarsening

Coarsening predictor variables for a collection of observations

`coarsen(data, treatment, measure='l1', lower=1, upper=10, columns=None)`

Automatic coarsening by binning numeric columns. Each candidate number of bins, H, between `lower` and `upper` is tried, and the H whose (unweighted) imbalance is the median over that range of values is used.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to coarsen	required
`treatment`	`str`	Name of the column containing the treatment level	required
`measure`	`str`	Imbalance measure (l1 or l2)	`'l1'`
`lower`	`int`	Minimum value for H	`1`
`upper`	`int`	Maximum value for H	`10`
`columns`	`Optional[Sequence[str]]`	Columns to coarsen	`None`

Source code in cem/coarsen.py

def coarsen(data: pd.DataFrame, treatment: str, measure: str = "l1", lower: int = 1, upper: int = 10, columns: Optional[Sequence[str]] = None) -> pd.DataFrame:
    """
    Automatic coarsening by binning numeric columns using the number of bins, H, that results in the median (unweighted) imbalance over a range of possible values for H.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to coarsen
    treatment : str
        Name of the column containing the treatment level
    measure : str
        Imbalance measure (l1 or l2)
    lower : int
        Minimum value for H
    upper : int
        Maximum value for H
    columns : sequence of str, optional
        Columns to coarsen. When omitted, every numeric column of data is coarsened.

    Returns
    -------
    pandas.DataFrame
        A coarsened version of data; the input is not modified

    Raises
    ------
    ValueError
        If measure is not "l1" or "l2", or if the range of H is invalid
        (lower < 1 or upper < lower)
    """
    # Validate everything up front so bad arguments fail with a clear message
    # rather than an opaque error from deep inside pandas.
    if measure not in ("l1", "l2"):
        raise ValueError(f"Unknown imbalance measure '{measure}'")
    if lower < 1 or upper < lower:
        raise ValueError(f"Invalid range for H: expected 1 <= lower <= upper, got lower={lower}, upper={upper}")

    L = L1 if measure == "l1" else L2

    df = data.copy()

    if columns is None:
        to_coarsen = {c for c in df.columns if is_numeric_dtype(df[c])}
    else:
        to_coarsen = set(columns)

    imbalance_by_h = {}
    coarsened_by_h = {}
    for H in range(lower, upper + 1):
        # Never ask for more bins than there are distinct values in a column.
        df_coarse = df.apply(lambda x, h=H: pd.cut(x, bins=min(x.nunique(), h)) if x.name in to_coarsen else x)
        imb_h = L(df_coarse, treatment)
        if isinstance(imb_h, pd.DataFrame):
            # use the mean imbalance considering all treatment level pairs
            imb_h = imb_h["imbalance"].mean()
        imbalance_by_h[H] = imb_h
        coarsened_by_h[H] = df_coarse

    imbalance = pd.Series(imbalance_by_h)
    # Choose the H with the largest imbalance that does not exceed the median.
    H = (imbalance.sort_values(ascending=False) <= imbalance.quantile(0.5)).idxmax()
    return coarsened_by_h[int(H)]