Skip to content

API Reference

Matching

Weighted exact matching using coarsened predictor variables

match(data, treatment)

Weight observations based on global and local (strata) treatment level populations

Only observations from strata with examples from each treatment level will receive a non-zero weight. If the treatment column contains continuous values, it is highly likely that all examples will receive a weight of zero.

Parameters:

Name Type Description Default
data DataFrame

The data on which we shall perform coarsened exact matching

required
treatment str

The name of the column in data containing the treatment variable

required

Returns:

Type Description
Series

The weight to use for each observation of the provided data given the coarsening provided

Source code in cem/match.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def match(data: pd.DataFrame, treatment: str) -> pd.Series:
    """
    Weight observations based on global and local (strata) treatment level populations

    Strata are formed by grouping on every column of the data other than the treatment column.
    Only observations from strata with examples from each treatment level will receive a non-zero weight.
    If the treatment column contains continuous values, it is highly likely that all examples will receive a weight of zero.

    Parameters
    ----------
    data : pandas.DataFrame
        The data on which we shall perform coarsened exact matching
    treatment : str
        The name of the column in data containing the treatment variable

    Returns
    -------
    pandas.Series
        The weight to use for each observation of the provided data given the coarsening provided.
        The series is always named "weights", including when every weight is zero.

    Warns
    -----
    UserWarning
        If no stratum contains every treatment level, in which case all weights are zero.
    """
    gb = list(data.drop(columns=treatment).columns)
    # Named up front so the all-zero early return is labelled like the normal result
    prematched_weights = pd.Series([0] * len(data), index=data.index, name="weights")
    # The number of treatment levels is the same for every stratum, so count it once
    n_levels = data[treatment].nunique()
    matched = data.groupby(gb).filter(lambda x: x[treatment].nunique() == n_levels)

    if not len(matched):
        warnings.warn(
            "No strata had all levels of the treatment variable. All weights will be zero. This usually happens when a continuous variable (including the treatment variable) is not coarsened.",
            stacklevel=2,  # attribute the warning to the caller rather than to this module
        )
        return prematched_weights
    global_level_counts = matched[treatment].value_counts()
    weights = pd.concat([_weight_stratum(stratum[treatment], global_level_counts) for _, stratum in matched.groupby(gb)])
    # Observations dropped by the filter are missing from `weights`; aligning with the zero series gives them weight 0
    weights = weights.add(prematched_weights, fill_value=0)
    weights.name = "weights"
    return weights

Imbalance

Multidimensional histogram imbalance between two or more collections of observations

L1(data, treatment, weights=None)

(Weighted) Multidimensional L1 imbalance between groups of observations of differing treatment levels

Parameters:

Name Type Description Default
data DataFrame

Observations

required
treatment str

Name of column containing the treatment level

required
weights Series

Example weights

None
Source code in cem/imbalance.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def L1(data: pd.DataFrame, treatment: str, weights: Optional[pd.Series] = None) -> Union[pd.DataFrame, float]:
    """
    Compute the (optionally weighted) multidimensional L1 imbalance between treatment groups

    The distance between two histograms is half the sum of the absolute differences
    between their normalised cells. The comparison itself is delegated to ``_L``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations
    treatment : str
        Name of column containing the treatment level
    weights : pandas.Series
        Example weights

    Returns
    -------
    pandas.DataFrame or float
        Whatever ``_L`` produces for this distance function
    """

    def half_absolute_difference(tensor_a: np.ndarray, tensor_b: np.ndarray) -> float:
        # Rescale each tensor by its own total so the two are compared as proportions
        share_a = tensor_a / np.sum(tensor_a)
        share_b = tensor_b / np.sum(tensor_b)
        cellwise_gap = np.abs(share_a - share_b)
        return np.sum(cellwise_gap) / 2

    return _L(data, treatment, half_absolute_difference, weights)

L2(data, treatment, weights=None)

(Weighted) Multidimensional L2 imbalance between groups of observations of differing treatment levels

Parameters:

Name Type Description Default
data DataFrame

Observations

required
treatment str

Name of column containing the treatment level

required
weights Series

Example weights

None
Source code in cem/imbalance.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def L2(data: pd.DataFrame, treatment: str, weights: Optional[pd.Series] = None) -> Union[pd.DataFrame, float]:
    """
    (Weighted) Multidimensional L2 imbalance between groups of observations of differing treatment levels

    The distance between two histograms is half the Euclidean norm of the difference
    between their normalised cells. The comparison itself is delegated to ``_L``.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations
    treatment : str
        Name of column containing the treatment level
    weights : pandas.Series
        Example weights

    Returns
    -------
    pandas.DataFrame or float
        Whatever ``_L`` produces for this distance function
    """

    def func(tensor_a: np.ndarray, tensor_b: np.ndarray) -> float:
        difference = tensor_a / np.sum(tensor_a) - tensor_b / np.sum(tensor_b)
        # Sum the squares first and take a single square root. Taking the root of
        # each squared cell before summing gives sum(|difference|), i.e. the L1 measure.
        return np.sqrt(np.sum(difference**2)) / 2

    return _L(data, treatment, func, weights)

Automatic Coarsening

Coarsening predictor variables for a collection of observations

coarsen(data, treatment, measure='l1', lower=1, upper=10, columns=None)

Automatically coarsens numeric columns by binning them with the number of bins, H, that yields the median (unweighted) imbalance over a range of possible values for H.

Parameters:

Name Type Description Default
data DataFrame

Data to coarsen

required
treatment str

Name of the column containing the treatment level

required
measure str

Imbalance measure (l1 or l2)

'l1'
lower int

Minimum value for H

1
upper int

Maximum value for H

10
columns Optional[Sequence[str]]

Columns to coarsen

None
Source code in cem/coarsen.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def coarsen(data: pd.DataFrame, treatment: str, measure: str = "l1", lower: int = 1, upper: int = 10, columns: Optional[Sequence[str]] = None) -> pd.DataFrame:
    """
    Automatic coarsening by binning numeric columns using the number of bins, H, that results in the median (unweighted) imbalance over a range of possible values for H.

    Every H from ``lower`` to ``upper`` (inclusive) is tried. Each selected column is cut
    into at most H bins (fewer when the column has fewer unique values), and the imbalance of
    the coarsened data is measured. The candidate whose imbalance is the largest value not
    exceeding the median imbalance is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to coarsen
    treatment : str
        Name of the column containing the treatment level
    measure : str
        Imbalance measure (l1 or l2)
    lower : int
        Minimum value for H (at least 1)
    upper : int
        Maximum value for H (not less than ``lower``)
    columns : sequence of str, optional
        Columns to coarsen. Defaults to every column with a numeric dtype.

    Returns
    -------
    pandas.DataFrame
        A coarsened copy of ``data``; the input is not modified

    Raises
    ------
    ValueError
        If ``measure`` is not "l1" or "l2", if ``lower`` is below 1, or if ``upper`` is below ``lower``
    """
    import logging

    # Fail early with a clear message instead of an obscure error from pd.cut (zero bins)
    # or from indexing an empty result (empty range of H)
    if lower < 1:
        raise ValueError(f"'lower' must be at least 1, got {lower}")
    if upper < lower:
        raise ValueError(f"'upper' ({upper}) must not be less than 'lower' ({lower})")

    df = data.copy()

    if columns is None:
        # NOTE(review): a numeric treatment column is selected here too and will be binned - confirm this is intended
        to_coarsen = {c for c in df.columns if is_numeric_dtype(df[c])}
    else:
        to_coarsen = set(columns)

    if measure == "l1":
        L = L1
    elif measure == "l2":
        L = L2
    else:
        raise ValueError(f"Unknown imbalance measure '{measure}'")

    imbalances = {}
    candidates = {}
    for H in range(lower, upper + 1):
        # A column cannot be cut into more bins than it has distinct values
        df_coarse = df.apply(lambda x: pd.cut(x, bins=min(x.nunique(), H)) if x.name in to_coarsen else x)
        imb_h = L(df_coarse, treatment)
        if isinstance(imb_h, pd.DataFrame):
            # use the mean imbalance considering all treatment level pairs
            imb_h = imb_h["imbalance"].mean()
        imbalances[H] = imb_h
        candidates[H] = df_coarse

    imbalance = pd.Series(imbalances)
    # Sorted descending, the first value at or below the median is the largest such imbalance
    H = (imbalance.sort_values(ascending=False) <= imbalance.quantile(0.5)).idxmax()
    logging.getLogger(__name__).debug("Selected H=%s with imbalance %s", H, imbalance.loc[H])
    return candidates[H]