From 0cabbb02a2b48617110bac0738a2c2b9eeb144dc Mon Sep 17 00:00:00 2001 From: Francesco Pisu Date: Sun, 17 Sep 2023 09:40:11 +0200 Subject: [PATCH 1/2] docs: update README with graphical examples #10 --- README.md | 351 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 240 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index c5d42ef..1a0cb90 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,22 @@ > Better insights into Machine Learning models performance. -Modelsight is a collection of functions that create publication-ready graphics for the visual evaluation of binary classifiers adhering to the scikit-learn interface. +Modelsight is a collection of functions that create publication-ready graphics for the visual evaluation of cross-validated binary classifiers adhering to the scikit-learn interface. -Modelsight is strongly oriented towards the evaluation of already fitted `ExplainableBoostingClassifier` of the [interpretml](https://github.com/interpretml/interpret) package. +While the majority of `modelsight` functionalities are compatible with any binary classifier that follows scikit-learn's interface, it's worth noting that, as of now, some features are exclusive to ExplainableBoostingClassifier from the interpretml package. | Overview | | |---|---| +| **Open Source** | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) | +| **CI/CD** | [![github-actions](https://img.shields.io/github/actions/workflow/status/francescopisu/modelsight/ci-cd.yml?logo=github)](https://github.com/francescopisu/modelsight/actions/workflows/ci-cd.yml) [![!codecov](https://img.shields.io/codecov/c/github/francescopisu/modelsight?label=codecov&logo=codecov)](https://codecov.io/gh/francescopisu/modelsight) [![readthedocs](https://img.shields.io/readthedocs/modelsight?logo=readthedocs)](https://modelsight.readthedocs.io/en/latest/) | | **Code** | [![!pypi](https://img.shields.io/pypi/v/modelsight?color=orange)](https://pypi.org/project/modelsight/) [![!python-versions](https://img.shields.io/pypi/pyversions/modelsight)](https://www.python.org/) | ## 💫 Features -Our goal is to streamline the visual assessment of binary classifiers by creating a set of functions designed to generate publication-ready plots. +Our objective is to enhance the visual assessment of cross-validated binary classifiers by introducing a set of functions tailored for generating publication-ready plots. +To effectively utilize modelsight, you should provide a dictionary where the keys represent model names, and the values consist of CVModellingOutput objects. CVModellingOutput is a custom data class designed to encapsulate the results of a cross-validation process for a specific binary classifier. For detailed information about CVModellingOutput, please refer to the [API reference](https://modelsight.readthedocs.io/en/latest/autoapi/modelsight/_typing/index.html#modelsight._typing.CVModellingOutput). + +Before using the functionalities offered by this library, ensure that your cross-validation data adheres to this structure. | Module | Status | Links | |---|---|---| @@ -38,156 +43,280 @@ $ pip install modelsight ``` ## :zap: Quickstart +The following code will cross-validate five different binary classifiers and produce a dictionary of CVModellingOutput. 
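Before the full example, this is the shape of that dictionary in schematic form. The field meanings follow the `CVModellingOutput` documentation linked above; the `...` values are placeholders, not working code, and the complete loop that populates them for real comes right after.

```python
# Schematic structure expected by modelsight's plotting functions.
# All values below are placeholders (...); see the quickstart loop for how
# they are actually populated from a repeated cross-validation.
from modelsight._typing import CVModellingOutput

cv_results = {
    "EBM": CVModellingOutput(
        gts_train=...,          # per-fold training ground truths (list of arrays)
        gts_val=...,            # per-fold validation ground truths
        probas_train=...,       # per-fold training predicted probabilities
        probas_val=...,         # per-fold validation predicted probabilities
        gts_train_conc=...,     # same ground truths, concatenated across folds
        gts_val_conc=...,
        probas_train_conc=...,  # same probabilities, concatenated across folds
        probas_val_conc=...,
        models=...,             # fitted estimators, one per fold
        errors=...,             # optional: indexes of misclassified validation rows
        correct=...,            # optional: indexes of correctly classified rows
        features=...,           # optional: selected feature subset per fold
    ),
    # ...one entry per model you want to compare, keyed by its display name
}
```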
+ ```python +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt from sklearn.datasets import load_breast_cancer +from sklearn.model_selection import train_test_split from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold +from sklearn.metrics import roc_curve +from sklearn.svm import SVC +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.linear_model import LogisticRegression from interpret.glassbox import ExplainableBoostingClassifier -from utils import ( +from tests.utils import ( select_features, get_feature_selector, - get_model, get_calibrated_model ) -from modelsight.curves import average_roc_curves -from modelsight.config import settings from modelsight._typing import CVModellingOutput, Estimator +# Load data X, y = load_breast_cancer(return_X_y=True, as_frame=True) -outer_cv = RepeatedStratifiedKFold(n_repeats=cv_config.get("N_REPEATS"), - n_splits=cv_config.get("N_SPLITS"), - random_state=settings.misc.seed) -inner_cv = StratifiedKFold(n_splits=cv_config.get("N_SPLITS"), - shuffle=cv_config.get("SHUFFLE"), - random_state=settings.misc.seed) - -gts_train = [] -gts_val = [] -probas_train = [] -probas_val = [] -gts_train_conc = [] -gts_val_conc = [] -probas_train_conc = [] -probas_val_conc = [] - -models = [] -errors = [] -correct = [] -features = [] - -ts = datetime.timestamp(datetime.now()) +# Define factory for binary classifiers +class BinaryModelFactory: + def get_model(self, model_name: str) -> Estimator: + if model_name == "EBM": + return ExplainableBoostingClassifier(random_state=cv_config.get("SEED"), + interactions=6, + learning_rate=0.02, + min_samples_leaf=5, + n_jobs=-1) + elif model_name == "LR": + return LogisticRegression(penalty="elasticnet", + solver="saga", + l1_ratio=0.3, + max_iter=10000, + random_state=cv_config.get("SEED")) + elif model_name == "SVC": + return SVC(probability=True, + class_weight="balanced", + random_state=cv_config.get("SEED")) + elif model_name == "RF": + return RandomForestClassifier(random_state=cv_config.get("SEED"), + n_estimators=5, + max_depth=3, + n_jobs=-1) + elif model_name == "KNN": + return KNeighborsClassifier(n_jobs=-1) + else: + raise ValueError(f"{model_name} is not a valid estimator name.") + cv_config = { - "N_REPEATS": 10, + "N_REPEATS": 2, "N_SPLITS": 10, "SHUFFLE": True, "SCALE": False, "CALIBRATE": True, "CALIB_FRACTION": 0.15, + "SEED": 1303 } -for i, (train_idx, val_idx) in enumerate(outer_cv.split(X, y)): - Xtemp, ytemp = X.iloc[train_idx, :], y.iloc[train_idx] - Xval, yval = X.iloc[val_idx, :], y.iloc[val_idx] - - if cv_config.get("CALIBRATE"): - Xtrain, Xcal, ytrain, ycal = train_test_split(Xtemp, ytemp, - test_size=cv_config.get("CALIB_FRACTION"), - stratify=ytemp, - random_state=settings.misc.seed) - else: - Xtrain, ytrain = Xtemp, ytemp - - model = get_model(seed=settings.misc.seed) - - # select features - feat_subset = select_features(Xtrain, ytrain, - selector=get_feature_selector(settings.misc.seed), - cv=inner_cv, - scale=False, - frac=1.25) - features.append(feat_subset) - - if cv_config.get("SCALE"): - numeric_cols = Xtrain.select_dtypes(include=[np.float64, np.int64]).columns.tolist() - scaler = StandardScaler() - Xtrain.loc[:, numeric_cols] = scaler.fit_transform(Xtrain.loc[:, numeric_cols]) - Xtest.loc[:, numeric_cols] = scaler.transform(Xtest.loc[:, numeric_cols]) - - model.fit(Xtrain[feat_subset], ytrain) - - if cv_config.get("CALIBRATE"): - model = get_calibrated_model(model, - X=Xcal.loc[:, 
feat_subset], - y=ycal) - - models.append(model) - - # accumulate ground-truths - gts_train.append(ytrain) - gts_val.append(yval) - - # accumulate predictions - train_pred_probas = model.predict_proba(Xtrain)[:, 1] - val_pred_probas = model.predict_proba(Xval)[:, 1] - - probas_train.append(train_pred_probas) - probas_val.append(val_pred_probas) +# define outer and inner cross-validation schemes +outer_cv = RepeatedStratifiedKFold(n_repeats=cv_config.get("N_REPEATS"), + n_splits=cv_config.get("N_SPLITS"), + random_state=cv_config.get("SEED")) + +inner_cv = StratifiedKFold(n_splits=cv_config.get("N_SPLITS"), + shuffle=cv_config.get("SHUFFLE"), + random_state=cv_config.get("SEED")) + +# define models names, factory and cv results dictionary +models_names = ["EBM", "LR", "SVC", "RF", "KNN"] +model_factory = BinaryModelFactory() +cv_results = dict() + +for model_name in models_names: + print(f"Processing model {model_name}\n") + + gts_train = [] + gts_val = [] + probas_train = [] + probas_val = [] + gts_train_conc = [] + gts_val_conc = [] + probas_train_conc = [] + probas_val_conc = [] + + models = [] + errors = [] + correct = [] + features = [] + + for i, (train_idx, val_idx) in enumerate(outer_cv.split(X, y)): + Xtemp, ytemp = X.iloc[train_idx, :], y.iloc[train_idx] + Xval, yval = X.iloc[val_idx, :], y.iloc[val_idx] + + if cv_config.get("CALIBRATE"): + Xtrain, Xcal, ytrain, ycal = train_test_split(Xtemp, ytemp, + test_size=cv_config.get( + "CALIB_FRACTION"), + stratify=ytemp, + random_state=cv_config.get("SEED")) + else: + Xtrain, ytrain = Xtemp, ytemp + + model = model_factory.get_model(model_name) + + # select features + feat_subset = select_features(Xtrain, ytrain, + selector=get_feature_selector( + cv_config.get("SEED")), + cv=inner_cv, + scale=False, + frac=1.25) + features.append(feat_subset) + + if cv_config.get("SCALE"): + numeric_cols = Xtrain.select_dtypes( + include=[np.float64, np.int64]).columns.tolist() + scaler = StandardScaler() + Xtrain.loc[:, numeric_cols] = scaler.fit_transform( + Xtrain.loc[:, numeric_cols]) + Xtest.loc[:, numeric_cols] = scaler.transform( + Xtest.loc[:, numeric_cols]) + + model.fit(Xtrain.loc[:, feat_subset], ytrain) + + if cv_config.get("CALIBRATE"): + model = get_calibrated_model(model, + X=Xcal.loc[:, feat_subset], + y=ycal) + + models.append(model) + + # accumulate ground-truths + gts_train.append(ytrain) + gts_val.append(yval) + + # accumulate predictions + train_pred_probas = model.predict_proba(Xtrain.loc[:, feat_subset])[:, 1] + val_pred_probas = model.predict_proba(Xval.loc[:, feat_subset])[:, 1] + + probas_train.append(train_pred_probas) + probas_val.append(val_pred_probas) + + # identify correct and erroneous predictions according to the + # classification cut-off that maximizes the Youden's J index + fpr, tpr, thresholds = roc_curve(ytrain, train_pred_probas) + idx = np.argmax(tpr - fpr) + youden = thresholds[idx] + + labels_val = np.where(val_pred_probas >= youden, 1, 0) + + # indexes of validation instances misclassified by the model + error_idxs = Xval[(yval != labels_val)].index + errors.append(error_idxs) + + # indexes of correct predictions + correct.append(Xval[(yval == labels_val)].index) + + # CV results for current model + curr_est_results = CVModellingOutput( + gts_train=gts_train, + gts_val=gts_val, + probas_train=probas_train, + probas_val=probas_val, + gts_train_conc=np.concatenate(gts_train), + gts_val_conc=np.concatenate(gts_val), + probas_train_conc=np.concatenate(probas_train), + probas_val_conc=np.concatenate(probas_val), + 
models=models, + errors=errors, + correct=correct, + features=features + ) - # identify correct and erroneous predictions according to the - # classification cut-off that maximizes the Youden's J index - fpr, tpr, thresholds = roc_curve(ytrain, train_pred_probas) - idx = np.argmax(tpr - fpr) - youden = thresholds[idx] - - labels_val = np.where(val_pred_probas >= youden, 1, 0) + cv_results[model_name] = curr_est_results +``` - # indexes of validation instances misclassified by the model - error_idxs = Xval[(yval != labels_val)].index - errors.append(error_idxs) - - # indexes of correct predictions - correct.append(Xval[(yval == labels_val)].index) - - -cv_results = CVModellingOutput( - gts_train = np.array(gts_train), - gts_val = np.array(gts_val), - probas_train = np.array(probas_train), - probas_val = np.array(probas_val), - gts_train_conc = np.concatenate(gts_train), - gts_val_conc = np.concatenate(gts_val), - probas_train_conc = np.concatenate(probas_train), - probas_val_conc = np.concatenate(probas_val), - models = models, - errors = np.array(errors), - correct = np.array(correct), - features = np.array(features) +# Plot the average receiver-operating characteristic (ROC) curves +```python +from modelsight.curves import ( + average_roc_curves, + roc_comparisons, + add_annotations ) -# Plot the average receiver-operating characteristic (ROC) curve -model_names = ["EBM"] +model_names = list(cv_results.keys()) alpha = 0.05 alph_str = str(alpha).split(".")[1] alpha_formatted = f".{alph_str}" roc_symbol = "*" +palette = [Colors.green2, Colors.blue, Colors.yellow, Colors.violet, Colors.darksalmon] n_boot = 100 -kwargs = dict() f, ax = plt.subplots(1, 1, figsize=(8, 8)) -f, ax, barplot, bars, aucs_cis = average_roc_curves(cv_results, +kwargs = dict() + +f, ax, barplot, bars, all_data = average_roc_curves(cv_results, colors=palette, model_keys=model_names, show_ci=True, n_boot=n_boot, bars_pos=[ - 0.5, 0.01, 0.4, 0.1*len(model_names)], - random_state=settings.misc.seed, + 0.3, 0.01, 0.6, 0.075*len(model_names)], + random_state=cv_config.get("SEED"), ax=ax, **kwargs) + +roc_comparisons_results = roc_comparisons(cv_results, "EBM") + +kwargs = dict(space_between_whiskers = 0.07) +order = [ + ("EBM", "RF"), + ("EBM", "SVC"), + ("EBM", "LR"), + ("EBM", "KNN") +] +ax_annot = add_annotations(roc_comparisons_results, + alpha = 0.05, + bars=bars, + direction = "vertical", + order = order, + symbol = roc_symbol, + symbol_fontsize = 30, + voffset = -0.05, + ext_voffset=0, + ext_hoffset=0, + ax=barplot, + **kwargs) ``` +This produces the following plot: +

+Average ROC plot +

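The whisker annotations in the figure above are driven by `roc_comparisons_results`, a plain dictionary whose keys have the form `"<first>_<second>"` and whose values are `(first_model, second_model, p_value)` tuples, as documented for `roc_single_comparison` in the second commit of this PR. A minimal sketch of turning it into a textual report:

```python
# Print the pairwise DeLong p-values behind the annotations above.
for key, (algo_a, algo_b, p_value) in roc_comparisons_results.items():
    print(f"{algo_a} vs {algo_b}: P = {p_value:.3f}")
```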
+
+
+### Assess calibration of the ExplainableBoostingClassifier across cross-validation folds
+We will now compute the median Brier score (95% CI) of the ExplainableBoostingClassifier and use it to annotate the calibration plot.
+```python
+from sklearn.metrics import brier_score_loss
+from modelsight.calibration import hosmer_lemeshow_plot
+
+briers = []
+for gt, preds in zip(cv_results["EBM"].gts_val, cv_results["EBM"].probas_val):
+    brier = brier_score_loss(gt, preds)
+    briers.append(brier)
+
+brier_low, brier_med, brier_up = np.percentile(briers, [2.5, 50, 97.5])
+
+brier_annot = f"{brier_med:.2f} ({brier_low:.2f} - {brier_up:.2f})"
+
+f, ax = plt.subplots(1, 1, figsize=(11,6))
+
+f, ax = hosmer_lemeshow_plot(cv_results["EBM"].gts_val_conc,
+                             cv_results["EBM"].probas_val_conc,
+                             n_bins=10,
+                             colors=(Colors.darksalmon, Colors.green2),
+                             annotate_bars=True,
+                             title="",
+                             brier_score_annot=brier_annot,
+                             ax=ax
+                             )
+```
+This produces the following plot:
+

+Hosmer-Lemeshow calibration plot
+

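Under the hood, `hosmer_lemeshow_plot` builds its bars from `hosmer_lemeshow_table`, which groups observed and predicted probabilities into bins (see the docstring changes in the second commit below). If you want the underlying numbers rather than the figure, here is a minimal sketch; note that the import path is assumed from this PR's file layout (`modelsight/calibration/calib.py`) and may differ if the package re-exports the helper:

```python
# Inspect the binned observed vs. predicted frequencies behind the plot above.
# Import path assumed from the PR's file layout; adjust if the helper is
# re-exported elsewhere in the package.
from modelsight.calibration.calib import hosmer_lemeshow_table

table = hosmer_lemeshow_table(cv_results["EBM"].gts_val_conc,
                              cv_results["EBM"].probas_val_conc,
                              n_bins=10)
print(table.dtype.names)  # recarray columns
print(table)
```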
+ ## :gift_heart: Contributing From 9fc34fd5276d33c920653b847d112dc03c9ba92d Mon Sep 17 00:00:00 2001 From: Francesco Pisu Date: Sat, 23 Sep 2023 19:00:42 +0200 Subject: [PATCH 2/2] docs: improve docstring docs #13 --- src/modelsight/_typing.py | 36 +++- src/modelsight/calibration/calib.py | 42 +++-- src/modelsight/curves/_delong.py | 259 ++++++++++++++++------------ src/modelsight/curves/compare.py | 245 ++++++++++++++++++++++---- 4 files changed, 429 insertions(+), 153 deletions(-) diff --git a/src/modelsight/_typing.py b/src/modelsight/_typing.py index e9ef2e4..a87c1ba 100644 --- a/src/modelsight/_typing.py +++ b/src/modelsight/_typing.py @@ -1,3 +1,7 @@ +""" +This file deals with the implementation of custom types. +""" + import sys import random import numpy as np @@ -24,7 +28,37 @@ @dataclass -class CVModellingOutput: +class CVModellingOutput: + """This class stores the data generated by a cross-validation + process for a single estimator. + + Arguments + --------- + gts_train: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing training ground-truth. + gts_val: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing validation ground-truth. + gts_train_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled training ground-truth. + gts_val_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled validation ground-truth. + probas_train: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing training predicted probabilities. + probas_val: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing validation predicted probabilities. + probas_train_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled training predicted probabilities. + probas_val_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled validation predicted probabilities. + models: List[Estimator] + A list of (n_repetitions * n_outer_splits) fitted estimators. + errors: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of validation prediction errors. + correct: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of validation correct predictions. + features: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of subsets of selected features. + """ gts_train: ArrayLike gts_val: ArrayLike gts_train_conc: ArrayLike diff --git a/src/modelsight/calibration/calib.py b/src/modelsight/calibration/calib.py index ba9cbd4..37f9a60 100644 --- a/src/modelsight/calibration/calib.py +++ b/src/modelsight/calibration/calib.py @@ -1,3 +1,8 @@ +""" +This file deals with the implementation of the Hosmer-Lemeshow plot for the +assessment of calibration of predicted probabilities. +""" + import numpy as np from typing import Tuple import matplotlib.pyplot as plt @@ -6,11 +11,14 @@ def ntile_name(n: int) -> str: - """Returns the ntile name corresponding to an ntile integer. + """ + Returns the ntile name corresponding to an ntile integer. + Parameters ---------- n : int An ntile integer. + Returns ------- ntile_name : str @@ -30,13 +38,16 @@ def ntile_name(n: int) -> str: def make_recarray(y_true: ArrayLike, y_pred: ArrayLike) -> np.recarray: - """Combines arrays into a recarray. + """ + Combines arrays into a recarray. + Parameters ---------- y_true : array Observed labels, either 0 or 1. y_pred : array Predicted probabilities, floats on [0, 1]. 
+ Returns ------- table : recarray @@ -53,7 +64,9 @@ def make_recarray(y_true: ArrayLike, def hosmer_lemeshow_table(y_true: ArrayLike, y_pred: ArrayLike, n_bins: int = 10) -> np.recarray: - """Constructs a Hosmer–Lemeshow table. + """ + Constructs a Hosmer–Lemeshow table. + Parameters ---------- y_true : array @@ -63,6 +76,7 @@ def hosmer_lemeshow_table(y_true: ArrayLike, n_bins : int, optional The number of groups to create. The default value is 10, which corresponds to deciles of predicted probabilities. + Returns ------- table : recarray @@ -100,26 +114,28 @@ def hosmer_lemeshow_plot(y_true: ArrayLike, Parameters ---------- - y_true: ArrayLike + y_true : ArrayLike (n_obs,) shaped array of ground-truth values - y_pred: ArrayLike + y_pred : ArrayLike (n_obs,) shaped array of predicted probabilities - n_bins: int + n_bins : int Number of bins to group observed and predicted probabilities into - colors: Tuple[str, str] + colors : Tuple[str, str] Pair of colors for observed (line) and predicted (vertical bars) probabilities. - annotate_bars: bool + annotate_bars : bool Whether bars should be annotated with the number of observed probabilities in each bin. - title: str + title : str Title to display on top of the calibration plot. - brier_score_annot: str + brier_score_annot : str Optional brier score (95% CI) annotation on the top-left corner. - ax: plt.Axes + ax : plt.Axes A matplotlib Axes object to draw the calibration plot into. If None, an Axes object is created by default. + Returns ------- - Tuple[plt.Figure, plt.Axes]: - Corresponding figure and Axes + f, ax : Tuple[plt.Figure, plt.Axes] + f: pyplot figure + ax: pyplot Axes """ table = hosmer_lemeshow_table(y_true, y_pred, n_bins) # transform observed and predicted frequencies in percentage relative to the bin dimension diff --git a/src/modelsight/curves/_delong.py b/src/modelsight/curves/_delong.py index 77dd1f3..bdffa34 100644 --- a/src/modelsight/curves/_delong.py +++ b/src/modelsight/curves/_delong.py @@ -1,124 +1,169 @@ +""" +This file deals with the implementation of the DeLong test for the comparison of +pairs of correlated areas under the receiver-operating characteristics curves. +""" + import pandas as pd import numpy as np import scipy.stats +from typing import Tuple # AUC comparison adapted from # https://github.com/Netflix/vmaf/ -def compute_midrank(x): - """Computes midranks. - Args: - x - a 1D numpy array - Returns: - array of midranks - """ - J = np.argsort(x) - Z = x[J] - N = len(x) - T = np.zeros(N, dtype=np.float64) - i = 0 - while i < N: - j = i - while j < N and Z[j] == Z[i]: - j += 1 - T[i:j] = 0.5*(i + j - 1) - i = j - T2 = np.empty(N, dtype=np.float64) - # Note(kazeevn) +1 is due to Python using 0-based indexing - # instead of 1-based in the AUC formula in the paper - T2[J] = T + 1 - return T2 +def compute_midrank(x: np.ndarray) -> np.ndarray: + """ + Computes midranks. + + Parameters + ---------- + x : np.ndarray + a 1-d array of predicted probabilities. 
+ + Returns + ------- + T2 : np.ndarray + array of midranks + """ + J = np.argsort(x) + Z = x[J] + N = len(x) + T = np.zeros(N, dtype=np.float64) + i = 0 + while i < N: + j = i + while j < N and Z[j] == Z[i]: + j += 1 + T[i:j] = 0.5*(i + j - 1) + i = j + T2 = np.empty(N, dtype=np.float64) + # Note(kazeevn) +1 is due to Python using 0-based indexing + # instead of 1-based in the AUC formula in the paper + T2[J] = T + 1 + return T2 -def fastDeLong(predictions_sorted_transposed, label_1_count): - """ - The fast version of DeLong's method for computing the covariance of - unadjusted AUC. - Args: - predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples] - sorted such as the examples with label "1" are first - Returns: - (AUC value, DeLong covariance) - Reference: - @article{sun2014fast, - title={Fast Implementation of DeLong's Algorithm for - Comparing the Areas Under Correlated Receiver Operating Characteristic Curves}, - author={Xu Sun and Weichao Xu}, - journal={IEEE Signal Processing Letters}, - volume={21}, - number={11}, - pages={1389--1393}, - year={2014}, - publisher={IEEE} - } - """ - # Short variables are named as they are in the paper - m = label_1_count - n = predictions_sorted_transposed.shape[1] - m - positive_examples = predictions_sorted_transposed[:, :m] - negative_examples = predictions_sorted_transposed[:, m:] - k = predictions_sorted_transposed.shape[0] +def fastDeLong(predictions_sorted_transposed: np.ndarray, + label_1_count: int) -> Tuple[np.ndarray, np.ndarray]: + """ + The fast version of DeLong's method for computing the covariance of + unadjusted AUC. + + Parameters + ---------- + predictions_sorted_transposed : a (n_classifiers, n_obs) numpy array containing + the predicted probabilities by the two classifiers in the comparison. + These probabilities are sorted such that the examples with label "1" come first. 
+ + Returns + ------- + aucs, delongcov : Tuple[np.ndarray, np.ndarray] + aucs: array of AUC values + delongcov: array of DeLong covariance + + Reference + --------- + @article{sun2014fast, + title={Fast Implementation of DeLong's Algorithm for + Comparing the Areas Under Correlated Receiver Operating Characteristic Curves}, + author={Xu Sun and Weichao Xu}, + journal={IEEE Signal Processing Letters}, + volume={21}, + number={11}, + pages={1389--1393}, + year={2014}, + publisher={IEEE} + } + """ + # Short variables are named as they are in the paper + m = label_1_count + n = predictions_sorted_transposed.shape[1] - m + positive_examples = predictions_sorted_transposed[:, :m] + negative_examples = predictions_sorted_transposed[:, m:] + k = predictions_sorted_transposed.shape[0] - tx = np.empty([k, m], dtype=np.float64) - ty = np.empty([k, n], dtype=np.float64) - tz = np.empty([k, m + n], dtype=np.float64) - for r in range(k): - tx[r, :] = compute_midrank(positive_examples[r, :]) - ty[r, :] = compute_midrank(negative_examples[r, :]) - tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :]) - aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n - v01 = (tz[:, :m] - tx[:, :]) / n - v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m - sx = np.cov(v01) - sy = np.cov(v10) - delongcov = sx / m + sy / n - return aucs, delongcov + tx = np.empty([k, m], dtype=np.float64) + ty = np.empty([k, n], dtype=np.float64) + tz = np.empty([k, m + n], dtype=np.float64) + for r in range(k): + tx[r, :] = compute_midrank(positive_examples[r, :]) + ty[r, :] = compute_midrank(negative_examples[r, :]) + tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :]) + aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n + v01 = (tz[:, :m] - tx[:, :]) / n + v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m + sx = np.cov(v01) + sy = np.cov(v10) + delongcov = sx / m + sy / n + return aucs, delongcov -def calc_pvalue(aucs, sigma): - """Computes log(10) of p-values. - Args: - aucs: 1D array of AUCs - sigma: AUC DeLong covariances - Returns: - log10(pvalue) - """ - l = np.array([[1, -1]]) - z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T)) - return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10) +def calc_pvalue(aucs: np.ndarray, sigma: np.ndarray) -> float: + """ + Computes log(10) of p-values. + + Parameters + ---------- + aucs : np.array + a 1-d array of AUCs + sigma : np.array + an array AUC DeLong covariances + + Returns + ------- + p : float + log10(pvalue) + """ + l = np.array([[1, -1]]) + z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T)) + p = np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10) + return p -def compute_ground_truth_statistics(ground_truth): - assert np.array_equal(np.unique(ground_truth), [0, 1]) - order = (-ground_truth).argsort() - label_1_count = int(ground_truth.sum()) - return order, label_1_count +def compute_ground_truth_statistics(ground_truth: np.ndarray) -> Tuple[np.ndarray, int]: + """ + Compute statistics of ground-truth array. + Parameters + ---------- + ground_truth : np.ndarray + a (n_obs,) array of 0 and 1 values representing the ground-truth. 
-def delong_roc_variance(ground_truth, predictions): - """ - Computes ROC AUC variance for a single set of predictions - Args: - ground_truth: np.array of 0 and 1 - predictions: np.array of floats of the probability of being class 1 - """ - order, label_1_count = compute_ground_truth_statistics(ground_truth) - predictions_sorted_transposed = predictions[np.newaxis, order] - aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count) - assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers" - return aucs[0], delongcov + Returns + ------- + order, label_1_count : Tuple[np.ndarray, int] + order is a numpy array of sorted indexes + label_1_count is the count of data points of the positive class. + """ + assert np.array_equal(np.unique(ground_truth), [0, 1]) + order = (-ground_truth).argsort() + label_1_count = int(ground_truth.sum()) + return order, label_1_count -def delong_roc_test(ground_truth, predictions_one, predictions_two): - """ - Computes log(p-value) for hypothesis that two ROC AUCs are different - Args: - ground_truth: np.array of 0 and 1 - predictions_one: predictions of the first model, - np.array of floats of the probability of being class 1 - predictions_two: predictions of the second model, - np.array of floats of the probability of being class 1 - """ - order, label_1_count = compute_ground_truth_statistics(ground_truth) - predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order] - aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count) - return 10**calc_pvalue(aucs, delongcov).item() \ No newline at end of file +def delong_roc_test(ground_truth: np.ndarray, + predictions_one: np.ndarray, + predictions_two: np.ndarray) -> float: + """ + Compare areas-under-curve of two estimators using the DeLong test. + Concretely, it computes the pvalue for hypothesis that two ROC AUCs are different. + + Parameters + ---------- + ground_truth : np.ndarray + a (n_obs,) array of 0 and 1 representing ground-truths. + predictions_one : np.ndarray + a (n_obs,) array of probabilities of class 1 predicted by the first model. + predictions_two : np.ndarray + a (n_obs,) array of probabilities of class 1 predicted by the second model. + + Returns + ------- + p : float + the p-value for hypothesis that two ROC AUCs are different. + """ + order, label_1_count = compute_ground_truth_statistics(ground_truth) + predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order] + aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count) + + p = 10**calc_pvalue(aucs, delongcov).item() + return p \ No newline at end of file diff --git a/src/modelsight/curves/compare.py b/src/modelsight/curves/compare.py index 4d748ec..aa298b8 100644 --- a/src/modelsight/curves/compare.py +++ b/src/modelsight/curves/compare.py @@ -1,37 +1,84 @@ +""" +This file deals with the implementation of functions that allow annotating plots +with statistical tests results between pairs of estimators. 
+""" + from typing import Callable, Dict, Tuple, List import matplotlib from matplotlib import patches import matplotlib.pyplot as plt -from scipy.stats import ttest_ind -from sklearn.metrics import average_precision_score from src.modelsight.curves._delong import delong_roc_test +from src.modelsight._typing import CVModellingOutput -def annot_stat_vertical(text, x, y1, y2, ww, - col='k', - fontsize=13, - voffset = 0, - n_elems = None, +def annot_stat_vertical(text:str, + x: float, + y1: float, y2: float, + ww: float = 0.02, + col: str = 'black', + fontsize: int = 13, + voffset: float = 0, + n_elems: int = None, ax=None, **kwargs): """ - ww: float - whisker width + Draw a vertical whisker at position `x` that spans through `y1` to `y2` with annotation specified + by `text`. + + Parameters + ---------- + text : str + Annotation for whisker. + x : float + x-position the whisker is positioned at. + y1 :float + starting y position. + y2 : float + ending y position. + ww : float, optional + whisker width, by default 0.02 + col : str, optional + whisker color, by default 'black' + fontsize : int, optional + fontsize for the annotation, by default 13 + voffset : float, optional + vertical offset for the annotation, by default 0. + Some font families and characters occupy different vertical spaces; + this parameter allows compensating for such variations. + n_elems : int, optional + number of discrete elements in the y-axis, by default None. + This value is precomputed by the caller (add_annotations) and passed + to this function as input. + ax : plt.Axes, optional + a pyplot Axes to draw annotations on, by default None + **kwargs + rect_h_base: float, optional + base height of rectangle patch for single-character annotations, by default 0.1 + fontsize_nonsignif, optional + fontsize for multi-character annotations (here called non significant annotations + to reflect the fact that single-character annotations most often use some kind + of symbol to denote statistical significance, e.g. *), by default `fontsize` (i.e., 13) """ ax = plt.gca() if ax is None else ax # we want the text to be centered on the whisker text_x_pos = x + ww - #+ 0.01 + text_y_pos = (y1+y2)/2 # draw whisker from y1 to y2 with width `ww` ax.plot([x, x + ww, x + ww, x], [y1, y1, y2, y2], lw=1, c=col) - if len(text) == 1: - #text_y_pos = (y1+y2)/2 - - # draw text at (text_x_pos, text_y_pos) # + 0.15 + # this is the case of a whisker being annotated with a single character. + # by default, symbols do not enforce a white background, hence when + # superimposed on whiskers the readibility is limited. + # here we enforce a white rectangle patch beneath the symbol to enhance + # readibility of annotations. + # the built-in bbox parameter of pyplot's .text() doesn't produce + # acceptable results, hence we came up with a custom implementation for + # single-character annotations. + if len(text) == 1: + # draw text at (text_x_pos, (text_y_pos - voffset) + 0.17) ax.text( text_x_pos, (text_y_pos - voffset) + 0.17, text, ha='center', va='center', color=col, @@ -61,6 +108,9 @@ def annot_stat_vertical(text, x, y1, y2, ww, ax.add_patch(rect) else: + # this is the case of multi-character annotations. + # here, we leverage the built-in bbox of pyplot's text method + # that allows drawing a bounding box beneath the annotation. 
fontsize_nonsignif = kwargs.pop("fontsize_nonsignif", fontsize) ax.text( text_x_pos, text_y_pos, text, @@ -73,15 +123,52 @@ def annot_stat_vertical(text, x, y1, y2, ww, ) ) -from matplotlib import patches -def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13, - voffset = 0, - n_elems = None, - ax=None, - **kwargs): + +def annot_stat_horizontal(text: str, + x1: float, x2: float, + y: float, + wh: float = 0.02, + col: str = "black", + fontsize: int = 13, + voffset: float = 0, + n_elems:int = None, + ax: plt.Axes = None, + **kwargs): """ - ww: float - whisker width + Draw an horizontal whisker at position `y` that spans through `x1` to `x2` with annotation specified + by `text`. + + Parameters + ---------- + text : str + Annotation for whisker. + x1 : float + starting x position. + x2 :float + ending x position. + y : float + y-position the whisker is positioned at. + wh : float, optional + whisker height, by default 0.02 + col : str, optional + whisker color, by default 'black' + fontsize : int, optional + fontsize for the annotation, by default 13 + voffset : float, optional + vertical offset for the annotation, by default 0. + Some font families and characters occupy different vertical spaces; + this parameter allows compensating for such variations. + n_elems : int, optional + number of discrete elements in the y-axis, by default None. + This value is precomputed by the caller (add_annotations) and passed + to this function as input. + ax : plt.Axes, optional + a pyplot Axes to draw annotations on, by default None + **kwargs + fontsize_nonsignif, optional + fontsize for multi-character annotations (here called non significant annotations + to reflect the fact that single-character annotations most often use some kind + of symbol to denote statistical significance, e.g. *), by default `fontsize` (i.e., 13) """ ax = plt.gca() if ax is None else ax @@ -93,10 +180,16 @@ def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13, # draw whisker from y1 to y2 with width `ww` ax.plot([x1, x1, x2, x2], [y, y + wh, y + wh, y], lw=1, c=col, clip_on=False) - - if len(text) == 1: - #text_y_pos = (y1+y2)/2 + # this is the case of a whisker being annotated with a single character. + # by default, symbols do not enforce a white background, hence when + # superimposed on whiskers the readibility is limited. + # here we enforce a white rectangle patch beneath the symbol to enhance + # readibility of annotations. + # the built-in bbox parameter of pyplot's .text() doesn't produce + # acceptable results, hence we came up with a custom implementation for + # single-character annotations. + if len(text) == 1: # draw text at (text_x_pos, text_y_pos) # + 0.15 ax.text( text_x_pos, text_y_pos + voffset, text, @@ -140,14 +233,12 @@ def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13, ) -from typing import Tuple, List, Dict - def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], alpha: float, bars: matplotlib.container.BarContainer, direction: str, order: List[Tuple[str, str]], - symbol: str, + symbol: str = "*", symbol_fontsize: int = 22, voffset: float = 0, ext_voffset: float = 0, @@ -155,6 +246,54 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], P_val_rounding: int = 2, ax: plt.Axes = None, **kwargs): + """ + Annotates the specified plot (`ax`) with the provided comparisons results either vertically or horizontally + depending on the value of `direction`. 
+ + Parameters + ---------- + comparisons : Dict[str, Tuple[str, str, float]] + The results of models comparisons. + alpha : float + The significance level used for formatting the P value of comparisons. + bars : matplotlib.container.BarContainer + A list of matplotlib's bars that is used to access the bar's width or height + when annotating horizontally and vertically, respectively. + direction : str + The direction for annotation. Possible values are "horizontal" and "vertical". + order : List[Tuple[str, str]] + The order in which the comparisons should be displayed. + Each entry of this list is a tuple where elements are algorithm's names. + symbol : str, optional + The symbol used in place of the P value when statistical significance is achieved + accoring to the specified alpha, by default "*". + symbol_fontsize : int, optional + Fontsize for the symbol used when statistical significance is achieved, by default 22 + voffset : float, optional + vertical offset for the annotation, by default 0., by default 0 + ext_voffset : float, optional + Additional vertical offset for vertical annotations. + Ignored when direction = "horizontal", by default 0 + ext_hoffset : float, optional + Additional horizontal offset for horizontal annotations. + Ignored when direction = "vertical", by default 0 + P_val_rounding : int, optional + Number of decimal places to round P values at, by default 2 + ax : plt.Axes, optional + The plot to be annotated, by default None + + Returns + ------- + ax : plt.Axes + The annotated plot. + + Raises + ------ + ValueError + When ax is None + ValueError + Whenever a comparison key doesn't exist. + """ if not ax: raise ValueError("I need an Axes to draw comparisons on.") @@ -188,7 +327,7 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], wh=0.02, col="black", fontsize=symbol_fontsize, - voffset = -0.02, + voffset = voffset, #-0.02 ext_offset = ext_hoffset, n_elems = len(entity_labels), ax=ax, @@ -219,16 +358,58 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], return ax -def roc_single_comparison(cv_preds, fst_algo, snd_algo): +def roc_single_comparison(cv_preds: CVModellingOutput, + fst_algo: str, + snd_algo: str) -> Dict[str, Tuple[str, str, float]]: + """Perform a single comparison of two areas under Receiver Operating Characteristic curves + computed on the same set of data points by the DeLong test. + + Parameters + ---------- + cv_preds : CVModellingOutput + The output of a cross-validation process encompassing mulitple (n>=2) models. + fst_algo : str + The name of the first algorithm for the comparison. + Must be an existing key of `cv_preds`. + snd_algo : str + The name of the second algorithm for the comparison. + Must be an existing key of `cv_preds`. + + Returns + ------- + comparison_result : Dict[str, Tuple[str, str, float]] + The output of the comparison. This is a dictionary where the key is + of the form "_" and the value is a tuple of three + elements, the first two are the names of the algorithms being compared + and the third element is the P value for the null hypothesis that + the two AUC values are equal. 
+    """
    ground_truths = cv_preds[fst_algo].gts_val_conc
    fst_algo_probas = cv_preds[fst_algo].probas_val_conc
    snd_algo_probas = cv_preds[snd_algo].probas_val_conc
    P = delong_roc_test(ground_truths, fst_algo_probas, snd_algo_probas)
    cmp_key = f"{fst_algo}_{snd_algo}"
-    return {cmp_key: (fst_algo, snd_algo, P)}
+    comparison_result = {cmp_key: (fst_algo, snd_algo, P)}
+    return comparison_result

-def roc_comparisons(cv_preds, target_algo):
+def roc_comparisons(cv_preds: Dict[str, CVModellingOutput],
+                    target_algo: str):
+    """
+    Compares the AUC of the specified algorithm with the AUCs of all other algorithms.
+
+    Parameters
+    ----------
+    cv_preds : Dict[str, CVModellingOutput]
+        The output of a cross-validation process encompassing multiple (n>=2) models.
+    target_algo : str
+        The name of the target algorithm whose AUC will be compared with all other AUCs.
+
+    Returns
+    -------
+    comparisons : Dict[str, Tuple[str, str, float]]
+        A dictionary containing the results of all comparisons. See output of `roc_single_comparison`.
+    """
    comparisons = dict()
    for algo_name in cv_preds.keys():