From 0cabbb02a2b48617110bac0738a2c2b9eeb144dc Mon Sep 17 00:00:00 2001 From: Francesco Pisu Date: Sun, 17 Sep 2023 09:40:11 +0200 Subject: [PATCH 1/2] docs: update README with graphical examples #10 --- README.md | 351 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 240 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index c5d42ef..1a0cb90 100644 --- a/README.md +++ b/README.md @@ -4,17 +4,22 @@ > Better insights into Machine Learning models performance. -Modelsight is a collection of functions that create publication-ready graphics for the visual evaluation of binary classifiers adhering to the scikit-learn interface. +Modelsight is a collection of functions that create publication-ready graphics for the visual evaluation of cross-validated binary classifiers adhering to the scikit-learn interface. -Modelsight is strongly oriented towards the evaluation of already fitted `ExplainableBoostingClassifier` of the [interpretml](https://github.com/interpretml/interpret) package. +While the majority of `modelsight` functionalities are compatible with any binary classifier that follows scikit-learn's interface, it's worth noting that, as of now, some features are exclusive to ExplainableBoostingClassifier from the interpretml package. | Overview | | |---|---| +| **Open Source** | [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) | +| **CI/CD** | [![github-actions](https://img.shields.io/github/actions/workflow/status/francescopisu/modelsight/ci-cd.yml?logo=github)](https://github.com/francescopisu/modelsight/actions/workflows/ci-cd.yml) [![!codecov](https://img.shields.io/codecov/c/github/francescopisu/modelsight?label=codecov&logo=codecov)](https://codecov.io/gh/francescopisu/modelsight) [![readthedocs](https://img.shields.io/readthedocs/modelsight?logo=readthedocs)](https://modelsight.readthedocs.io/en/latest/) | | **Code** | [![!pypi](https://img.shields.io/pypi/v/modelsight?color=orange)](https://pypi.org/project/modelsight/) [![!python-versions](https://img.shields.io/pypi/pyversions/modelsight)](https://www.python.org/) | ## 💫 Features -Our goal is to streamline the visual assessment of binary classifiers by creating a set of functions designed to generate publication-ready plots. +Our objective is to enhance the visual assessment of cross-validated binary classifiers by introducing a set of functions tailored for generating publication-ready plots. +To effectively utilize modelsight, you should provide a dictionary where the keys represent model names, and the values consist of CVModellingOutput objects. CVModellingOutput is a custom data class designed to encapsulate the results of a cross-validation process for a specific binary classifier. For detailed information about CVModellingOutput, please refer to the [API reference](https://modelsight.readthedocs.io/en/latest/autoapi/modelsight/_typing/index.html#modelsight._typing.CVModellingOutput). + +Before using the functionalities offered by this library, ensure that your cross-validation data adheres to this structure. | Module | Status | Links | |---|---|---| @@ -38,156 +43,280 @@ $ pip install modelsight ``` ## :zap: Quickstart +The following code will cross-validate five different binary classifiers and produce a dictionary of CVModellingOutput. 
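Before the full example, this is the shape of that dictionary in schematic form. The field meanings follow the `CVModellingOutput` documentation linked above; the `...` values are placeholders, not working code, and the complete loop that populates them for real comes right after.

```python
# Schematic structure expected by modelsight's plotting functions.
# All values below are placeholders (...); see the quickstart loop for how
# they are actually populated from a repeated cross-validation.
from modelsight._typing import CVModellingOutput

cv_results = {
    "EBM": CVModellingOutput(
        gts_train=...,          # per-fold training ground truths (list of arrays)
        gts_val=...,            # per-fold validation ground truths
        probas_train=...,       # per-fold training predicted probabilities
        probas_val=...,         # per-fold validation predicted probabilities
        gts_train_conc=...,     # same ground truths, concatenated across folds
        gts_val_conc=...,
        probas_train_conc=...,  # same probabilities, concatenated across folds
        probas_val_conc=...,
        models=...,             # fitted estimators, one per fold
        errors=...,             # optional: indexes of misclassified validation rows
        correct=...,            # optional: indexes of correctly classified rows
        features=...,           # optional: selected feature subset per fold
    ),
    # ...one entry per model you want to compare, keyed by its display name
}
```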
+ ```python +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt from sklearn.datasets import load_breast_cancer +from sklearn.model_selection import train_test_split from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold +from sklearn.metrics import roc_curve +from sklearn.svm import SVC +from sklearn.ensemble import RandomForestClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.linear_model import LogisticRegression from interpret.glassbox import ExplainableBoostingClassifier -from utils import ( +from tests.utils import ( select_features, get_feature_selector, - get_model, get_calibrated_model ) -from modelsight.curves import average_roc_curves -from modelsight.config import settings from modelsight._typing import CVModellingOutput, Estimator +# Load data X, y = load_breast_cancer(return_X_y=True, as_frame=True) -outer_cv = RepeatedStratifiedKFold(n_repeats=cv_config.get("N_REPEATS"), - n_splits=cv_config.get("N_SPLITS"), - random_state=settings.misc.seed) -inner_cv = StratifiedKFold(n_splits=cv_config.get("N_SPLITS"), - shuffle=cv_config.get("SHUFFLE"), - random_state=settings.misc.seed) - -gts_train = [] -gts_val = [] -probas_train = [] -probas_val = [] -gts_train_conc = [] -gts_val_conc = [] -probas_train_conc = [] -probas_val_conc = [] - -models = [] -errors = [] -correct = [] -features = [] - -ts = datetime.timestamp(datetime.now()) +# Define factory for binary classifiers +class BinaryModelFactory: + def get_model(self, model_name: str) -> Estimator: + if model_name == "EBM": + return ExplainableBoostingClassifier(random_state=cv_config.get("SEED"), + interactions=6, + learning_rate=0.02, + min_samples_leaf=5, + n_jobs=-1) + elif model_name == "LR": + return LogisticRegression(penalty="elasticnet", + solver="saga", + l1_ratio=0.3, + max_iter=10000, + random_state=cv_config.get("SEED")) + elif model_name == "SVC": + return SVC(probability=True, + class_weight="balanced", + random_state=cv_config.get("SEED")) + elif model_name == "RF": + return RandomForestClassifier(random_state=cv_config.get("SEED"), + n_estimators=5, + max_depth=3, + n_jobs=-1) + elif model_name == "KNN": + return KNeighborsClassifier(n_jobs=-1) + else: + raise ValueError(f"{model_name} is not a valid estimator name.") + cv_config = { - "N_REPEATS": 10, + "N_REPEATS": 2, "N_SPLITS": 10, "SHUFFLE": True, "SCALE": False, "CALIBRATE": True, "CALIB_FRACTION": 0.15, + "SEED": 1303 } -for i, (train_idx, val_idx) in enumerate(outer_cv.split(X, y)): - Xtemp, ytemp = X.iloc[train_idx, :], y.iloc[train_idx] - Xval, yval = X.iloc[val_idx, :], y.iloc[val_idx] - - if cv_config.get("CALIBRATE"): - Xtrain, Xcal, ytrain, ycal = train_test_split(Xtemp, ytemp, - test_size=cv_config.get("CALIB_FRACTION"), - stratify=ytemp, - random_state=settings.misc.seed) - else: - Xtrain, ytrain = Xtemp, ytemp - - model = get_model(seed=settings.misc.seed) - - # select features - feat_subset = select_features(Xtrain, ytrain, - selector=get_feature_selector(settings.misc.seed), - cv=inner_cv, - scale=False, - frac=1.25) - features.append(feat_subset) - - if cv_config.get("SCALE"): - numeric_cols = Xtrain.select_dtypes(include=[np.float64, np.int64]).columns.tolist() - scaler = StandardScaler() - Xtrain.loc[:, numeric_cols] = scaler.fit_transform(Xtrain.loc[:, numeric_cols]) - Xtest.loc[:, numeric_cols] = scaler.transform(Xtest.loc[:, numeric_cols]) - - model.fit(Xtrain[feat_subset], ytrain) - - if cv_config.get("CALIBRATE"): - model = get_calibrated_model(model, - X=Xcal.loc[:, 
feat_subset], - y=ycal) - - models.append(model) - - # accumulate ground-truths - gts_train.append(ytrain) - gts_val.append(yval) - - # accumulate predictions - train_pred_probas = model.predict_proba(Xtrain)[:, 1] - val_pred_probas = model.predict_proba(Xval)[:, 1] - - probas_train.append(train_pred_probas) - probas_val.append(val_pred_probas) +# define outer and inner cross-validation schemes +outer_cv = RepeatedStratifiedKFold(n_repeats=cv_config.get("N_REPEATS"), + n_splits=cv_config.get("N_SPLITS"), + random_state=cv_config.get("SEED")) + +inner_cv = StratifiedKFold(n_splits=cv_config.get("N_SPLITS"), + shuffle=cv_config.get("SHUFFLE"), + random_state=cv_config.get("SEED")) + +# define models names, factory and cv results dictionary +models_names = ["EBM", "LR", "SVC", "RF", "KNN"] +model_factory = BinaryModelFactory() +cv_results = dict() + +for model_name in models_names: + print(f"Processing model {model_name}\n") + + gts_train = [] + gts_val = [] + probas_train = [] + probas_val = [] + gts_train_conc = [] + gts_val_conc = [] + probas_train_conc = [] + probas_val_conc = [] + + models = [] + errors = [] + correct = [] + features = [] + + for i, (train_idx, val_idx) in enumerate(outer_cv.split(X, y)): + Xtemp, ytemp = X.iloc[train_idx, :], y.iloc[train_idx] + Xval, yval = X.iloc[val_idx, :], y.iloc[val_idx] + + if cv_config.get("CALIBRATE"): + Xtrain, Xcal, ytrain, ycal = train_test_split(Xtemp, ytemp, + test_size=cv_config.get( + "CALIB_FRACTION"), + stratify=ytemp, + random_state=cv_config.get("SEED")) + else: + Xtrain, ytrain = Xtemp, ytemp + + model = model_factory.get_model(model_name) + + # select features + feat_subset = select_features(Xtrain, ytrain, + selector=get_feature_selector( + cv_config.get("SEED")), + cv=inner_cv, + scale=False, + frac=1.25) + features.append(feat_subset) + + if cv_config.get("SCALE"): + numeric_cols = Xtrain.select_dtypes( + include=[np.float64, np.int64]).columns.tolist() + scaler = StandardScaler() + Xtrain.loc[:, numeric_cols] = scaler.fit_transform( + Xtrain.loc[:, numeric_cols]) + Xtest.loc[:, numeric_cols] = scaler.transform( + Xtest.loc[:, numeric_cols]) + + model.fit(Xtrain.loc[:, feat_subset], ytrain) + + if cv_config.get("CALIBRATE"): + model = get_calibrated_model(model, + X=Xcal.loc[:, feat_subset], + y=ycal) + + models.append(model) + + # accumulate ground-truths + gts_train.append(ytrain) + gts_val.append(yval) + + # accumulate predictions + train_pred_probas = model.predict_proba(Xtrain.loc[:, feat_subset])[:, 1] + val_pred_probas = model.predict_proba(Xval.loc[:, feat_subset])[:, 1] + + probas_train.append(train_pred_probas) + probas_val.append(val_pred_probas) + + # identify correct and erroneous predictions according to the + # classification cut-off that maximizes the Youden's J index + fpr, tpr, thresholds = roc_curve(ytrain, train_pred_probas) + idx = np.argmax(tpr - fpr) + youden = thresholds[idx] + + labels_val = np.where(val_pred_probas >= youden, 1, 0) + + # indexes of validation instances misclassified by the model + error_idxs = Xval[(yval != labels_val)].index + errors.append(error_idxs) + + # indexes of correct predictions + correct.append(Xval[(yval == labels_val)].index) + + # CV results for current model + curr_est_results = CVModellingOutput( + gts_train=gts_train, + gts_val=gts_val, + probas_train=probas_train, + probas_val=probas_val, + gts_train_conc=np.concatenate(gts_train), + gts_val_conc=np.concatenate(gts_val), + probas_train_conc=np.concatenate(probas_train), + probas_val_conc=np.concatenate(probas_val), + 
models=models, + errors=errors, + correct=correct, + features=features + ) - # identify correct and erroneous predictions according to the - # classification cut-off that maximizes the Youden's J index - fpr, tpr, thresholds = roc_curve(ytrain, train_pred_probas) - idx = np.argmax(tpr - fpr) - youden = thresholds[idx] - - labels_val = np.where(val_pred_probas >= youden, 1, 0) + cv_results[model_name] = curr_est_results +``` - # indexes of validation instances misclassified by the model - error_idxs = Xval[(yval != labels_val)].index - errors.append(error_idxs) - - # indexes of correct predictions - correct.append(Xval[(yval == labels_val)].index) - - -cv_results = CVModellingOutput( - gts_train = np.array(gts_train), - gts_val = np.array(gts_val), - probas_train = np.array(probas_train), - probas_val = np.array(probas_val), - gts_train_conc = np.concatenate(gts_train), - gts_val_conc = np.concatenate(gts_val), - probas_train_conc = np.concatenate(probas_train), - probas_val_conc = np.concatenate(probas_val), - models = models, - errors = np.array(errors), - correct = np.array(correct), - features = np.array(features) +# Plot the average receiver-operating characteristic (ROC) curves +```python +from modelsight.curves import ( + average_roc_curves, + roc_comparisons, + add_annotations ) -# Plot the average receiver-operating characteristic (ROC) curve -model_names = ["EBM"] +model_names = list(cv_results.keys()) alpha = 0.05 alph_str = str(alpha).split(".")[1] alpha_formatted = f".{alph_str}" roc_symbol = "*" +palette = [Colors.green2, Colors.blue, Colors.yellow, Colors.violet, Colors.darksalmon] n_boot = 100 -kwargs = dict() f, ax = plt.subplots(1, 1, figsize=(8, 8)) -f, ax, barplot, bars, aucs_cis = average_roc_curves(cv_results, +kwargs = dict() + +f, ax, barplot, bars, all_data = average_roc_curves(cv_results, colors=palette, model_keys=model_names, show_ci=True, n_boot=n_boot, bars_pos=[ - 0.5, 0.01, 0.4, 0.1*len(model_names)], - random_state=settings.misc.seed, + 0.3, 0.01, 0.6, 0.075*len(model_names)], + random_state=cv_config.get("SEED"), ax=ax, **kwargs) + +roc_comparisons_results = roc_comparisons(cv_results, "EBM") + +kwargs = dict(space_between_whiskers = 0.07) +order = [ + ("EBM", "RF"), + ("EBM", "SVC"), + ("EBM", "LR"), + ("EBM", "KNN") +] +ax_annot = add_annotations(roc_comparisons_results, + alpha = 0.05, + bars=bars, + direction = "vertical", + order = order, + symbol = roc_symbol, + symbol_fontsize = 30, + voffset = -0.05, + ext_voffset=0, + ext_hoffset=0, + ax=barplot, + **kwargs) ``` +This produces the following plot: +

+Average ROC plot +

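The whisker annotations in the figure above are driven by `roc_comparisons_results`, a plain dictionary whose keys have the form `"<first>_<second>"` and whose values are `(first_model, second_model, p_value)` tuples, as documented for `roc_single_comparison` in the second commit of this PR. A minimal sketch of turning it into a textual report:

```python
# Print the pairwise DeLong p-values behind the annotations above.
for key, (algo_a, algo_b, p_value) in roc_comparisons_results.items():
    print(f"{algo_a} vs {algo_b}: P = {p_value:.3f}")
```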
+
+
+### Assess calibration of the ExplainableBoostingClassifier across cross-validation folds
+We will now compute the median Brier score (95% CI) of the ExplainableBoostingClassifier and use it to annotate the calibration plot.
+```python
+from sklearn.metrics import brier_score_loss
+from modelsight.calibration import hosmer_lemeshow_plot
+
+briers = []
+for gt, preds in zip(cv_results["EBM"].gts_val, cv_results["EBM"].probas_val):
+    brier = brier_score_loss(gt, preds)
+    briers.append(brier)
+
+brier_low, brier_med, brier_up = np.percentile(briers, [2.5, 50, 97.5])
+
+brier_annot = f"{brier_med:.2f} ({brier_low:.2f} - {brier_up:.2f})"
+
+f, ax = plt.subplots(1, 1, figsize=(11,6))
+
+f, ax = hosmer_lemeshow_plot(cv_results["EBM"].gts_val_conc,
+                             cv_results["EBM"].probas_val_conc,
+                             n_bins=10,
+                             colors=(Colors.darksalmon, Colors.green2),
+                             annotate_bars=True,
+                             title="",
+                             brier_score_annot=brier_annot,
+                             ax=ax
+                             )
+```
+This produces the following plot:
+

+Hosmer-Lemeshow calibration plot
+

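Under the hood, `hosmer_lemeshow_plot` builds its bars from `hosmer_lemeshow_table`, which groups observed and predicted probabilities into bins (see the docstring changes in the second commit below). If you want the underlying numbers rather than the figure, here is a minimal sketch; note that the import path is assumed from this PR's file layout (`modelsight/calibration/calib.py`) and may differ if the package re-exports the helper:

```python
# Inspect the binned observed vs. predicted frequencies behind the plot above.
# Import path assumed from the PR's file layout; adjust if the helper is
# re-exported elsewhere in the package.
from modelsight.calibration.calib import hosmer_lemeshow_table

table = hosmer_lemeshow_table(cv_results["EBM"].gts_val_conc,
                              cv_results["EBM"].probas_val_conc,
                              n_bins=10)
print(table.dtype.names)  # recarray columns
print(table)
```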
+ ## :gift_heart: Contributing From 9fc34fd5276d33c920653b847d112dc03c9ba92d Mon Sep 17 00:00:00 2001 From: Francesco Pisu Date: Sat, 23 Sep 2023 19:00:42 +0200 Subject: [PATCH 2/2] docs: improve docstring docs #13 --- src/modelsight/_typing.py | 36 +++- src/modelsight/calibration/calib.py | 42 +++-- src/modelsight/curves/_delong.py | 259 ++++++++++++++++------------ src/modelsight/curves/compare.py | 245 ++++++++++++++++++++++---- 4 files changed, 429 insertions(+), 153 deletions(-) diff --git a/src/modelsight/_typing.py b/src/modelsight/_typing.py index e9ef2e4..a87c1ba 100644 --- a/src/modelsight/_typing.py +++ b/src/modelsight/_typing.py @@ -1,3 +1,7 @@ +""" +This file deals with the implementation of custom types. +""" + import sys import random import numpy as np @@ -24,7 +28,37 @@ @dataclass -class CVModellingOutput: +class CVModellingOutput: + """This class stores the data generated by a cross-validation + process for a single estimator. + + Arguments + --------- + gts_train: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing training ground-truth. + gts_val: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing validation ground-truth. + gts_train_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled training ground-truth. + gts_val_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled validation ground-truth. + probas_train: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing training predicted probabilities. + probas_val: ArrayLike + A (n_repetitions * n_outer_splits) list of arrays representing validation predicted probabilities. + probas_train_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled training predicted probabilities. + probas_val_conc: ArrayLike + A list of (n_repetitions * n_outer_splits) data points representing pooled validation predicted probabilities. + models: List[Estimator] + A list of (n_repetitions * n_outer_splits) fitted estimators. + errors: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of validation prediction errors. + correct: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of validation correct predictions. + features: Optional[ArrayLike] + A (n_repetitions * n_outer_splits) list of subsets of selected features. + """ gts_train: ArrayLike gts_val: ArrayLike gts_train_conc: ArrayLike diff --git a/src/modelsight/calibration/calib.py b/src/modelsight/calibration/calib.py index ba9cbd4..37f9a60 100644 --- a/src/modelsight/calibration/calib.py +++ b/src/modelsight/calibration/calib.py @@ -1,3 +1,8 @@ +""" +This file deals with the implementation of the Hosmer-Lemeshow plot for the +assessment of calibration of predicted probabilities. +""" + import numpy as np from typing import Tuple import matplotlib.pyplot as plt @@ -6,11 +11,14 @@ def ntile_name(n: int) -> str: - """Returns the ntile name corresponding to an ntile integer. + """ + Returns the ntile name corresponding to an ntile integer. + Parameters ---------- n : int An ntile integer. + Returns ------- ntile_name : str @@ -30,13 +38,16 @@ def ntile_name(n: int) -> str: def make_recarray(y_true: ArrayLike, y_pred: ArrayLike) -> np.recarray: - """Combines arrays into a recarray. + """ + Combines arrays into a recarray. + Parameters ---------- y_true : array Observed labels, either 0 or 1. y_pred : array Predicted probabilities, floats on [0, 1]. 
+ Returns ------- table : recarray @@ -53,7 +64,9 @@ def make_recarray(y_true: ArrayLike, def hosmer_lemeshow_table(y_true: ArrayLike, y_pred: ArrayLike, n_bins: int = 10) -> np.recarray: - """Constructs a Hosmer–Lemeshow table. + """ + Constructs a Hosmer–Lemeshow table. + Parameters ---------- y_true : array @@ -63,6 +76,7 @@ def hosmer_lemeshow_table(y_true: ArrayLike, n_bins : int, optional The number of groups to create. The default value is 10, which corresponds to deciles of predicted probabilities. + Returns ------- table : recarray @@ -100,26 +114,28 @@ def hosmer_lemeshow_plot(y_true: ArrayLike, Parameters ---------- - y_true: ArrayLike + y_true : ArrayLike (n_obs,) shaped array of ground-truth values - y_pred: ArrayLike + y_pred : ArrayLike (n_obs,) shaped array of predicted probabilities - n_bins: int + n_bins : int Number of bins to group observed and predicted probabilities into - colors: Tuple[str, str] + colors : Tuple[str, str] Pair of colors for observed (line) and predicted (vertical bars) probabilities. - annotate_bars: bool + annotate_bars : bool Whether bars should be annotated with the number of observed probabilities in each bin. - title: str + title : str Title to display on top of the calibration plot. - brier_score_annot: str + brier_score_annot : str Optional brier score (95% CI) annotation on the top-left corner. - ax: plt.Axes + ax : plt.Axes A matplotlib Axes object to draw the calibration plot into. If None, an Axes object is created by default. + Returns ------- - Tuple[plt.Figure, plt.Axes]: - Corresponding figure and Axes + f, ax : Tuple[plt.Figure, plt.Axes] + f: pyplot figure + ax: pyplot Axes """ table = hosmer_lemeshow_table(y_true, y_pred, n_bins) # transform observed and predicted frequencies in percentage relative to the bin dimension diff --git a/src/modelsight/curves/_delong.py b/src/modelsight/curves/_delong.py index 77dd1f3..bdffa34 100644 --- a/src/modelsight/curves/_delong.py +++ b/src/modelsight/curves/_delong.py @@ -1,124 +1,169 @@ +""" +This file deals with the implementation of the DeLong test for the comparison of +pairs of correlated areas under the receiver-operating characteristics curves. +""" + import pandas as pd import numpy as np import scipy.stats +from typing import Tuple # AUC comparison adapted from # https://github.com/Netflix/vmaf/ -def compute_midrank(x): - """Computes midranks. - Args: - x - a 1D numpy array - Returns: - array of midranks - """ - J = np.argsort(x) - Z = x[J] - N = len(x) - T = np.zeros(N, dtype=np.float64) - i = 0 - while i < N: - j = i - while j < N and Z[j] == Z[i]: - j += 1 - T[i:j] = 0.5*(i + j - 1) - i = j - T2 = np.empty(N, dtype=np.float64) - # Note(kazeevn) +1 is due to Python using 0-based indexing - # instead of 1-based in the AUC formula in the paper - T2[J] = T + 1 - return T2 +def compute_midrank(x: np.ndarray) -> np.ndarray: + """ + Computes midranks. + + Parameters + ---------- + x : np.ndarray + a 1-d array of predicted probabilities. 
+ + Returns + ------- + T2 : np.ndarray + array of midranks + """ + J = np.argsort(x) + Z = x[J] + N = len(x) + T = np.zeros(N, dtype=np.float64) + i = 0 + while i < N: + j = i + while j < N and Z[j] == Z[i]: + j += 1 + T[i:j] = 0.5*(i + j - 1) + i = j + T2 = np.empty(N, dtype=np.float64) + # Note(kazeevn) +1 is due to Python using 0-based indexing + # instead of 1-based in the AUC formula in the paper + T2[J] = T + 1 + return T2 -def fastDeLong(predictions_sorted_transposed, label_1_count): - """ - The fast version of DeLong's method for computing the covariance of - unadjusted AUC. - Args: - predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples] - sorted such as the examples with label "1" are first - Returns: - (AUC value, DeLong covariance) - Reference: - @article{sun2014fast, - title={Fast Implementation of DeLong's Algorithm for - Comparing the Areas Under Correlated Receiver Operating Characteristic Curves}, - author={Xu Sun and Weichao Xu}, - journal={IEEE Signal Processing Letters}, - volume={21}, - number={11}, - pages={1389--1393}, - year={2014}, - publisher={IEEE} - } - """ - # Short variables are named as they are in the paper - m = label_1_count - n = predictions_sorted_transposed.shape[1] - m - positive_examples = predictions_sorted_transposed[:, :m] - negative_examples = predictions_sorted_transposed[:, m:] - k = predictions_sorted_transposed.shape[0] +def fastDeLong(predictions_sorted_transposed: np.ndarray, + label_1_count: int) -> Tuple[np.ndarray, np.ndarray]: + """ + The fast version of DeLong's method for computing the covariance of + unadjusted AUC. + + Parameters + ---------- + predictions_sorted_transposed : a (n_classifiers, n_obs) numpy array containing + the predicted probabilities by the two classifiers in the comparison. + These probabilities are sorted such that the examples with label "1" come first. 
+ + Returns + ------- + aucs, delongcov : Tuple[np.ndarray, np.ndarray] + aucs: array of AUC values + delongcov: array of DeLong covariance + + Reference + --------- + @article{sun2014fast, + title={Fast Implementation of DeLong's Algorithm for + Comparing the Areas Under Correlated Receiver Operating Characteristic Curves}, + author={Xu Sun and Weichao Xu}, + journal={IEEE Signal Processing Letters}, + volume={21}, + number={11}, + pages={1389--1393}, + year={2014}, + publisher={IEEE} + } + """ + # Short variables are named as they are in the paper + m = label_1_count + n = predictions_sorted_transposed.shape[1] - m + positive_examples = predictions_sorted_transposed[:, :m] + negative_examples = predictions_sorted_transposed[:, m:] + k = predictions_sorted_transposed.shape[0] - tx = np.empty([k, m], dtype=np.float64) - ty = np.empty([k, n], dtype=np.float64) - tz = np.empty([k, m + n], dtype=np.float64) - for r in range(k): - tx[r, :] = compute_midrank(positive_examples[r, :]) - ty[r, :] = compute_midrank(negative_examples[r, :]) - tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :]) - aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n - v01 = (tz[:, :m] - tx[:, :]) / n - v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m - sx = np.cov(v01) - sy = np.cov(v10) - delongcov = sx / m + sy / n - return aucs, delongcov + tx = np.empty([k, m], dtype=np.float64) + ty = np.empty([k, n], dtype=np.float64) + tz = np.empty([k, m + n], dtype=np.float64) + for r in range(k): + tx[r, :] = compute_midrank(positive_examples[r, :]) + ty[r, :] = compute_midrank(negative_examples[r, :]) + tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :]) + aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n + v01 = (tz[:, :m] - tx[:, :]) / n + v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m + sx = np.cov(v01) + sy = np.cov(v10) + delongcov = sx / m + sy / n + return aucs, delongcov -def calc_pvalue(aucs, sigma): - """Computes log(10) of p-values. - Args: - aucs: 1D array of AUCs - sigma: AUC DeLong covariances - Returns: - log10(pvalue) - """ - l = np.array([[1, -1]]) - z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T)) - return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10) +def calc_pvalue(aucs: np.ndarray, sigma: np.ndarray) -> float: + """ + Computes log(10) of p-values. + + Parameters + ---------- + aucs : np.array + a 1-d array of AUCs + sigma : np.array + an array AUC DeLong covariances + + Returns + ------- + p : float + log10(pvalue) + """ + l = np.array([[1, -1]]) + z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T)) + p = np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10) + return p -def compute_ground_truth_statistics(ground_truth): - assert np.array_equal(np.unique(ground_truth), [0, 1]) - order = (-ground_truth).argsort() - label_1_count = int(ground_truth.sum()) - return order, label_1_count +def compute_ground_truth_statistics(ground_truth: np.ndarray) -> Tuple[np.ndarray, int]: + """ + Compute statistics of ground-truth array. + Parameters + ---------- + ground_truth : np.ndarray + a (n_obs,) array of 0 and 1 values representing the ground-truth. 
-def delong_roc_variance(ground_truth, predictions): - """ - Computes ROC AUC variance for a single set of predictions - Args: - ground_truth: np.array of 0 and 1 - predictions: np.array of floats of the probability of being class 1 - """ - order, label_1_count = compute_ground_truth_statistics(ground_truth) - predictions_sorted_transposed = predictions[np.newaxis, order] - aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count) - assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers" - return aucs[0], delongcov + Returns + ------- + order, label_1_count : Tuple[np.ndarray, int] + order is a numpy array of sorted indexes + label_1_count is the count of data points of the positive class. + """ + assert np.array_equal(np.unique(ground_truth), [0, 1]) + order = (-ground_truth).argsort() + label_1_count = int(ground_truth.sum()) + return order, label_1_count -def delong_roc_test(ground_truth, predictions_one, predictions_two): - """ - Computes log(p-value) for hypothesis that two ROC AUCs are different - Args: - ground_truth: np.array of 0 and 1 - predictions_one: predictions of the first model, - np.array of floats of the probability of being class 1 - predictions_two: predictions of the second model, - np.array of floats of the probability of being class 1 - """ - order, label_1_count = compute_ground_truth_statistics(ground_truth) - predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order] - aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count) - return 10**calc_pvalue(aucs, delongcov).item() \ No newline at end of file +def delong_roc_test(ground_truth: np.ndarray, + predictions_one: np.ndarray, + predictions_two: np.ndarray) -> float: + """ + Compare areas-under-curve of two estimators using the DeLong test. + Concretely, it computes the pvalue for hypothesis that two ROC AUCs are different. + + Parameters + ---------- + ground_truth : np.ndarray + a (n_obs,) array of 0 and 1 representing ground-truths. + predictions_one : np.ndarray + a (n_obs,) array of probabilities of class 1 predicted by the first model. + predictions_two : np.ndarray + a (n_obs,) array of probabilities of class 1 predicted by the second model. + + Returns + ------- + p : float + the p-value for hypothesis that two ROC AUCs are different. + """ + order, label_1_count = compute_ground_truth_statistics(ground_truth) + predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order] + aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count) + + p = 10**calc_pvalue(aucs, delongcov).item() + return p \ No newline at end of file diff --git a/src/modelsight/curves/compare.py b/src/modelsight/curves/compare.py index 4d748ec..aa298b8 100644 --- a/src/modelsight/curves/compare.py +++ b/src/modelsight/curves/compare.py @@ -1,37 +1,84 @@ +""" +This file deals with the implementation of functions that allow annotating plots +with statistical tests results between pairs of estimators. 
+""" + from typing import Callable, Dict, Tuple, List import matplotlib from matplotlib import patches import matplotlib.pyplot as plt -from scipy.stats import ttest_ind -from sklearn.metrics import average_precision_score from src.modelsight.curves._delong import delong_roc_test +from src.modelsight._typing import CVModellingOutput -def annot_stat_vertical(text, x, y1, y2, ww, - col='k', - fontsize=13, - voffset = 0, - n_elems = None, +def annot_stat_vertical(text:str, + x: float, + y1: float, y2: float, + ww: float = 0.02, + col: str = 'black', + fontsize: int = 13, + voffset: float = 0, + n_elems: int = None, ax=None, **kwargs): """ - ww: float - whisker width + Draw a vertical whisker at position `x` that spans through `y1` to `y2` with annotation specified + by `text`. + + Parameters + ---------- + text : str + Annotation for whisker. + x : float + x-position the whisker is positioned at. + y1 :float + starting y position. + y2 : float + ending y position. + ww : float, optional + whisker width, by default 0.02 + col : str, optional + whisker color, by default 'black' + fontsize : int, optional + fontsize for the annotation, by default 13 + voffset : float, optional + vertical offset for the annotation, by default 0. + Some font families and characters occupy different vertical spaces; + this parameter allows compensating for such variations. + n_elems : int, optional + number of discrete elements in the y-axis, by default None. + This value is precomputed by the caller (add_annotations) and passed + to this function as input. + ax : plt.Axes, optional + a pyplot Axes to draw annotations on, by default None + **kwargs + rect_h_base: float, optional + base height of rectangle patch for single-character annotations, by default 0.1 + fontsize_nonsignif, optional + fontsize for multi-character annotations (here called non significant annotations + to reflect the fact that single-character annotations most often use some kind + of symbol to denote statistical significance, e.g. *), by default `fontsize` (i.e., 13) """ ax = plt.gca() if ax is None else ax # we want the text to be centered on the whisker text_x_pos = x + ww - #+ 0.01 + text_y_pos = (y1+y2)/2 # draw whisker from y1 to y2 with width `ww` ax.plot([x, x + ww, x + ww, x], [y1, y1, y2, y2], lw=1, c=col) - if len(text) == 1: - #text_y_pos = (y1+y2)/2 - - # draw text at (text_x_pos, text_y_pos) # + 0.15 + # this is the case of a whisker being annotated with a single character. + # by default, symbols do not enforce a white background, hence when + # superimposed on whiskers the readibility is limited. + # here we enforce a white rectangle patch beneath the symbol to enhance + # readibility of annotations. + # the built-in bbox parameter of pyplot's .text() doesn't produce + # acceptable results, hence we came up with a custom implementation for + # single-character annotations. + if len(text) == 1: + # draw text at (text_x_pos, (text_y_pos - voffset) + 0.17) ax.text( text_x_pos, (text_y_pos - voffset) + 0.17, text, ha='center', va='center', color=col, @@ -61,6 +108,9 @@ def annot_stat_vertical(text, x, y1, y2, ww, ax.add_patch(rect) else: + # this is the case of multi-character annotations. + # here, we leverage the built-in bbox of pyplot's text method + # that allows drawing a bounding box beneath the annotation. 
fontsize_nonsignif = kwargs.pop("fontsize_nonsignif", fontsize) ax.text( text_x_pos, text_y_pos, text, @@ -73,15 +123,52 @@ def annot_stat_vertical(text, x, y1, y2, ww, ) ) -from matplotlib import patches -def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13, - voffset = 0, - n_elems = None, - ax=None, - **kwargs): + +def annot_stat_horizontal(text: str, + x1: float, x2: float, + y: float, + wh: float = 0.02, + col: str = "black", + fontsize: int = 13, + voffset: float = 0, + n_elems:int = None, + ax: plt.Axes = None, + **kwargs): """ - ww: float - whisker width + Draw an horizontal whisker at position `y` that spans through `x1` to `x2` with annotation specified + by `text`. + + Parameters + ---------- + text : str + Annotation for whisker. + x1 : float + starting x position. + x2 :float + ending x position. + y : float + y-position the whisker is positioned at. + wh : float, optional + whisker height, by default 0.02 + col : str, optional + whisker color, by default 'black' + fontsize : int, optional + fontsize for the annotation, by default 13 + voffset : float, optional + vertical offset for the annotation, by default 0. + Some font families and characters occupy different vertical spaces; + this parameter allows compensating for such variations. + n_elems : int, optional + number of discrete elements in the y-axis, by default None. + This value is precomputed by the caller (add_annotations) and passed + to this function as input. + ax : plt.Axes, optional + a pyplot Axes to draw annotations on, by default None + **kwargs + fontsize_nonsignif, optional + fontsize for multi-character annotations (here called non significant annotations + to reflect the fact that single-character annotations most often use some kind + of symbol to denote statistical significance, e.g. *), by default `fontsize` (i.e., 13) """ ax = plt.gca() if ax is None else ax @@ -93,10 +180,16 @@ def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13, # draw whisker from y1 to y2 with width `ww` ax.plot([x1, x1, x2, x2], [y, y + wh, y + wh, y], lw=1, c=col, clip_on=False) - - if len(text) == 1: - #text_y_pos = (y1+y2)/2 + # this is the case of a whisker being annotated with a single character. + # by default, symbols do not enforce a white background, hence when + # superimposed on whiskers the readibility is limited. + # here we enforce a white rectangle patch beneath the symbol to enhance + # readibility of annotations. + # the built-in bbox parameter of pyplot's .text() doesn't produce + # acceptable results, hence we came up with a custom implementation for + # single-character annotations. + if len(text) == 1: # draw text at (text_x_pos, text_y_pos) # + 0.15 ax.text( text_x_pos, text_y_pos + voffset, text, @@ -140,14 +233,12 @@ def annot_stat_horizontal(text, x1, x2, y, wh, col='k', fontsize=13, ) -from typing import Tuple, List, Dict - def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], alpha: float, bars: matplotlib.container.BarContainer, direction: str, order: List[Tuple[str, str]], - symbol: str, + symbol: str = "*", symbol_fontsize: int = 22, voffset: float = 0, ext_voffset: float = 0, @@ -155,6 +246,54 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], P_val_rounding: int = 2, ax: plt.Axes = None, **kwargs): + """ + Annotates the specified plot (`ax`) with the provided comparisons results either vertically or horizontally + depending on the value of `direction`. 
+ + Parameters + ---------- + comparisons : Dict[str, Tuple[str, str, float]] + The results of models comparisons. + alpha : float + The significance level used for formatting the P value of comparisons. + bars : matplotlib.container.BarContainer + A list of matplotlib's bars that is used to access the bar's width or height + when annotating horizontally and vertically, respectively. + direction : str + The direction for annotation. Possible values are "horizontal" and "vertical". + order : List[Tuple[str, str]] + The order in which the comparisons should be displayed. + Each entry of this list is a tuple where elements are algorithm's names. + symbol : str, optional + The symbol used in place of the P value when statistical significance is achieved + accoring to the specified alpha, by default "*". + symbol_fontsize : int, optional + Fontsize for the symbol used when statistical significance is achieved, by default 22 + voffset : float, optional + vertical offset for the annotation, by default 0., by default 0 + ext_voffset : float, optional + Additional vertical offset for vertical annotations. + Ignored when direction = "horizontal", by default 0 + ext_hoffset : float, optional + Additional horizontal offset for horizontal annotations. + Ignored when direction = "vertical", by default 0 + P_val_rounding : int, optional + Number of decimal places to round P values at, by default 2 + ax : plt.Axes, optional + The plot to be annotated, by default None + + Returns + ------- + ax : plt.Axes + The annotated plot. + + Raises + ------ + ValueError + When ax is None + ValueError + Whenever a comparison key doesn't exist. + """ if not ax: raise ValueError("I need an Axes to draw comparisons on.") @@ -188,7 +327,7 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], wh=0.02, col="black", fontsize=symbol_fontsize, - voffset = -0.02, + voffset = voffset, #-0.02 ext_offset = ext_hoffset, n_elems = len(entity_labels), ax=ax, @@ -219,16 +358,58 @@ def add_annotations(comparisons: Dict[str, Tuple[str, str, float]], return ax -def roc_single_comparison(cv_preds, fst_algo, snd_algo): +def roc_single_comparison(cv_preds: CVModellingOutput, + fst_algo: str, + snd_algo: str) -> Dict[str, Tuple[str, str, float]]: + """Perform a single comparison of two areas under Receiver Operating Characteristic curves + computed on the same set of data points by the DeLong test. + + Parameters + ---------- + cv_preds : CVModellingOutput + The output of a cross-validation process encompassing mulitple (n>=2) models. + fst_algo : str + The name of the first algorithm for the comparison. + Must be an existing key of `cv_preds`. + snd_algo : str + The name of the second algorithm for the comparison. + Must be an existing key of `cv_preds`. + + Returns + ------- + comparison_result : Dict[str, Tuple[str, str, float]] + The output of the comparison. This is a dictionary where the key is + of the form "_" and the value is a tuple of three + elements, the first two are the names of the algorithms being compared + and the third element is the P value for the null hypothesis that + the two AUC values are equal. 
+    """
    ground_truths = cv_preds[fst_algo].gts_val_conc
    fst_algo_probas = cv_preds[fst_algo].probas_val_conc
    snd_algo_probas = cv_preds[snd_algo].probas_val_conc
    P = delong_roc_test(ground_truths, fst_algo_probas, snd_algo_probas)
    cmp_key = f"{fst_algo}_{snd_algo}"
-    return {cmp_key: (fst_algo, snd_algo, P)}
+    comparison_result = {cmp_key: (fst_algo, snd_algo, P)}
+    return comparison_result

-def roc_comparisons(cv_preds, target_algo):
+def roc_comparisons(cv_preds: Dict[str, CVModellingOutput],
+                    target_algo: str):
+    """
+    Compares the AUC of the specified algorithm with the AUCs of all other algorithms.
+
+    Parameters
+    ----------
+    cv_preds : Dict[str, CVModellingOutput]
+        The output of a cross-validation process encompassing multiple (n>=2) models.
+    target_algo : str
+        The name of the target algorithm whose AUC will be compared with all other AUCs.
+
+    Returns
+    -------
+    comparisons : Dict[str, Tuple[str, str, float]]
+        A dictionary containing the results of all comparisons. See output of `roc_single_comparison`.
+    """
    comparisons = dict()
    for algo_name in cv_preds.keys():