propagate categorical params to the boosting code
paulbkoch committed Dec 22, 2024
1 parent f7e26ee commit 57af6a8
Showing 11 changed files with 298 additions and 46 deletions.
3 changes: 3 additions & 0 deletions R/src/interpret_R.cpp
@@ -835,6 +835,9 @@ SEXP GenerateTermUpdate_R(
0,
0,
0,
+ 10.0, // categoricalSmoothing
+ 32,   // maxCategoricalThreshold
+ 1.0,  // categoricalInclusionPercent
aLeavesMax,
nullptr,
&avgGain
7 changes: 5 additions & 2 deletions python/interpret-core/interpret/develop.py
@@ -13,8 +13,11 @@
"n_intercept_rounds_initial": 25,
"n_intercept_rounds_final": 100,
"intercept_learning_rate": 0.25,
"missing_lossguide_continuous": False,
"missing_lossguide_nominal": False,
"cat_l2": 0.0, # TODO: change to 10.0 (see Lightgbm cat_l2)
"min_samples_leaf_nominal": None, # TODO: LightGBM uses min_data_per_group = 100
"cat_smooth": 10.0,
"max_cat_threshold": 32,
"cat_include": 0.75,
"purify_boosting": False,
"purify_result": False,
"randomize_initial_feature_order": True,
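The five new entries are developer-level knobs rather than constructor parameters. A minimal sketch of adjusting them at runtime, assuming interpret.develop exposes set_option as the counterpart of the get_option calls used in _boost.py (only the get_option side is visible in this diff):

# Minimal sketch: overriding the new categorical developer options.
# Assumes interpret.develop provides set_option alongside get_option.
from interpret import develop

develop.set_option("cat_l2", 10.0)  # extra L2 applied to nominal terms
develop.set_option("max_cat_threshold", 64)  # allow more categories per round
print(develop.get_option("cat_smooth"))  # -> 10.0, the shipped default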
68 changes: 42 additions & 26 deletions python/interpret-core/interpret/glassbox/_ebm/_boost.py
@@ -28,6 +28,7 @@ def boost(
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
max_leaves,
monotone_constraints,
greedy_ratio,
@@ -74,9 +75,12 @@
learning_rate=intercept_learning_rate,
min_samples_leaf=0,
min_hessian=0.0,
- reg_alpha=0.0,
- reg_lambda=0.0,
+ reg_alpha=reg_alpha,
+ reg_lambda=reg_lambda,
max_delta_step=0.0,
+ cat_smooth=develop.get_option("cat_smooth"),
+ max_cat_threshold=develop.get_option("max_cat_threshold"),
+ cat_include=develop.get_option("cat_include"),
max_leaves=1,
monotone_constraints=None,
)
@@ -106,7 +110,6 @@
random_cyclic_ordering = np.arange(len(term_features), dtype=np.int64)

while step_idx < max_steps:
- term_boost_flags_local = term_boost_flags
if state_idx >= 0:
# cyclic
if state_idx == 0:
@@ -124,27 +127,6 @@

term_idx = random_cyclic_ordering[state_idx]

- contains_nominals = any(
-     nominals[i] for i in term_features[term_idx]
- )
-
- if contains_nominals:
-     if develop.get_option("missing_lossguide_nominal"):
-         term_boost_flags_local |= (
-             Native.TermBoostFlags_MissingLossguide
-         )
- else:
-     if develop.get_option("missing_lossguide_continuous"):
-         term_boost_flags_local |= (
-             Native.TermBoostFlags_MissingLossguide
-         )
-
- if smoothing_rounds > 0 and (
-     nominal_smoothing or not contains_nominals
- ):
-     # modify some of our parameters temporarily
-     term_boost_flags_local |= Native.TermBoostFlags_RandomSplits

make_progress = False
if cyclic_state >= 1.0 or smoothing_rounds > 0:
# if cyclic_state is above 1.0 we make progress
@@ -156,6 +138,37 @@
step_idx += 1
_, _, term_idx = heapq.heappop(heap)

+ contains_nominals = any(nominals[i] for i in term_features[term_idx])
+
+ term_boost_flags_local = term_boost_flags
+ reg_lambda_local = reg_lambda
+ min_samples_leaf_local = min_samples_leaf
+ if contains_nominals:
+     reg_lambda_local += develop.get_option("cat_l2")
+
+     if develop.get_option("min_samples_leaf_nominal") is not None:
+         min_samples_leaf_local = develop.get_option(
+             "min_samples_leaf_nominal"
+         )
+
+ if missing == "low":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingLow
+ elif missing == "high":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingHigh
+ elif missing == "separate":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingSeparate
+ elif missing == "drop":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingDrop
+ elif missing != "gain":
+     msg = f"Unrecognized missing option {missing}."
+     raise Exception(msg)
+
+ if smoothing_rounds > 0 and (
+     nominal_smoothing or not contains_nominals
+ ):
+     # modify some of our parameters temporarily
+     term_boost_flags_local |= Native.TermBoostFlags_RandomSplits
+
if bestkey is None or state_idx >= 0:
term_monotone = None
if monotone_constraints is not None:
@@ -169,11 +182,14 @@
term_idx=term_idx,
term_boost_flags=term_boost_flags_local,
learning_rate=learning_rate,
- min_samples_leaf=min_samples_leaf,
+ min_samples_leaf=min_samples_leaf_local,
min_hessian=min_hessian,
reg_alpha=reg_alpha,
- reg_lambda=reg_lambda,
+ reg_lambda=reg_lambda_local,
max_delta_step=max_delta_step,
+ cat_smooth=develop.get_option("cat_smooth"),
+ max_cat_threshold=develop.get_option("max_cat_threshold"),
+ cat_include=develop.get_option("cat_include"),
max_leaves=max_leaves,
monotone_constraints=term_monotone,
)
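The diff only threads cat_smooth through to the native booster, so its exact math lives in C++. As a rough, hypothetical illustration of what a LightGBM-style smoothing pseudo-count does when ordering categories (an assumption suggested by the LightGBM references in develop.py, not a reading of the native code):

# Hypothetical illustration of LightGBM-style category ordering with a
# smoothing pseudo-count; the actual native implementation may differ.
def order_categories(grad_sums, hess_sums, cat_smooth=10.0):
    """Order categories by smoothed gradient statistics.

    grad_sums/hess_sums map category -> summed gradient/hessian. cat_smooth
    acts as a pseudo-count that shrinks rarely seen categories toward zero
    so they sort less extremely.
    """
    score = {c: grad_sums[c] / (hess_sums[c] + cat_smooth) for c in grad_sums}
    return sorted(score, key=score.get)

# A category seen 1000 times keeps its signal; one seen twice is damped:
print(order_categories({"a": -50.0, "b": -1.9}, {"a": 1000.0, "b": 2.0}))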
45 changes: 45 additions & 0 deletions python/interpret-core/interpret/glassbox/_ebm/_ebm.py
@@ -361,6 +361,7 @@ def __init__(
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
max_leaves,
monotone_constraints,
objective,
@@ -408,6 +409,7 @@ def __init__(
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.max_delta_step = max_delta_step
+ self.missing = missing

self.max_leaves = max_leaves
if not is_private(self):
@@ -936,6 +938,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
reg_alpha = 0.0
reg_lambda = 0.0
max_delta_step = 0.0
missing = "low"
interactions = 0
monotone_constraints = None
n_intercept_rounds = 0
@@ -956,6 +959,7 @@
reg_alpha = self.reg_alpha
reg_lambda = self.reg_lambda
max_delta_step = self.max_delta_step
+ missing = self.missing
interactions = self.interactions
monotone_constraints = self.monotone_constraints
n_intercept_rounds = develop.get_option("n_intercept_rounds_initial")
@@ -1072,6 +1076,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
self.max_leaves,
monotone_constraints,
greedy_ratio,
@@ -1344,6 +1349,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
self.max_leaves,
monotone_constraints,
greedy_ratio,
@@ -1468,6 +1474,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
0.0,
0.0,
0.0,
+ missing,
1,
None,
greedy_ratio,
@@ -2764,6 +2771,22 @@ class ExplainableBoostingClassifier(ClassifierMixin, EBMModel):
L2 regularization.
max_delta_step : float, default=0.0
Used to limit the max output of tree leaves. <=0.0 means no constraint.
+ missing : str, default="low"
+     Method for handling missing values during boosting. The placement of the missing value bin can influence
+     the resulting model graphs. For example, placing the bin on the "low" side may cause missing values to
+     affect lower bins, and vice versa. This parameter does not affect the final placement
+     of the missing bin in the model (the missing bin will remain at index 0 in the term_scores\_ attribute).
+     Possible values for missing are:
+     - `'low'`: Place the missing bin on the left side of the graphs.
+     - `'high'`: Place the missing bin on the right side of the graphs.
+     - `'separate'`: Place the missing bin in its own leaf during each boosting step,
+       effectively making it location-agnostic. This can lead to overfitting, especially
+       when the proportion of missing values is small.
+     - `'drop'`: Ignore the contribution of the missing bin, or split the feature into two leaves based on gain:
+       one for missing values and one for non-missing values.
+     - `'gain'`: Choose the best leaf for the missing value contribution at each boosting step, based on gain.
max_leaves : int, default=3
Maximum number of leaves allowed in each tree.
monotone_constraints: list of int, default=None
@@ -2923,6 +2946,7 @@ def __init__(
reg_alpha: Optional[float] = 0.0,
reg_lambda: Optional[float] = 0.0,
max_delta_step: Optional[float] = 0.0,
+ missing: str = "low",
max_leaves: int = 3,
monotone_constraints: Optional[Sequence[int]] = None,
objective: str = "log_loss",
@@ -2953,6 +2977,7 @@
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
max_delta_step=max_delta_step,
+ missing=missing,
max_leaves=max_leaves,
monotone_constraints=monotone_constraints,
objective=objective,
@@ -3120,6 +3145,22 @@ class ExplainableBoostingRegressor(RegressorMixin, EBMModel):
L2 regularization.
max_delta_step : float, default=0.0
Used to limit the max output of tree leaves. <=0.0 means no constraint.
+ missing : str, default="low"
+     Method for handling missing values during boosting. The placement of the missing value bin can influence
+     the resulting model graphs. For example, placing the bin on the "low" side may cause missing values to
+     affect lower bins, and vice versa. This parameter does not affect the final placement
+     of the missing bin in the model (the missing bin will remain at index 0 in the term_scores\_ attribute).
+     Possible values for missing are:
+     - `'low'`: Place the missing bin on the left side of the graphs.
+     - `'high'`: Place the missing bin on the right side of the graphs.
+     - `'separate'`: Place the missing bin in its own leaf during each boosting step,
+       effectively making it location-agnostic. This can lead to overfitting, especially
+       when the proportion of missing values is small.
+     - `'drop'`: Ignore the contribution of the missing bin, or split the feature into two leaves based on gain:
+       one for missing values and one for non-missing values.
+     - `'gain'`: Choose the best leaf for the missing value contribution at each boosting step, based on gain.
max_leaves : int, default=2
Maximum number of leaves allowed in each tree.
monotone_constraints: list of int, default=None
@@ -3279,6 +3320,7 @@ def __init__(
reg_alpha: Optional[float] = 0.0,
reg_lambda: Optional[float] = 0.0,
max_delta_step: Optional[float] = 0.0,
+ missing: str = "low",
max_leaves: int = 2,
monotone_constraints: Optional[Sequence[int]] = None,
objective: str = "rmse",
@@ -3309,6 +3351,7 @@
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
max_delta_step=max_delta_step,
+ missing=missing,
max_leaves=max_leaves,
monotone_constraints=monotone_constraints,
objective=objective,
@@ -3542,6 +3585,7 @@ def __init__(
reg_alpha=0.0,
reg_lambda=0.0,
max_delta_step=0.0,
+ missing=None,
max_leaves=max_leaves,
monotone_constraints=None,
objective="log_loss",
@@ -3820,6 +3864,7 @@ def __init__(
reg_alpha=0.0,
reg_lambda=0.0,
max_delta_step=0.0,
+ missing=None,
max_leaves=max_leaves,
monotone_constraints=None,
objective="rmse",
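With these constructor changes, missing becomes a user-facing parameter on both estimators. A short usage sketch against a build that includes this commit (the data here is synthetic filler):

# Usage sketch for the new `missing` parameter on a post-commit build.
import numpy as np
from interpret.glassbox import ExplainableBoostingClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
X[rng.random(200) < 0.1, 0] = np.nan  # inject missing values into column 0
y = (X[:, 1] > 0).astype(int)

# "separate" gives the missing bin its own leaf each boosting step; per the
# docstring above, this risks overfitting when missing values are rare.
ebm = ExplainableBoostingClassifier(missing="separate")
ebm.fit(X, y)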
28 changes: 23 additions & 5 deletions python/interpret-core/interpret/utils/_native.py
@@ -33,11 +33,14 @@ class Native:
TermBoostFlags_PurifyGain = 0x00000001
TermBoostFlags_DisableNewtonGain = 0x00000002
TermBoostFlags_DisableCategorical = 0x00000004
- TermBoostFlags_MissingLossguide = 0x00000008
- TermBoostFlags_PurifyUpdate = 0x00000010
- TermBoostFlags_DisableNewtonUpdate = 0x00000020
- TermBoostFlags_GradientSums = 0x00000040
- TermBoostFlags_RandomSplits = 0x00000080
+ TermBoostFlags_PurifyUpdate = 0x00000008
+ TermBoostFlags_DisableNewtonUpdate = 0x00000010
+ TermBoostFlags_GradientSums = 0x00000020
+ TermBoostFlags_RandomSplits = 0x00000040
+ TermBoostFlags_MissingLow = 0x00000080
+ TermBoostFlags_MissingHigh = 0x00000100
+ TermBoostFlags_MissingSeparate = 0x00000200
+ TermBoostFlags_MissingDrop = 0x00000400

# CreateInteractionFlags
CreateInteractionFlags_Default = 0x00000000
@@ -1507,6 +1510,12 @@ def _initialize(self, is_debug):
ct.c_double,
# double maxDeltaStep
ct.c_double,
+ # double categoricalSmoothing
+ ct.c_double,
+ # int64_t maxCategoricalThreshold
+ ct.c_int64,
+ # double categoricalInclusionPercent
+ ct.c_double,
# int64_t * leavesMax
ct.c_void_p,
# MonotoneDirection * direction
@@ -1822,6 +1831,9 @@ def generate_term_update(
reg_alpha,
reg_lambda,
max_delta_step,
+ cat_smooth,
+ max_cat_threshold,
+ cat_include,
max_leaves,
monotone_constraints,
):
@@ -1837,6 +1849,9 @@
reg_alpha: L1 regularization.
reg_lambda: L2 regularization.
max_delta_step: Used to limit the max output of tree leaves. <=0.0 means no constraint.
+ cat_smooth: Smoothing parameter that determines the ordering of categories and which are included in each boosting round.
+ max_cat_threshold: Maximum number of categories to include in each boosting round.
+ cat_include: Percentage of categories to include in each boosting round.
max_leaves: Max leaf nodes on feature step.
monotone_constraints: monotone constraints (1=increasing, 0=none, -1=decreasing)
@@ -1881,6 +1896,9 @@
reg_alpha,
reg_lambda,
max_delta_step,
+ cat_smooth,
+ max_cat_threshold,
+ cat_include,
Native._make_pointer(max_leaves_arr, np.int64, is_null_allowed=True),
Native._make_pointer(monotone_constraints, np.int32, is_null_allowed=True),
ct.byref(avg_gain),
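Note that the pre-existing flag values were renumbered to make room, so any external code caching the old integers would silently pick up new meanings. For reference, a small sketch of how the string-valued missing option resolves to these bits, mirroring the branching added in _boost.py above; "gain" sets no bit, leaving the native code to pick the missing bin's placement by gain:

# Sketch: mapping the `missing` strings onto the renumbered TermBoostFlags
# bits; the hex values are copied from the class definition above.
MISSING_TO_FLAG = {
    "low": 0x00000080,       # TermBoostFlags_MissingLow
    "high": 0x00000100,      # TermBoostFlags_MissingHigh
    "separate": 0x00000200,  # TermBoostFlags_MissingSeparate
    "drop": 0x00000400,      # TermBoostFlags_MissingDrop
    "gain": 0x00000000,      # no flag set
}

def apply_missing_flag(term_boost_flags, missing):
    try:
        return term_boost_flags | MISSING_TO_FLAG[missing]
    except KeyError:
        raise Exception(f"Unrecognized missing option {missing}.") from None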
