propagate categorical params to the boosting code
paulbkoch committed Dec 22, 2024
1 parent f7e26ee commit 57af6a8
Showing 11 changed files with 298 additions and 46 deletions.
3 changes: 3 additions & 0 deletions R/src/interpret_R.cpp
@@ -835,6 +835,9 @@ SEXP GenerateTermUpdate_R(
0,
0,
0,
+ 10.0, // categoricalSmoothing
+ 32,   // maxCategoricalThreshold
+ 1.0,  // categoricalInclusionPercent
aLeavesMax,
nullptr,
&avgGain
7 changes: 5 additions & 2 deletions python/interpret-core/interpret/develop.py
@@ -13,8 +13,11 @@
"n_intercept_rounds_initial": 25,
"n_intercept_rounds_final": 100,
"intercept_learning_rate": 0.25,
"missing_lossguide_continuous": False,
"missing_lossguide_nominal": False,
"cat_l2": 0.0, # TODO: change to 10.0 (see Lightgbm cat_l2)
"min_samples_leaf_nominal": None, # TODO: LightGBM uses min_data_per_group = 100
"cat_smooth": 10.0,
"max_cat_threshold": 32,
"cat_include": 0.75,
"purify_boosting": False,
"purify_result": False,
"randomize_initial_feature_order": True,
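The five new entries are developer-level knobs rather than constructor parameters. A minimal sketch of adjusting them at runtime, assuming interpret.develop exposes set_option as the counterpart of the get_option calls used in _boost.py (only the get_option side is visible in this diff):

# Minimal sketch: overriding the new categorical developer options.
# Assumes interpret.develop provides set_option alongside get_option.
from interpret import develop

develop.set_option("cat_l2", 10.0)  # extra L2 applied to nominal terms
develop.set_option("max_cat_threshold", 64)  # allow more categories per round
print(develop.get_option("cat_smooth"))  # -> 10.0, the shipped default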
68 changes: 42 additions & 26 deletions python/interpret-core/interpret/glassbox/_ebm/_boost.py
@@ -28,6 +28,7 @@ def boost(
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
max_leaves,
monotone_constraints,
greedy_ratio,
@@ -74,9 +75,12 @@
learning_rate=intercept_learning_rate,
min_samples_leaf=0,
min_hessian=0.0,
- reg_alpha=0.0,
- reg_lambda=0.0,
+ reg_alpha=reg_alpha,
+ reg_lambda=reg_lambda,
max_delta_step=0.0,
+ cat_smooth=develop.get_option("cat_smooth"),
+ max_cat_threshold=develop.get_option("max_cat_threshold"),
+ cat_include=develop.get_option("cat_include"),
max_leaves=1,
monotone_constraints=None,
)
@@ -106,7 +110,6 @@
random_cyclic_ordering = np.arange(len(term_features), dtype=np.int64)

while step_idx < max_steps:
- term_boost_flags_local = term_boost_flags
if state_idx >= 0:
# cyclic
if state_idx == 0:
@@ -124,27 +127,6 @@

term_idx = random_cyclic_ordering[state_idx]

- contains_nominals = any(
-     nominals[i] for i in term_features[term_idx]
- )
-
- if contains_nominals:
-     if develop.get_option("missing_lossguide_nominal"):
-         term_boost_flags_local |= (
-             Native.TermBoostFlags_MissingLossguide
-         )
- else:
-     if develop.get_option("missing_lossguide_continuous"):
-         term_boost_flags_local |= (
-             Native.TermBoostFlags_MissingLossguide
-         )
-
- if smoothing_rounds > 0 and (
-     nominal_smoothing or not contains_nominals
- ):
-     # modify some of our parameters temporarily
-     term_boost_flags_local |= Native.TermBoostFlags_RandomSplits

make_progress = False
if cyclic_state >= 1.0 or smoothing_rounds > 0:
# if cyclic_state is above 1.0 we make progress
@@ -156,6 +138,37 @@
step_idx += 1
_, _, term_idx = heapq.heappop(heap)

+ contains_nominals = any(nominals[i] for i in term_features[term_idx])
+
+ term_boost_flags_local = term_boost_flags
+ reg_lambda_local = reg_lambda
+ min_samples_leaf_local = min_samples_leaf
+ if contains_nominals:
+     reg_lambda_local += develop.get_option("cat_l2")
+
+     if develop.get_option("min_samples_leaf_nominal") is not None:
+         min_samples_leaf_local = develop.get_option(
+             "min_samples_leaf_nominal"
+         )
+
+ if missing == "low":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingLow
+ elif missing == "high":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingHigh
+ elif missing == "separate":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingSeparate
+ elif missing == "drop":
+     term_boost_flags_local |= Native.TermBoostFlags_MissingDrop
+ elif missing != "gain":
+     msg = f"Unrecognized missing option {missing}."
+     raise Exception(msg)
+
+ if smoothing_rounds > 0 and (
+     nominal_smoothing or not contains_nominals
+ ):
+     # modify some of our parameters temporarily
+     term_boost_flags_local |= Native.TermBoostFlags_RandomSplits
+
if bestkey is None or state_idx >= 0:
term_monotone = None
if monotone_constraints is not None:
@@ -169,11 +182,14 @@
term_idx=term_idx,
term_boost_flags=term_boost_flags_local,
learning_rate=learning_rate,
- min_samples_leaf=min_samples_leaf,
+ min_samples_leaf=min_samples_leaf_local,
min_hessian=min_hessian,
reg_alpha=reg_alpha,
- reg_lambda=reg_lambda,
+ reg_lambda=reg_lambda_local,
max_delta_step=max_delta_step,
+ cat_smooth=develop.get_option("cat_smooth"),
+ max_cat_threshold=develop.get_option("max_cat_threshold"),
+ cat_include=develop.get_option("cat_include"),
max_leaves=max_leaves,
monotone_constraints=term_monotone,
)
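The diff only threads cat_smooth through to the native booster, so its exact math lives in C++. As a rough, hypothetical illustration of what a LightGBM-style smoothing pseudo-count does when ordering categories (an assumption suggested by the LightGBM references in develop.py, not a reading of the native code):

# Hypothetical illustration of LightGBM-style category ordering with a
# smoothing pseudo-count; the actual native implementation may differ.
def order_categories(grad_sums, hess_sums, cat_smooth=10.0):
    """Order categories by smoothed gradient statistics.

    grad_sums/hess_sums map category -> summed gradient/hessian. cat_smooth
    acts as a pseudo-count that shrinks rarely seen categories toward zero
    so they sort less extremely.
    """
    score = {c: grad_sums[c] / (hess_sums[c] + cat_smooth) for c in grad_sums}
    return sorted(score, key=score.get)

# A category seen 1000 times keeps its signal; one seen twice is damped:
print(order_categories({"a": -50.0, "b": -1.9}, {"a": 1000.0, "b": 2.0}))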
45 changes: 45 additions & 0 deletions python/interpret-core/interpret/glassbox/_ebm/_ebm.py
@@ -361,6 +361,7 @@ def __init__(
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
max_leaves,
monotone_constraints,
objective,
@@ -408,6 +409,7 @@ def __init__(
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.max_delta_step = max_delta_step
+ self.missing = missing

self.max_leaves = max_leaves
if not is_private(self):
@@ -936,6 +938,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
reg_alpha = 0.0
reg_lambda = 0.0
max_delta_step = 0.0
missing = "low"
interactions = 0
monotone_constraints = None
n_intercept_rounds = 0
@@ -956,6 +959,7 @@
reg_alpha = self.reg_alpha
reg_lambda = self.reg_lambda
max_delta_step = self.max_delta_step
+ missing = self.missing
interactions = self.interactions
monotone_constraints = self.monotone_constraints
n_intercept_rounds = develop.get_option("n_intercept_rounds_initial")
@@ -1072,6 +1076,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
self.max_leaves,
monotone_constraints,
greedy_ratio,
@@ -1344,6 +1349,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
reg_alpha,
reg_lambda,
max_delta_step,
+ missing,
self.max_leaves,
monotone_constraints,
greedy_ratio,
@@ -1468,6 +1474,7 @@ def fit(self, X, y, sample_weight=None, bags=None, init_score=None):
0.0,
0.0,
0.0,
+ missing,
1,
None,
greedy_ratio,
@@ -2764,6 +2771,22 @@ class ExplainableBoostingClassifier(ClassifierMixin, EBMModel):
L2 regularization.
max_delta_step : float, default=0.0
Used to limit the max output of tree leaves. <=0.0 means no constraint.
+ missing : str, default="low"
+     Method for handling missing values during boosting. The placement of the missing value bin can influence
+     the resulting model graphs. For example, placing the bin on the "low" side may cause missing values to
+     affect lower bins, and vice versa. This parameter does not affect the final placement
+     of the missing bin in the model (the missing bin will remain at index 0 in the term_scores\_ attribute).
+     Possible values for missing are:
+     - `'low'`: Place the missing bin on the left side of the graphs.
+     - `'high'`: Place the missing bin on the right side of the graphs.
+     - `'separate'`: Place the missing bin in its own leaf during each boosting step,
+       effectively making it location-agnostic. This can lead to overfitting, especially
+       when the proportion of missing values is small.
+     - `'drop'`: Ignore the contribution of the missing bin, or split the feature into two leaves based on gain:
+       one for missing values and one for non-missing values.
+     - `'gain'`: Choose the best leaf for the missing value contribution at each boosting step, based on gain.
max_leaves : int, default=3
Maximum number of leaves allowed in each tree.
monotone_constraints: list of int, default=None
@@ -2923,6 +2946,7 @@ def __init__(
reg_alpha: Optional[float] = 0.0,
reg_lambda: Optional[float] = 0.0,
max_delta_step: Optional[float] = 0.0,
+ missing: str = "low",
max_leaves: int = 3,
monotone_constraints: Optional[Sequence[int]] = None,
objective: str = "log_loss",
@@ -2953,6 +2977,7 @@
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
max_delta_step=max_delta_step,
+ missing=missing,
max_leaves=max_leaves,
monotone_constraints=monotone_constraints,
objective=objective,
@@ -3120,6 +3145,22 @@ class ExplainableBoostingRegressor(RegressorMixin, EBMModel):
L2 regularization.
max_delta_step : float, default=0.0
Used to limit the max output of tree leaves. <=0.0 means no constraint.
+ missing : str, default="low"
+     Method for handling missing values during boosting. The placement of the missing value bin can influence
+     the resulting model graphs. For example, placing the bin on the "low" side may cause missing values to
+     affect lower bins, and vice versa. This parameter does not affect the final placement
+     of the missing bin in the model (the missing bin will remain at index 0 in the term_scores\_ attribute).
+     Possible values for missing are:
+     - `'low'`: Place the missing bin on the left side of the graphs.
+     - `'high'`: Place the missing bin on the right side of the graphs.
+     - `'separate'`: Place the missing bin in its own leaf during each boosting step,
+       effectively making it location-agnostic. This can lead to overfitting, especially
+       when the proportion of missing values is small.
+     - `'drop'`: Ignore the contribution of the missing bin, or split the feature into two leaves based on gain:
+       one for missing values and one for non-missing values.
+     - `'gain'`: Choose the best leaf for the missing value contribution at each boosting step, based on gain.
max_leaves : int, default=2
Maximum number of leaves allowed in each tree.
monotone_constraints: list of int, default=None
@@ -3279,6 +3320,7 @@ def __init__(
reg_alpha: Optional[float] = 0.0,
reg_lambda: Optional[float] = 0.0,
max_delta_step: Optional[float] = 0.0,
+ missing: str = "low",
max_leaves: int = 2,
monotone_constraints: Optional[Sequence[int]] = None,
objective: str = "rmse",
@@ -3309,6 +3351,7 @@
reg_alpha=reg_alpha,
reg_lambda=reg_lambda,
max_delta_step=max_delta_step,
+ missing=missing,
max_leaves=max_leaves,
monotone_constraints=monotone_constraints,
objective=objective,
@@ -3542,6 +3585,7 @@ def __init__(
reg_alpha=0.0,
reg_lambda=0.0,
max_delta_step=0.0,
+ missing=None,
max_leaves=max_leaves,
monotone_constraints=None,
objective="log_loss",
@@ -3820,6 +3864,7 @@ def __init__(
reg_alpha=0.0,
reg_lambda=0.0,
max_delta_step=0.0,
+ missing=None,
max_leaves=max_leaves,
monotone_constraints=None,
objective="rmse",
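With these constructor changes, missing becomes a user-facing parameter on both estimators. A short usage sketch against a build that includes this commit (the data here is synthetic filler):

# Usage sketch for the new `missing` parameter on a post-commit build.
import numpy as np
from interpret.glassbox import ExplainableBoostingClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
X[rng.random(200) < 0.1, 0] = np.nan  # inject missing values into column 0
y = (X[:, 1] > 0).astype(int)

# "separate" gives the missing bin its own leaf each boosting step; per the
# docstring above, this risks overfitting when missing values are rare.
ebm = ExplainableBoostingClassifier(missing="separate")
ebm.fit(X, y)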
28 changes: 23 additions & 5 deletions python/interpret-core/interpret/utils/_native.py
@@ -33,11 +33,14 @@ class Native:
TermBoostFlags_PurifyGain = 0x00000001
TermBoostFlags_DisableNewtonGain = 0x00000002
TermBoostFlags_DisableCategorical = 0x00000004
- TermBoostFlags_MissingLossguide = 0x00000008
- TermBoostFlags_PurifyUpdate = 0x00000010
- TermBoostFlags_DisableNewtonUpdate = 0x00000020
- TermBoostFlags_GradientSums = 0x00000040
- TermBoostFlags_RandomSplits = 0x00000080
+ TermBoostFlags_PurifyUpdate = 0x00000008
+ TermBoostFlags_DisableNewtonUpdate = 0x00000010
+ TermBoostFlags_GradientSums = 0x00000020
+ TermBoostFlags_RandomSplits = 0x00000040
+ TermBoostFlags_MissingLow = 0x00000080
+ TermBoostFlags_MissingHigh = 0x00000100
+ TermBoostFlags_MissingSeparate = 0x00000200
+ TermBoostFlags_MissingDrop = 0x00000400

# CreateInteractionFlags
CreateInteractionFlags_Default = 0x00000000
@@ -1507,6 +1510,12 @@ def _initialize(self, is_debug):
ct.c_double,
# double maxDeltaStep
ct.c_double,
+ # double categoricalSmoothing
+ ct.c_double,
+ # int64_t maxCategoricalThreshold
+ ct.c_int64,
+ # double categoricalInclusionPercent
+ ct.c_double,
# int64_t * leavesMax
ct.c_void_p,
# MonotoneDirection * direction
@@ -1822,6 +1831,9 @@ def generate_term_update(
reg_alpha,
reg_lambda,
max_delta_step,
+ cat_smooth,
+ max_cat_threshold,
+ cat_include,
max_leaves,
monotone_constraints,
):
@@ -1837,6 +1849,9 @@
reg_alpha: L1 regularization.
reg_lambda: L2 regularization.
max_delta_step: Used to limit the max output of tree leaves. <=0.0 means no constraint.
+ cat_smooth: Smoothing parameter that determines the ordering of categories and which are included in each boosting round.
+ max_cat_threshold: Maximum number of categories to include in each boosting round.
+ cat_include: Percentage of categories to include in each boosting round.
max_leaves: Max leaf nodes on feature step.
monotone_constraints: monotone constraints (1=increasing, 0=none, -1=decreasing)
@@ -1881,6 +1896,9 @@
reg_alpha,
reg_lambda,
max_delta_step,
+ cat_smooth,
+ max_cat_threshold,
+ cat_include,
Native._make_pointer(max_leaves_arr, np.int64, is_null_allowed=True),
Native._make_pointer(monotone_constraints, np.int32, is_null_allowed=True),
ct.byref(avg_gain),
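Note that the pre-existing flag values were renumbered to make room, so any external code caching the old integers would silently pick up new meanings. For reference, a small sketch of how the string-valued missing option resolves to these bits, mirroring the branching added in _boost.py above; "gain" sets no bit, leaving the native code to pick the missing bin's placement by gain:

# Sketch: mapping the `missing` strings onto the renumbered TermBoostFlags
# bits; the hex values are copied from the class definition above.
MISSING_TO_FLAG = {
    "low": 0x00000080,       # TermBoostFlags_MissingLow
    "high": 0x00000100,      # TermBoostFlags_MissingHigh
    "separate": 0x00000200,  # TermBoostFlags_MissingSeparate
    "drop": 0x00000400,      # TermBoostFlags_MissingDrop
    "gain": 0x00000000,      # no flag set
}

def apply_missing_flag(term_boost_flags, missing):
    try:
        return term_boost_flags | MISSING_TO_FLAG[missing]
    except KeyError:
        raise Exception(f"Unrecognized missing option {missing}.") from None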
