clip y_pred before log_loss
rakow committed Jul 15, 2024
1 parent e7c822a · commit 05835a5
Showing 2 changed files with 9 additions and 95 deletions.
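For context: scikit-learn deprecated the eps argument of log_loss in version 1.3 and later removed it, so callers are now expected to clip predicted probabilities themselves. A minimal sketch of the pattern this commit adopts (the data here is illustrative; the eps value mirrors the diff below):

import numpy as np
from sklearn.metrics import log_loss

eps = 0.0001  # clip probabilities at 0.01%, as in the diff below

# Hypothetical labels and per-class probabilities (rows sum to 1)
y_true = [1, 0, 1]
y_pred = np.array([[0.0, 1.0], [1.0, 0.0], [0.3, 0.7]])

# Clip once up front instead of passing eps= to log_loss
y_pred = np.clip(y_pred, eps, 1 - eps)
print(log_loss(y_true, y_pred, normalize=False))  # total negative log-likelihood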
matsim/calibration/run_simulations.py (8 additions, 8 deletions)
@@ -42,8 +42,7 @@ def sample_y_null(shares: np.array, num_persons: int, num_samples: int):

 def process_results(runs):
     """Process results of multiple simulations"""
-    from .utils import log_loss
-    from sklearn.metrics import accuracy_score
+    from sklearn.metrics import log_loss, accuracy_score
     from sklearn.preprocessing import LabelEncoder
 
     print("Processing results in %s" % runs)
@@ -99,14 +98,15 @@ def process_results(runs):

     # Compute likelihood with eps as 0.01%
     eps = 0.0001
+    y_pred = np.clip(y_pred, eps, 1 - eps)
 
     result = [
-        ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, eps=eps, normalize=False),
-         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, eps=eps, normalize=False)),
-        ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, eps=eps, normalize=True),
-         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, eps=eps, normalize=True)),
-        ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, eps=eps, normalize=False),
-         -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, eps=eps, normalize=False)),
+        ("Log likelihood", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=False),
+         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=False)),
+        ("Log likelihood (normalized)", -log_loss(y_true, y_pred, sample_weight=dfs.weight, normalize=True),
+         -log_loss(y_true, y_pred, sample_weight=dfs.weight * dists, normalize=True)),
+        ("Log likelihood (null)", -log_loss(y_true, y_null, sample_weight=dfs.weight, normalize=False),
+         -log_loss(y_true, y_null, sample_weight=dfs.weight * dists, normalize=False)),
         ("Mean Accuracy", np.mean(accs), np.mean(accs_d)),
         ("Samples", len(dfs), sum(dists)),
         ("Runs", len(pred_cols), len(pred_cols))
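A side note on the normalize flag used in these calls (my illustration, not part of the commit): with normalize=False, sklearn's log_loss returns the weighted sum of per-sample losses, i.e. a negative log-likelihood, while normalize=True returns the weighted mean, so the two differ exactly by the total weight:

import numpy as np
from sklearn.metrics import log_loss

# Hypothetical predictions and survey weights
y_true = [1, 0, 1]
y_pred = np.array([[0.2, 0.8], [0.6, 0.4], [0.3, 0.7]])
w = np.array([1.0, 2.0, 1.0])

total = log_loss(y_true, y_pred, sample_weight=w, normalize=False)  # sum of w_i * loss_i
mean = log_loss(y_true, y_pred, sample_weight=w, normalize=True)    # weighted average
assert np.isclose(total, mean * w.sum())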
matsim/calibration/utils.py (1 addition, 87 deletions)
@@ -5,13 +5,6 @@

 from scipy.special import xlogy
 from sklearn.preprocessing import LabelBinarizer, LabelEncoder
-from sklearn.utils import (
-    assert_all_finite,
-    check_array,
-    check_consistent_length,
-    column_or_1d,
-)
-from sklearn.metrics._classification import _weighted_sum
 
 from optuna.trial import TrialState
 
@@ -97,83 +90,4 @@ def _f(jvm_args, jar, config, params_path, run_dir, trial_number, run_args):
             jvm_args, jar, config, run_dir, trial_number, yaml_arg, params_path, run_args
         )
 
-    return _f
-
-
-def log_loss(y_true, y_pred, *, eps="auto", normalize=True, sample_weight=None, labels=None):
-    """Log loss, aka logistic loss or cross-entropy loss. Taken from scikit-learn 1.3."""
-    y_pred = check_array(
-        y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16]
-    )
-    if eps == "auto":
-        eps = np.finfo(y_pred.dtype).eps
-
-    check_consistent_length(y_pred, y_true, sample_weight)
-    lb = LabelBinarizer()
-
-    if labels is not None:
-        lb.fit(labels)
-    else:
-        lb.fit(y_true)
-
-    if len(lb.classes_) == 1:
-        if labels is None:
-            raise ValueError(
-                "y_true contains only one label ({0}). Please "
-                "provide the true labels explicitly through the "
-                "labels argument.".format(lb.classes_[0])
-            )
-        else:
-            raise ValueError(
-                "The labels array needs to contain at least two "
-                "labels for log_loss, "
-                "got {0}.".format(lb.classes_)
-            )
-
-    transformed_labels = lb.transform(y_true)
-
-    if transformed_labels.shape[1] == 1:
-        transformed_labels = np.append(
-            1 - transformed_labels, transformed_labels, axis=1
-        )
-
-    # Clipping
-    y_pred = np.clip(y_pred, eps, 1 - eps)
-
-    # If y_pred is of single dimension, assume y_true to be binary
-    # and then check.
-    if y_pred.ndim == 1:
-        y_pred = y_pred[:, np.newaxis]
-    if y_pred.shape[1] == 1:
-        y_pred = np.append(1 - y_pred, y_pred, axis=1)
-
-    # Check if dimensions are consistent.
-    transformed_labels = check_array(transformed_labels)
-    if len(lb.classes_) != y_pred.shape[1]:
-        if labels is None:
-            raise ValueError(
-                "y_true and y_pred contain different number of "
-                "classes {0}, {1}. Please provide the true "
-                "labels explicitly through the labels argument. "
-                "Classes found in "
-                "y_true: {2}".format(
-                    transformed_labels.shape[1], y_pred.shape[1], lb.classes_
-                )
-            )
-        else:
-            raise ValueError(
-                "The number of classes in labels is different "
-                "from that in y_pred. Classes found in "
-                "labels: {0}".format(lb.classes_)
-            )
-
-    # Renormalize
-    y_pred_sum = y_pred.sum(axis=1)
-    if not np.isclose(y_pred_sum, 1, rtol=1e-15, atol=5 * eps).all():
-        raise ValueError(
-            "y_pred contains values not summing to 1."
-        )
-    y_pred = y_pred / y_pred_sum[:, np.newaxis]
-    loss = -xlogy(transformed_labels, y_pred).sum(axis=1)
-
-    return _weighted_sum(loss, sample_weight, normalize)
+    return _f

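The helper deleted above was a vendored copy of scikit-learn 1.3's log_loss, apparently kept for its eps parameter. Its numerical core reduces to a weighted cross-entropy over one-hot labels; the condensed form below is my paraphrase, not code from the repository (weighted_nll is a hypothetical name, and pre-clipped probabilities are assumed):

import numpy as np
from scipy.special import xlogy

def weighted_nll(y_onehot, y_pred, sample_weight):
    # Per-sample cross-entropy: -sum_k y_k * log(p_k); xlogy(0, 0) is 0 by convention
    loss = -xlogy(y_onehot, y_pred).sum(axis=1)
    return float(np.dot(sample_weight, loss))  # matches normalize=False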