
Commit 489a7d8
Merge pull request #2362 from daxiongshu/fea-logloss-cupy
[REVIEW] Binary and Multi classification log loss / cross entropy
dantegd authored Jul 14, 2020
2 parents 9bb46fd + c19f605 commit 489a7d8
Showing 4 changed files with 152 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -22,6 +22,7 @@
- PR #2308: Using a fixture for the Dask client to eliminate the possibility of not closing
- PR #2345: Make the C++ logger level definition the same as the python layer
- PR #2329: Add short commit hash to conda package name
- PR #2362: Implement binary/multi-classification log loss with cupy
- PR #2363: Update threshold and make other changes for stress tests
- PR #2371: Updating MBSGD tests to use larger batches
- PR #2380: Pinning libcumlprims version to ease future updates
1 change: 1 addition & 0 deletions python/cuml/metrics/__init__.py
@@ -22,6 +22,7 @@
from cuml.metrics.accuracy import accuracy_score
from cuml.metrics.cluster.adjustedrandindex import adjusted_rand_score
from cuml.metrics._ranking import roc_auc_score
from cuml.metrics._classification import log_loss
from cuml.metrics.cluster.homogeneity_score import homogeneity_score
from cuml.metrics.cluster.completeness_score import completeness_score
from cuml.metrics.cluster.mutual_info_score import mutual_info_score
104 changes: 104 additions & 0 deletions python/cuml/metrics/_classification.py
@@ -0,0 +1,104 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import cupy as cp
import numpy as np
from cuml.common.memory_utils import with_cupy_rmm
from cuml.common import input_to_cuml_array


@with_cupy_rmm
def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None):
""" Log loss, aka logistic loss or cross-entropy loss.
This is the loss function used in (multinomial) logistic regression
and extensions of it such as neural networks, defined as the negative
log-likelihood of a logistic model that returns ``y_pred`` probabilities
for its training data ``y_true``.
The log loss is only defined for two or more labels.
Parameters
----------
y_true : array-like, shape = (n_samples,)
y_pred : array-like of float,
shape = (n_samples, n_classes) or (n_samples,)
eps : float
Log loss is undefined for p=0 or p=1, so probabilities are
clipped to max(eps, min(1 - eps, p)).
normalize : bool, optional (default=True)
If true, return the mean loss per sample.
Otherwise, return the sum of the per-sample losses.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
loss : float
Examples
--------
>>> from cuml.metrics import log_loss
>>> import numpy as np
>>> log_loss(np.array([1, 0, 0, 1]),
... np.array([[.1, .9], [.9, .1], [.8, .2], [.35, .65]]))
0.21616...
References
----------
C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
p. 209.
Notes
-----
The logarithm used is the natural logarithm (base-e).
"""
    y_true, n_rows, n_cols, ytype = \
        input_to_cuml_array(y_true, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64])

    y_true = y_true.to_output('cupy')
    if y_true.dtype.kind == 'f' and np.any(y_true != y_true.astype(int)):
        raise ValueError("'y_true' can only have integer values")
    if y_true.min() < 0:
        raise ValueError("'y_true' cannot have negative values")

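    # Validate y_pred and require one prediction row per entry in y_true.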
    y_pred, _, _, _ = \
        input_to_cuml_array(y_pred, check_dtype=[np.int32, np.int64,
                                                 np.float32, np.float64],
                            check_rows=n_rows)

    y_pred = y_pred.to_output('cupy')
    y_true_max = y_true.max()
    if (y_pred.ndim == 1 and y_true_max > 1) \
            or (y_pred.ndim > 1 and y_pred.shape[1] <= y_true_max):
        raise ValueError("The shape of y_pred doesn't "
                         "match the number of classes")

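    # Clip probabilities away from 0 and 1 so cp.log never sees an exact
    # 0, and promote 1-D binary predictions to an (n_samples, 2) matrix.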
    y_true = y_true.astype('int32')
    y_pred = cp.clip(y_pred, eps, 1 - eps)
    if y_pred.ndim == 1:
        y_pred = cp.expand_dims(y_pred, axis=1)
    if y_pred.shape[1] == 1:
        y_pred = cp.hstack([1 - y_pred, y_pred])

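    # Renormalize each row to sum to one, then take -log of the
    # probability assigned to each sample's true class.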
    y_pred /= cp.sum(y_pred, axis=1, keepdims=True)
    loss = -cp.log(y_pred)[cp.arange(y_pred.shape[0]), y_true]
    return _weighted_sum(loss, sample_weight, normalize).item()


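# Weighted reduction helper mirroring scikit-learn's behavior: the
# (optionally weighted) mean when normalize=True, otherwise a
# (weighted) sum of the per-sample scores.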
def _weighted_sum(sample_score, sample_weight, normalize):
    if normalize:
        return cp.average(sample_score, weights=sample_weight)
    elif sample_weight is not None:
        return cp.dot(sample_score, sample_weight)
    else:
        return sample_score.sum()
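
A quick usage sketch of the new API (illustrative, not part of the diff); it assumes a GPU-enabled environment with cuML installed, and the array names here are made up:

    import cupy as cp
    from cuml.metrics import log_loss

    y_true = cp.array([0, 1, 1, 0])

    # Binary case: 1-D probabilities of the positive class
    p_binary = cp.array([0.2, 0.7, 0.9, 0.4])
    print(log_loss(y_true, p_binary))

    # Equivalent (n_samples, 2) matrix; rows are renormalized internally
    p_matrix = cp.stack([1 - p_binary, p_binary], axis=1)
    print(log_loss(y_true, p_matrix))

    # Sum of weighted per-sample losses instead of the mean
    print(log_loss(y_true, p_matrix, normalize=False,
                   sample_weight=cp.array([1.0, 2.0, 1.0, 1.0])))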
46 changes: 46 additions & 0 deletions python/cuml/test/test_metrics.py
@@ -33,6 +33,7 @@

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score as sk_acc_score
from sklearn.metrics import log_loss as sklearn_log_loss
from sklearn.metrics.cluster import adjusted_rand_score as sk_ars
from sklearn.metrics.cluster import homogeneity_score as sk_homogeneity_score
from sklearn.metrics.cluster import completeness_score as sk_completeness_score
@@ -52,6 +53,7 @@
from cuml.common import has_scipy

from cuml.metrics import roc_auc_score
from cuml.metrics import log_loss
from sklearn.metrics import roc_auc_score as sklearn_roc_auc_score


@@ -643,3 +645,47 @@ def test_roc_auc_score_at_limits():

    with pytest.raises(ValueError, match=err_msg):
        roc_auc_score(y_true, y_pred)


def test_log_loss():
    y_true = np.array([0, 0, 1, 1])
    y_pred = np.array([0.1, 0.4, 0.35, 0.8])
    assert_almost_equal(log_loss(y_true, y_pred),
                        sklearn_log_loss(y_true, y_pred))

    y_true = np.array([0, 0, 1, 1, 0])
    y_pred = np.array([0.8, 0.4, 0.4, 0.8, 0.8])
    assert_almost_equal(log_loss(y_true, y_pred),
                        sklearn_log_loss(y_true, y_pred))


@pytest.mark.parametrize('n_samples', [500, 500000])
@pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64])
def test_log_loss_random(n_samples, dtype):
    y_true, _, _, _ = generate_random_labels(
        lambda rng: rng.randint(0, 10, n_samples).astype(dtype))

    y_pred, _, _, _ = generate_random_labels(
        lambda rng: rng.rand(n_samples, 10))

    assert_almost_equal(log_loss(y_true, y_pred),
                        sklearn_log_loss(y_true, y_pred))


def test_log_loss_at_limits():
    y_true = np.array([0., 1., 2.], dtype=np.float64)
    y_pred = np.array([0., 0.5, 1.], dtype=np.float64)

    err_msg = ("The shape of y_pred doesn't "
               "match the number of classes")

    with pytest.raises(ValueError, match=err_msg):
        log_loss(y_true, y_pred)

    y_true = np.array([0., 0.5, 1.0], dtype=np.float64)
    y_pred = np.array([0., 0.5, 1.], dtype=np.float64)

    err_msg = "'y_true' can only have integer values"
    with pytest.raises(ValueError, match=err_msg):
        log_loss(y_true, y_pred)
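
A possible follow-up test (hypothetical, not part of this PR) would exercise the normalize and sample_weight paths against scikit-learn as well; a sketch in the file's existing style, assuming the cupy backend accepts host-side weight arrays:

    def test_log_loss_weighted():
        # Hypothetical extension: check the unnormalized and weighted
        # paths against scikit-learn's reference implementation.
        y_true = np.array([0, 1, 2, 1, 0])
        y_pred = np.array([[0.7, 0.2, 0.1],
                           [0.1, 0.8, 0.1],
                           [0.2, 0.2, 0.6],
                           [0.3, 0.4, 0.3],
                           [0.6, 0.3, 0.1]])
        weights = np.array([1.0, 2.0, 1.0, 0.5, 1.0])

        assert_almost_equal(
            log_loss(y_true, y_pred, normalize=False),
            sklearn_log_loss(y_true, y_pred, normalize=False))
        assert_almost_equal(
            log_loss(y_true, y_pred, sample_weight=weights),
            sklearn_log_loss(y_true, y_pred, sample_weight=weights))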
