Skip to content

Commit

Permalink
Fix ALS loss calculation on GPU (#663)
Browse files Browse the repository at this point in the history
The GPU ALS model would sometimes return incorrect results with the
`calculate_training_loss` parameter enabled. This happened when
number_of_users * number_of_items was bigger than 2**31, due to
an integer overflow in the loss function calculation.

Fix the overflow, and add tests that would have caught this bug.

Closes #367
Closes #441
  • Loading branch information
benfred authored Jun 6, 2023
1 parent 6c9db63 commit d869ed3
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 3 deletions.
1 change: 1 addition & 0 deletions implicit/cpu/_als.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def _least_squares_cg(integral[:] indptr, integral[:] indices, float[:] data,


def calculate_loss(Cui, X, Y, regularization, num_threads=0):
    """ Calculates the loss for an ALS model """
    # the Cython kernel expects float32 confidence values, so convert once up front
    confidence = Cui.data.astype('float32')
    return _calculate_loss(Cui, Cui.indptr, Cui.indices, confidence,
                           X, Y, regularization, num_threads)

Expand Down
3 changes: 3 additions & 0 deletions implicit/cpu/als.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,3 +556,6 @@ def least_squares_cg(Cui, X, Y, regularization, num_threads=0, cg_steps=3):
rsold = rsnew

X[u] = x


# Re-export the Cython implementation so callers can reach it as
# implicit.cpu.als.calculate_loss (mirrors the GPU module's public name).
calculate_loss = _als.calculate_loss
5 changes: 3 additions & 2 deletions implicit/gpu/als.cu
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ float LeastSquaresSolver::calculate_loss(const CSRMatrix &Cui, const Matrix &X,
size_t item_count = Y.rows, factors = Y.cols, user_count = X.rows;

Matrix YtY(factors, factors, NULL);
calculate_yty(Y, &YtY, regularization);
calculate_yty(Y, &YtY, 0.0);

float temp[2] = {0, 0};
Matrix output(2, 1, temp);
Expand All @@ -276,7 +276,8 @@ float LeastSquaresSolver::calculate_loss(const CSRMatrix &Cui, const Matrix &X,
CHECK_CUDA(cudaDeviceSynchronize());
output.to_host(temp);

return temp[0] / (temp[1] + Cui.rows * Cui.cols - Cui.nonzeros);
size_t rows = Cui.rows, cols = Cui.cols;
return temp[0] / (temp[1] + rows * cols - Cui.nonzeros);
}

LeastSquaresSolver::~LeastSquaresSolver() {
Expand Down
14 changes: 14 additions & 0 deletions implicit/gpu/als.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,17 @@ def __setstate__(self, state):
self._XtX = implicit.gpu.Matrix(self._XtX)
if self._YtY is not None:
self._YtY = implicit.gpu.Matrix(self._YtY)


def calculate_loss(Cui, X, Y, regularization, solver=None):
    """Calculates the loss for an ALS model"""
    if solver is None:
        solver = implicit.gpu.LeastSquaresSolver()

    # promote host-side inputs to their GPU wrappers when necessary
    Cui = Cui if isinstance(Cui, implicit.gpu.CSRMatrix) else implicit.gpu.CSRMatrix(Cui)
    X = X if isinstance(X, implicit.gpu.Matrix) else implicit.gpu.Matrix(X)
    Y = Y if isinstance(Y, implicit.gpu.Matrix) else implicit.gpu.Matrix(Y)

    return solver.calculate_loss(Cui, X, Y, regularization)
52 changes: 51 additions & 1 deletion tests/als_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,56 @@ def test_incremental_retrain(use_gpu):
assert set(ids) == {1, 100, 101}


@pytest.mark.parametrize("use_gpu", [True, False] if HAS_CUDA else [False])
def test_calculate_loss_simple(use_gpu):
    backend = implicit.gpu.als if use_gpu else implicit.cpu.als
    calculate_loss = backend.calculate_loss

    # a single user who liked item 0 and never interacted with item 1
    n_users, n_items = 1, 2
    ratings = coo_matrix(([1.0], ([0], [0])), shape=(n_users, n_items)).tocsr()

    # factors chosen to be perfectly wrong, so the expected loss is easy to state
    item_factors = np.array([[0.0], [1.0]], dtype="float32")
    user_factors = np.array([[1.0]], dtype="float32")

    # with no regularization the loss is 1.0; regularization of 1.0 adds 1.0 more
    for reg, expected in ((0, 1.0), (1.0, 2.0)):
        loss = calculate_loss(ratings, user_factors, item_factors, regularization=reg)
        assert loss == pytest.approx(expected)


@pytest.mark.skipif(not implicit.gpu.HAS_CUDA, reason="needs cuda build")
@pytest.mark.parametrize("n_users", [2**13, 2**19])
@pytest.mark.parametrize("n_items", [2**19])
@pytest.mark.parametrize("n_samples", [2**20])
@pytest.mark.parametrize("regularization", [0.0, 1.0, 500000.0])
def test_gpu_loss(n_users, n_items, n_samples, regularization):
    # we used to have some errors in the gpu loss function
    # if n_items * n_users > 2**31. Test that the loss on the gpu
    # matches that on the cpu
    # https://github.com/benfred/implicit/issues/441
    # https://github.com/benfred/implicit/issues/367

    # use a seeded local generator so failures are reproducible and the
    # global np.random state is left untouched
    rng = np.random.default_rng(42)
    liked_items = rng.integers(0, n_items, n_samples)
    liked_users = rng.integers(0, n_users, n_samples)
    ratings = coo_matrix(
        (np.ones(n_samples), (liked_users, liked_items)), shape=(n_users, n_items)
    ).tocsr()

    factors = 32
    item_factors = rng.random((n_items, factors)).astype("float32")
    user_factors = rng.random((n_users, factors)).astype("float32")

    gpu_loss = implicit.gpu.als.calculate_loss(ratings, user_factors, item_factors, regularization)
    cpu_loss = implicit.cpu.als.calculate_loss(ratings, user_factors, item_factors, regularization)

    assert gpu_loss == pytest.approx(cpu_loss, rel=1e-5)


def test_calculate_loss_segfault():
# this code used to segfault, because of a bug in calculate_loss
factors = 1
Expand All @@ -311,5 +361,5 @@ def test_calculate_loss_segfault():
user_factors = np.random.random((n_users, factors)).astype("float32")
c_ui = coo_matrix(([1.0, 1.0], ([0, 1], [0, 1])), shape=(n_users, n_items)).tocsr()

loss = implicit.cpu._als.calculate_loss(c_ui, user_factors, item_factors, regularization)
loss = implicit.cpu.als.calculate_loss(c_ui, user_factors, item_factors, regularization)
assert loss > 0

0 comments on commit d869ed3

Please sign in to comment.