From 4c6061c5206c7d14a3b66b1ee519aae118861734 Mon Sep 17 00:00:00 2001
From: Quoc-Tuan Truong
Date: Tue, 10 Sep 2019 17:29:56 +0800
Subject: [PATCH] Remove usage of old Dataset from IBPR model (#227)

---
 cornac/models/ibpr/ibpr.py       | 77 ++++++++++++--------------------
 cornac/models/ibpr/recom_ibpr.py | 20 ++-------
 examples/ibpr_example.py         |  5 ++-
 3 files changed, 35 insertions(+), 67 deletions(-)

diff --git a/cornac/models/ibpr/ibpr.py b/cornac/models/ibpr/ibpr.py
index 3be9ac04e..9442a23e7 100644
--- a/cornac/models/ibpr/ibpr.py
+++ b/cornac/models/ibpr/ibpr.py
@@ -13,44 +13,15 @@
 # limitations under the License.
 # ============================================================================
 
-import random
 import numpy as np
 import torch
+from tqdm import tqdm
 
-from ...utils.data_utils import Dataset
-
-"""Firstly, we define a helper function to generate\sample training ordinal triplets:
-    Step 1:
-    given rated item i, randomly choose item j and check whether rating of j is lower than i,
-    if not randomly sample another item.
-    each row of the sampled data in the following form:
-    [userId itemId_i itemId_j]
-    for each user u, he/she prefers item i over item j.
-    """
-
-
-def sample_data(X, data):
-    sampled_data = np.zeros((data.shape[0], 5), dtype=np.int)
-    data = data.astype(int)
-
-    for k in range(0, data.shape[0]):
-        u = data[k, 0]
-        i = data[k, 1]
-        ratingi = data[k, 2]
-        j = random.randint(0, X.shape[1] - 1)
-
-        while X[u, j] > ratingi:
-            j = random.randint(0, X.shape[1] - 1)
-
-        sampled_data[k, :] = [u, i, j, ratingi, X[u, j]]
-
-    return sampled_data
-
-
-def ibpr(X, data, k, lamda=0.001, n_epochs=150, learning_rate=0.05, batch_size=100, init_params=None):
-    # X = sp.csr_matrix(X)
-    Data = Dataset(data)
+def ibpr(train_set, k, lamda=0.001, n_epochs=150, learning_rate=0.05, batch_size=100,
+         init_params=None, verbose=False):
+    X = train_set.csr_matrix
 
     # Initial user factors
     if init_params['U'] is None:
@@ -67,19 +38,21 @@ def ibpr(X, data, k, lamda=0.001, n_epochs=150, learning_rate=0.05, batch_size=1
     V = torch.from_numpy(V)
 
     optimizer = torch.optim.Adam([U, V], lr=learning_rate)
-    for epoch in range(n_epochs):
-        num_steps = int(Data.data.shape[0] / batch_size)
-        for i in range(1, num_steps + 1):
-            batch_c, _ = Data.next_batch(batch_size)
-            # print(batch_c, idx)
-            sampled_batch = sample_data(X, batch_c)
-            regU = U[sampled_batch[:, 0], :]
-            regI = V[sampled_batch[:, 1], :]
-            regJ = V[sampled_batch[:, 2], :]
+    for epoch in range(1, n_epochs + 1):
+        sum_loss = 0.
+ count = 0 + progress_bar = tqdm(total=train_set.num_batches(batch_size), + desc='Epoch {}/{}'.format(epoch, n_epochs), + disable=not verbose) + + for batch_u, batch_i, batch_j in train_set.uij_iter(batch_size, shuffle=True): + regU = U[batch_u, :] + regI = V[batch_i, :] + regJ = V[batch_j, :] - regU_unq = U[np.unique(sampled_batch[:, 0]), :] - regI_unq = V[np.unique(sampled_batch[:, 1:]), :] + regU_unq = U[np.unique(batch_u), :] + regI_unq = V[np.union1d(batch_i, batch_j), :] regU_norm = regU / regU.norm(dim=1)[:, None] regI_norm = regI / regI.norm(dim=1)[:, None] @@ -88,14 +61,22 @@ def ibpr(X, data, k, lamda=0.001, n_epochs=150, learning_rate=0.05, batch_size=1 Scorei = torch.acos(torch.clamp(torch.sum(regU_norm * regI_norm, dim=1), -1 + 1e-7, 1 - 1e-7)) Scorej = torch.acos(torch.clamp(torch.sum(regU_norm * regJ_norm, dim=1), -1 + 1e-7, 1 - 1e-7)) - loss = lamda * (regU_unq.norm().pow(2) + regI_unq.norm().pow(2)) - torch.log( - torch.sigmoid(Scorej - Scorei)).sum() + loss = lamda * (regU_unq.norm().pow(2) + regI_unq.norm().pow(2)) \ + - torch.log(torch.sigmoid(Scorej - Scorei)).sum() optimizer.zero_grad() loss.backward() optimizer.step() - print('epoch:', epoch, 'loss:', loss) - # since the user's preference is defined by the angular distance, we can normalize the user/item vectors without changing the ranking + sum_loss += loss.data.item() + count += len(batch_u) + if count % (batch_size * 10) == 0: + progress_bar.set_postfix(loss=(sum_loss / count)) + progress_bar.update(1) + + progress_bar.close() + + # since the user's preference is defined by the angular distance, + # we can normalize the user/item vectors without changing the ranking U = torch.nn.functional.normalize(U, p=2, dim=1) V = torch.nn.functional.normalize(V, p=2, dim=1) U = U.data.cpu().numpy() diff --git a/cornac/models/ibpr/recom_ibpr.py b/cornac/models/ibpr/recom_ibpr.py index c9727870e..48617f7cc 100644 --- a/cornac/models/ibpr/recom_ibpr.py +++ b/cornac/models/ibpr/recom_ibpr.py @@ -65,7 +65,7 @@ class IBPR(Recommender): In Proceedings of the 2017 ACM on Conference on Information and Knowledge Management (pp. 1389-1398). ACM. 
""" - def __init__(self, k=20, max_iter=100, learning_rate=0.05, lamda=0.001, batch_size=100, name="ibpr", trainable=True, + def __init__(self, k=20, max_iter=100, learning_rate=0.05, lamda=0.001, batch_size=100, name="IBPR", trainable=True, verbose=False, init_params=None): Recommender.__init__(self, name=name, trainable=trainable, verbose=verbose) self.k = k @@ -98,25 +98,11 @@ def fit(self, train_set, val_set=None): from .ibpr import ibpr - X = self.train_set.matrix - - # change the data to original user Id item Id and rating format - cooX = X.tocoo() - data = np.ndarray(shape=(len(cooX.data), 3), dtype=float) - data[:, 0] = cooX.row - data[:, 1] = cooX.col - data[:, 2] = cooX.data - - if self.verbose: - print('Learning...') - res = ibpr(X, data, k=self.k, n_epochs=self.max_iter, lamda=self.lamda, learning_rate=self.learning_rate, - batch_size=self.batch_size, init_params=self.init_params) + res = ibpr(self.train_set, k=self.k, n_epochs=self.max_iter, lamda=self.lamda, learning_rate=self.learning_rate, + batch_size=self.batch_size, init_params=self.init_params, verbose=self.verbose) self.U = np.asarray(res['U']) self.V = np.asarray(res['V']) - if self.verbose: - print('Learning completed') - return self def score(self, user_idx, item_idx=None): diff --git a/examples/ibpr_example.py b/examples/ibpr_example.py index d2467af57..db70c6b4c 100644 --- a/examples/ibpr_example.py +++ b/examples/ibpr_example.py @@ -22,10 +22,11 @@ ml_1m = movielens.load_1m() # Instantiate an evaluation method. -ratio_split = RatioSplit(data=ml_1m, test_size=0.2, rating_threshold=1.0, exclude_unknowns=True) +ratio_split = RatioSplit(data=ml_1m, test_size=0.2, rating_threshold=1.0, + exclude_unknowns=True, verbose=True) # Instantiate a IBPR recommender model. -ibpr = IBPR(k=10, init_params={'U': None, 'V': None}) +ibpr = IBPR(k=10, init_params={'U': None, 'V': None}, verbose=True) # Instantiate evaluation metrics. rec_20 = cornac.metrics.Recall(k=20)