diff --git a/gensim/matutils.py b/gensim/matutils.py
index 66a241a1f9..80fd1e8c29 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -692,6 +692,7 @@ def unitvec(vec, norm='l2', return_norm=False):
     """
     if norm not in ('l1', 'l2'):
         raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)
+
     if scipy.sparse.issparse(vec):
         vec = vec.tocsr()
         if norm == 'l1':
@@ -699,10 +700,14 @@ def unitvec(vec, norm='l2', return_norm=False):
         if norm == 'l2':
             veclen = np.sqrt(np.sum(vec.data ** 2))
         if veclen > 0.0:
+            # integer matrices cannot store fractional values: promote them before the in-place division
+            if np.issubdtype(vec.dtype, np.integer):
+                vec = vec.astype(float)
+            vec /= veclen
             if return_norm:
-                return vec / veclen, veclen
+                return vec, veclen
             else:
-                return vec / veclen
+                return vec
         else:
             if return_norm:
                 return vec, 1.
@@ -710,16 +715,18 @@ def unitvec(vec, norm='l2', return_norm=False):
                 return vec
 
     if isinstance(vec, np.ndarray):
-        vec = np.asarray(vec, dtype=float)
         if norm == 'l1':
             veclen = np.sum(np.abs(vec))
         if norm == 'l2':
             veclen = blas_nrm2(vec)
         if veclen > 0.0:
+            # integer arrays are promoted to float; float arrays keep their dtype via the astype() below
+            if np.issubdtype(vec.dtype, np.integer):
+                vec = vec.astype(float)
             if return_norm:
-                return blas_scal(1.0 / veclen, vec), veclen
+                return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen
             else:
-                return blas_scal(1.0 / veclen, vec)
+                return blas_scal(1.0 / veclen, vec).astype(vec.dtype)
         else:
             if return_norm:
                 return vec, 1
diff --git a/gensim/test/test_matutils.py b/gensim/test/test_matutils.py
index 549c10904b..b079df9e43 100644
--- a/gensim/test/test_matutils.py
+++ b/gensim/test/test_matutils.py
@@ -7,6 +7,7 @@
 import logging
 import unittest
 import numpy as np
+from scipy import sparse
 from scipy.special import psi  # gamma function utils
 
 import gensim.matutils as matutils
@@ -141,6 +142,106 @@ def testDirichletExpectation(self):
         self.assertTrue(np.allclose(known_good, test_values), msg)
 
 
+def manual_unitvec(vec):
+    """Reference L2 normalisation of a dense or sparse `vec`, used to cross-check `matutils.unitvec`."""
+    vec = vec.astype(float)
+    if sparse.issparse(vec):
+        vec_sum_of_squares = vec.multiply(vec)
+        unit = 1. / np.sqrt(vec_sum_of_squares.sum())
+        return vec.multiply(unit)
+    else:
+        sum_vec_squared = np.sum(vec ** 2)
+        vec /= np.sqrt(sum_vec_squared)
+        return vec
+
+
+class UnitvecTestCase(unittest.TestCase):
+    """Check that `matutils.unitvec` keeps float dtypes and promotes integer input to a float dtype."""
+    def test_sparse_npfloat32(self):
+        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float32)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
+        self.assertEqual(input_vector.dtype, unit_vector.dtype)
+
+    def test_sparse_npfloat64(self):
+        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float64)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
+        self.assertEqual(input_vector.dtype, unit_vector.dtype)
+
+    def test_sparse_npint32(self):
+        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int32)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
+        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))
+
+    def test_sparse_npint64(self):
+        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int64)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
+        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))
+
+    def test_dense_npfloat32(self):
+        input_vector = np.random.uniform(size=(5,)).astype(np.float32)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
+        self.assertEqual(input_vector.dtype, unit_vector.dtype)
+
+    def test_dense_npfloat64(self):
+        input_vector = np.random.uniform(size=(5,)).astype(np.float64)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
+        self.assertEqual(input_vector.dtype, unit_vector.dtype)
+
+    def test_dense_npint32(self):
+        input_vector = np.random.randint(1, 10, size=5).astype(np.int32)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
+        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))
+
+    def test_dense_npint64(self):
+        input_vector = np.random.randint(1, 10, size=5).astype(np.int64)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
+        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))
+
+    def test_sparse_python_float(self):
+        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(float)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
+        self.assertEqual(input_vector.dtype, unit_vector.dtype)
+
+    def test_sparse_python_int(self):
+        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(int)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
+        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))
+
+    def test_dense_python_float(self):
+        input_vector = np.random.uniform(size=(5,)).astype(float)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
+        self.assertEqual(input_vector.dtype, unit_vector.dtype)
+
+    def test_dense_python_int(self):
+        input_vector = np.random.randint(1, 10, size=5).astype(int)
+        unit_vector = matutils.unitvec(input_vector)
+        man_unit_vector = manual_unitvec(input_vector)
+        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
+        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()