Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix return dtype for matutils.unitvec according to input dtype. Fix #1722 #1992

Merged
merged 30 commits into from
Apr 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
82b8d17
matutils.unitvec bug
o-P-o Jan 31, 2018
834b042
matutils.unitvec fix tests
o-P-o Jan 31, 2018
cf463b2
unitvec equal input and return types
o-P-o Mar 21, 2018
5656835
Update and rename test_unitvec to test_unitvec.py
o-P-o Mar 21, 2018
406ed66
Update matutils.py
o-P-o Mar 21, 2018
e71afcd
Update matutils.py
o-P-o Mar 22, 2018
fe36408
Update test_unitvec.py
o-P-o Mar 22, 2018
50d011c
Update and rename gensim/test_unitvec.py to gensim/test/test_matutils.py
o-P-o Mar 24, 2018
f1a40ac
Merge branch 'develop' into develop
o-P-o Mar 26, 2018
ead451f
Update matutils.py
o-P-o Mar 26, 2018
ae03291
Update test_matutils.py
o-P-o Mar 26, 2018
cab90a8
Update test_matutils.py
o-P-o Mar 26, 2018
141833d
Update following review
o-P-o Mar 27, 2018
218fe42
Update: attempt to solve Travis errors
o-P-o Mar 27, 2018
5fd1004
Update test_matutils.py
o-P-o Mar 27, 2018
80628c0
Update matutils.py
o-P-o Mar 28, 2018
768226b
Update matutils.py
o-P-o Mar 28, 2018
438f763
Update test_matutils.py
o-P-o Mar 28, 2018
f73076a
Addressing travis errors
o-P-o Mar 28, 2018
cd50529
Remove unnecessary dtype assignment
o-P-o Mar 28, 2018
11b0dde
return_norm statements for array instance case
o-P-o Mar 28, 2018
2e86529
Update test_matutils.py
o-P-o Mar 29, 2018
30d6284
Reduce line repetition
o-P-o Mar 29, 2018
d9cfb0d
Reduce repeated lines
o-P-o Mar 29, 2018
2e8eca1
Update test_matutils.py
o-P-o Mar 29, 2018
6beec75
Remove some redundant code from unitvec
o-P-o Apr 12, 2018
365a722
UnitvecTestCase update
o-P-o Apr 12, 2018
9327f68
Small typo fix
o-P-o Apr 12, 2018
a42cf38
Trailing white-space fix for Travis
o-P-o Apr 12, 2018
1078bdd
Improve code quality and remove no-op
o-P-o Apr 13, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,34 +692,39 @@ def unitvec(vec, norm='l2', return_norm=False):
"""
if norm not in ('l1', 'l2'):
raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)

if scipy.sparse.issparse(vec):
vec = vec.tocsr()
if norm == 'l1':
veclen = np.sum(np.abs(vec.data))
if norm == 'l2':
veclen = np.sqrt(np.sum(vec.data ** 2))
if veclen > 0.0:
if np.issubdtype(vec.dtype, np.int):
vec = vec.astype(np.float)
vec /= veclen
if return_norm:
return vec / veclen, veclen
return vec, veclen
else:
return vec / veclen
return vec
else:
if return_norm:
return vec, 1.
else:
return vec

if isinstance(vec, np.ndarray):
vec = np.asarray(vec, dtype=float)
if norm == 'l1':
veclen = np.sum(np.abs(vec))
if norm == 'l2':
veclen = blas_nrm2(vec)
if veclen > 0.0:
if np.issubdtype(vec.dtype, np.int):
vec = vec.astype(np.float)
if return_norm:
return blas_scal(1.0 / veclen, vec), veclen
return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen
else:
return blas_scal(1.0 / veclen, vec)
return blas_scal(1.0 / veclen, vec).astype(vec.dtype)
else:
if return_norm:
return vec, 1
Expand Down
101 changes: 101 additions & 0 deletions gensim/test/test_matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import logging
import unittest
import numpy as np
from scipy import sparse
from scipy.special import psi # gamma function utils

import gensim.matutils as matutils
Expand Down Expand Up @@ -141,6 +142,106 @@ def testDirichletExpectation(self):
self.assertTrue(np.allclose(known_good, test_values), msg)


def manual_unitvec(vec):
# manual unit vector calculation for UnitvecTestCase
vec = vec.astype(np.float)
if sparse.issparse(vec):
vec_sum_of_squares = vec.multiply(vec)
unit = 1. / np.sqrt(vec_sum_of_squares.sum())
return vec.multiply(unit)
elif not sparse.issparse(vec):
sum_vec_squared = np.sum(vec ** 2)
vec /= np.sqrt(sum_vec_squared)
return vec


class UnitvecTestCase(unittest.TestCase):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for reorganizing the tests! Looks much better now IMO

# test unitvec
def test_sparse_npfloat32(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertEqual(input_vector.dtype, unit_vector.dtype)

def test_sparse_npfloat64(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float64)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertEqual(input_vector.dtype, unit_vector.dtype)

def test_sparse_npint32(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))

def test_sparse_npint64(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int64)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))

def test_dense_npfloat32(self):
input_vector = np.random.uniform(size=(5,)).astype(np.float32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertEqual(input_vector.dtype, unit_vector.dtype)

def test_dense_npfloat64(self):
input_vector = np.random.uniform(size=(5,)).astype(np.float64)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertEqual(input_vector.dtype, unit_vector.dtype)

def test_dense_npint32(self):
input_vector = np.random.randint(10, size=5).astype(np.int32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))

def test_dense_npint64(self):
input_vector = np.random.randint(10, size=5).astype(np.int32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))

def test_sparse_python_float(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(float)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertEqual(input_vector.dtype, unit_vector.dtype)

def test_sparse_python_int(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(int)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))

def test_dense_python_float(self):
input_vector = np.random.uniform(size=(5,)).astype(float)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertEqual(input_vector.dtype, unit_vector.dtype)

def test_dense_python_int(self):
input_vector = np.random.randint(10, size=5).astype(int)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()