Skip to content

Commit

Permalink
re #5: sped up similarity queries when only top-n results are required
Browse files Browse the repository at this point in the history
  • Loading branch information
piskvorky committed Feb 27, 2011
1 parent 923d49c commit 3a81762
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 5 deletions.
4 changes: 1 addition & 3 deletions src/gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,7 @@ def __getitem__(self, doc):
if self.numBest is None:
return allSims
else:
tops = [(docNo, sim) for docNo, sim in enumerate(allSims) if sim > 0]
tops = sorted(tops, key = lambda item: -item[1]) # sort by -sim => highest cossim first
return tops[ : self.numBest] # return at most numBest top 2-tuples (docId, docSim)
return matutils.full2sparse_clipped(allSims, self.numBest)


def __iter__(self):
Expand Down
20 changes: 18 additions & 2 deletions src/gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,22 @@ def full2sparse(vec, eps=1e-9):

dense2vec = full2sparse


def full2sparse_clipped(vec, topn, eps=1e-9):
    """
    Like `full2sparse`, but only return the `topn` greatest elements (not all).

    `vec` is a dense 1-D vector (anything `numpy.argsort` accepts and that can
    be indexed by position). Elements are visited from the greatest value to
    the smallest; those whose absolute value does not exceed `eps` are skipped.

    Return a list of `(index, value)` 2-tuples, ordered by decreasing value,
    containing at most `topn` entries. If `topn` is None, every element that
    passes the `eps` filter is returned. If `topn` is zero or negative, the
    result is an empty list.
    """
    # Without this guard, the `len(result) == topn` stop condition below could
    # never fire for topn <= 0 (the list already has >= 1 entry when it is
    # checked), so *all* elements would be returned instead of none.
    if topn is not None and topn <= 0:
        return []

    # use numpy.argsort and only form tuples that are actually returned.
    # (original author's note: this is about 40x faster than explicitly forming
    # all 2-tuples to run sort() or heapq.nlargest() on.)
    result = []
    for i in numpy.argsort(vec)[::-1]:  # indices in order of decreasing value
        if abs(vec[i]) > eps:  # ignore features with near-zero weight
            result.append((i, vec[i]))
            if len(result) == topn:  # collected enough; stop early
                break
    return result


def corpus2dense(corpus, num_terms):
"""
Convert corpus into a dense numpy array (documents will be columns).
Expand Down Expand Up @@ -146,8 +162,8 @@ def vecLen(vec):
return vecLen


blas_nrm2 = blas('nrm2', numpy.array([], dtype = float))
blas_scal = blas('scal', numpy.array([], dtype = float))
blas_nrm2 = blas('nrm2', numpy.array([], dtype=float))
blas_scal = blas('scal', numpy.array([], dtype=float))

def unitVec(vec):
"""
Expand Down

0 comments on commit 3a81762

Please sign in to comment.