From be64a0ffe94a09c8d214432332e8c03875a9decf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 11 Aug 2012 11:34:28 +0200 Subject: [PATCH] use built-in `hash` instead of zlib --- gensim/corpora/hashdictionary.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index df737bf0c2..1f24d29b73 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -29,19 +29,19 @@ class RestrictedHash: """ Mimics a dict, using a restricted hash. """ - def __init__(self, key_range=32000, hash=zlib.adler32, maintain_reverse=True, debug=False): + def __init__(self, key_range=32000, myhash=hash, maintain_reverse=True, debug=False): """ - Initialize a RestrictedHash with given key range and hash function. + Initialize a RestrictedHash with given key range and hash function. maintain_reverse determines whether to keep a dict mapping the inverse hash function.. """ self.key_range = key_range - self.hash = hash + self.myhash = myhash self.debug = debug self.maintain_reverse = maintain_reverse self.reverse = {} self.debug_reverse = {} - + def __len__(self): """ Reports the size of the domain of possible keys. @@ -82,15 +82,15 @@ def values(self): def keys(self): return self.reverse.values() - + def subset(self, key_subset): self.reverse = dict((k,v) for k, v in self.reverse.iteritems() if k in key_subset) def restricted_hash(self, key): """Calculates the hash mod the range""" - return self.hash(key) % self.key_range + return self.myhash(key) % self.key_range + - class HashDictionary(utils.SaveLoad, UserDict.DictMixin): """ @@ -100,8 +100,8 @@ class HashDictionary(utils.SaveLoad, UserDict.DictMixin): bag-of-words representation: a list of (word_id, word_frequency) 2-tuples """ - def __init__(self, documents=None, id_range=32000, hash=zlib.adler32, debug=False): - self.token2id = RestrictedHash(key_range=id_range, hash=hash, debug=debug) + def __init__(self, documents=None, id_range=32000, myhash=hash, debug=False): + self.token2id = RestrictedHash(key_range=id_range, myhash=myhash, debug=debug) self.id2token = self.token2id.reverse # reverse mapping for token2id; only formed on request, to save memory self.dfs = {} # document frequencies: tokenId -> in how many documents this token appeared self.num_docs = 0 # number of documents processed @@ -129,7 +129,7 @@ def __len__(self): def __str__(self): return ("HashDictionary(%i id range)" % len(self)) - + @staticmethod def from_documents(documents): @@ -169,7 +169,7 @@ def doc2bow(self, document, allow_update=False, return_missing=False): by one. If `allow_update` is **not** set, this function is `const`, aka read-only. - + """ result = {} missing = {} @@ -224,8 +224,8 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): if keep_n is not None: good_ids = good_ids[:keep_n] - self.token2id.subset(key_subset=good_ids) - self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() + self.token2id.subset(key_subset=good_ids) + self.dfs = dict((tokenid, freq) for tokenid, freq in self.dfs.iteritems() if tokenid in good_ids) logger.info("keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents" % (len(good_ids), no_below, no_above_abs, 100.0 * no_above))