From 453e5b47d6f000c22b37e26dd86fa93db30671b2 Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 19:58:05 +0000 Subject: [PATCH 01/14] first attempt to convert few lines into numpy-style doc --- gensim/models/rpmodel.py | 41 ++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 0c8f7c8b26..7cf9a17045 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -5,35 +5,40 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -import logging +""" +Objects of this class allow building and maintaining a model for Random Projections +(also known as Random Indexing). -import numpy as np +For theoretical background on RP, see: Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis." -from gensim import interfaces, matutils, utils + The main methods are: + + 1. constructor, which creates the random projection matrix + 2. the [] method, which transforms a simple count representation into the TfIdf + space. +Model persistency is achieved via its load/save methods. -logger = logging.getLogger('gensim.models.rpmodel') +Examples: +--------- +>>> from gensim.models import rpmmodel +>>> rp = RpModel(corpus) +>>> print(rp[some_doc]) +>>> rp.save('/tmp/foo.rp_model') +""" -class RpModel(interfaces.TransformationABC): - """ - Objects of this class allow building and maintaining a model for Random Projections - (also known as Random Indexing). For theoretical background on RP, see: +import logging - Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis." +import numpy as np - The main methods are: +from gensim import interfaces, matutils, utils - 1. constructor, which creates the random projection matrix - 2. the [] method, which transforms a simple count representation into the TfIdf - space. 
- >>> rp = RpModel(corpus) - >>> print(rp[some_doc]) - >>> rp.save('/tmp/foo.rp_model') +logger = logging.getLogger('gensim.models.rpmodel') - Model persistency is achieved via its load/save methods. - """ + +class RpModel(interfaces.TransformationABC): def __init__(self, corpus, id2word=None, num_topics=300): """ From 79c71bad9955a700aa6e28c68926c06d7a7937bd Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 20:13:49 +0000 Subject: [PATCH 02/14] added parameters in documentation --- gensim/models/rpmodel.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 7cf9a17045..f48e8280ed 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -45,6 +45,14 @@ def __init__(self, corpus, id2word=None, num_topics=300): `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing. If not set, it will be determined from the corpus. + + + Parameters + ---------- + corpus : interfaces.CorpusABC + id2word : dict of int tostring + num_topics : int + """ self.id2word = id2word self.num_topics = num_topics @@ -57,6 +65,12 @@ def __str__(self): def initialize(self, corpus): """ Initialize the random projection matrix. + + + Parameters + ---------- + corpus : interfaces.CorpusABC + """ if self.id2word is None: logger.info("no word id mapping provided; initializing from corpus, assuming identity") @@ -80,6 +94,10 @@ def initialize(self, corpus): def __getitem__(self, bow): """ Return RP representation of the input vector and/or corpus. + + Parameters + ---------- + bow : interfaces.CorpusABC or document in sparse document format (=sequence of 2-tuples). 
""" # if the input vector is in fact a corpus, return a transformed corpus as result is_corpus, bow = utils.is_corpus(bow) From 4d748a6718b286dfee4dd2bb04c40ad52875f709 Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 20:32:18 +0000 Subject: [PATCH 03/14] more documentation --- gensim/models/rpmodel.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index f48e8280ed..4797682c3d 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -69,7 +69,7 @@ def initialize(self, corpus): Parameters ---------- - corpus : interfaces.CorpusABC + corpus : :class:`~interfaces.CorpusABC` """ if self.id2word is None: @@ -97,7 +97,13 @@ def __getitem__(self, bow): Parameters ---------- - bow : interfaces.CorpusABC or document in sparse document format (=sequence of 2-tuples). + bow : :class:`~interfaces.CorpusABC` (iterable of documents) or list of (int, int). + + Examples: + ------------- + >>> rp = RpModel(corpus) + >>> print(rp[some_doc]) + """ # if the input vector is in fact a corpus, return a transformed corpus as result is_corpus, bow = utils.is_corpus(bow) @@ -119,5 +125,12 @@ def __getitem__(self, bow): ] def __setstate__(self, state): + """ + Sets the internal state and updates freshly_loaded to True. Called when unpicked. 
+ + Parameters + ---------- + state : state of the class + """ self.__dict__ = state self.freshly_loaded = True From 00adee93f51367358427f070f161eee250adaefa Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 21:00:55 +0000 Subject: [PATCH 04/14] few corrections --- gensim/models/rpmodel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 4797682c3d..f7b5036f60 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -49,8 +49,8 @@ def __init__(self, corpus, id2word=None, num_topics=300): Parameters ---------- - corpus : interfaces.CorpusABC - id2word : dict of int tostring + corpus : :class:`~gensim.interfaces.CorpusABC` + id2word : dict of (int, string) num_topics : int """ @@ -130,7 +130,8 @@ def __setstate__(self, state): Parameters ---------- - state : state of the class + state : dict + State of the class """ self.__dict__ = state self.freshly_loaded = True From eecb40a79a2ff05e37046e4d8d5fd226c5bb3613 Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 21:03:43 +0000 Subject: [PATCH 05/14] show inheritance and undoc members --- docs/src/models/rpmodel.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/models/rpmodel.rst b/docs/src/models/rpmodel.rst index 47eba01262..c445c371e7 100644 --- a/docs/src/models/rpmodel.rst +++ b/docs/src/models/rpmodel.rst @@ -5,4 +5,5 @@ :synopsis: Random Projections :members: :inherited-members: - + :undoc-members: + :show-inheritance: From 9d16880390fe0ce8d71cd7fbc48a53f98b4eb3ee Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 21:08:39 +0000 Subject: [PATCH 06/14] show special members --- docs/src/models/rpmodel.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/models/rpmodel.rst b/docs/src/models/rpmodel.rst index c445c371e7..91ef71872a 100644 --- a/docs/src/models/rpmodel.rst +++ b/docs/src/models/rpmodel.rst @@ -7,3 +7,4 @@ :inherited-members: 
:undoc-members: :show-inheritance: + :special-members: __getitem__ From 0c389bb25540c78933478c20f36d56fcafe7b1ac Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 21:11:59 +0000 Subject: [PATCH 07/14] example is executable now --- gensim/models/rpmodel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index f7b5036f60..01428255fc 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -22,8 +22,13 @@ Examples: --------- ->>> from gensim.models import rpmmodel +>>> from gensim.models import RpModel +>>> from gensim.corpora import Dictionary +>>> from gensim.test.utils import common_texts +>>> dictionary = Dictionary(common_texts) +>>> corpus = [dictionary.doc2bow(text) for text in common_texts] >>> rp = RpModel(corpus) +>>> some_doc = dictionary.doc2bow(common_texts[0]) >>> print(rp[some_doc]) >>> rp.save('/tmp/foo.rp_model') """ From 7e13ad82cffbc2eed73291f003d275a2af007bcd Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 21:17:46 +0000 Subject: [PATCH 08/14] link to the paper added, named parameters --- gensim/models/rpmodel.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 01428255fc..c96a49fcd0 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -9,7 +9,8 @@ Objects of this class allow building and maintaining a model for Random Projections (also known as Random Indexing). -For theoretical background on RP, see: Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis." +For theoretical background on RP, see [1]_. +: Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis." The main methods are: @@ -31,6 +32,10 @@ >>> some_doc = dictionary.doc2bow(common_texts[0]) >>> print(rp[some_doc]) >>> rp.save('/tmp/foo.rp_model') + + +.. 
[1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis, https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf + """ import logging @@ -55,8 +60,13 @@ def __init__(self, corpus, id2word=None, num_topics=300): Parameters ---------- corpus : :class:`~gensim.interfaces.CorpusABC` + Iterable of documents + id2word : dict of (int, string) + Mapping from word ids (integers) to words (strings) + num_topics : int + Number of topics """ self.id2word = id2word @@ -75,6 +85,7 @@ def initialize(self, corpus): Parameters ---------- corpus : :class:`~interfaces.CorpusABC` + Iterable of documents """ if self.id2word is None: @@ -103,6 +114,7 @@ def __getitem__(self, bow): Parameters ---------- bow : :class:`~interfaces.CorpusABC` (iterable of documents) or list of (int, int). + Input document Examples: ------------- From 0832837ba72595ad3cbe33da810b4301cfcfc517 Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 21:26:34 +0000 Subject: [PATCH 09/14] fixed doc --- gensim/models/rpmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index c96a49fcd0..44e89507d0 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -116,8 +116,8 @@ def __getitem__(self, bow): bow : :class:`~interfaces.CorpusABC` (iterable of documents) or list of (int, int). 
Input document - Examples: - ------------- + Examples + ---------- >>> rp = RpModel(corpus) >>> print(rp[some_doc]) From fcfe8288faaa2f914fecb41b8851f79a2b513348 Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 21:33:00 +0000 Subject: [PATCH 10/14] fixed doc --- gensim/models/rpmodel.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 44e89507d0..616b9c0dff 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -113,12 +113,18 @@ def __getitem__(self, bow): Parameters ---------- - bow : :class:`~interfaces.CorpusABC` (iterable of documents) or list of (int, int). - Input document + bow : :class:`~interfaces.CorpusABC` + Input document is an iterable of documents or list of (int, int) Examples ---------- + >>> from gensim.models import RpModel + >>> from gensim.corpora import Dictionary + >>> from gensim.test.utils import common_texts + >>> dictionary = Dictionary(common_texts) + >>> corpus = [dictionary.doc2bow(text) for text in common_texts] >>> rp = RpModel(corpus) + >>> some_doc = dictionary.doc2bow(common_texts[0]) >>> print(rp[some_doc]) """ From fb3e133cfecce742ca9590b4e178979bff623b75 Mon Sep 17 00:00:00 2001 From: preich Date: Tue, 19 Dec 2017 23:33:04 +0000 Subject: [PATCH 11/14] fixed whitespaces --- gensim/models/rpmodel.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 616b9c0dff..3bce7016f8 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -23,7 +23,7 @@ Examples: --------- ->>> from gensim.models import RpModel +>>> from gensim.models import RpModel >>> from gensim.corpora import Dictionary >>> from gensim.test.utils import common_texts >>> dictionary = Dictionary(common_texts) @@ -34,7 +34,8 @@ >>> rp.save('/tmp/foo.rp_model') -.. 
[1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis, https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf +.. [1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis, + https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf """ @@ -59,7 +60,7 @@ def __init__(self, corpus, id2word=None, num_topics=300): Parameters ---------- - corpus : :class:`~gensim.interfaces.CorpusABC` + corpus : :class:`~gensim.interfaces.CorpusABC` Iterable of documents id2word : dict of (int, string) @@ -117,7 +118,7 @@ def __getitem__(self, bow): Input document is an iterable of documents or list of (int, int) Examples - ---------- + ---------- >>> from gensim.models import RpModel >>> from gensim.corpora import Dictionary >>> from gensim.test.utils import common_texts From 8132f5171d370fcd72418b0192655a740fc69cbc Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 11:23:05 +0500 Subject: [PATCH 12/14] fix docstrings & PEP8 --- gensim/models/rpmodel.py | 98 +++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 51 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 3bce7016f8..a7e6475595 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -5,37 +5,32 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Objects of this class allow building and maintaining a model for Random Projections -(also known as Random Indexing). - -For theoretical background on RP, see [1]_. -: Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis." - - The main methods are: - - 1. constructor, which creates the random projection matrix - 2. the [] method, which transforms a simple count representation into the TfIdf - space. +"""Random Projections (also known as Random Indexing). -Model persistency is achieved via its load/save methods. 
+For theoretical background on Random Projections, see [1]_. Examples: --------- >>> from gensim.models import RpModel >>> from gensim.corpora import Dictionary ->>> from gensim.test.utils import common_texts ->>> dictionary = Dictionary(common_texts) ->>> corpus = [dictionary.doc2bow(text) for text in common_texts] ->>> rp = RpModel(corpus) ->>> some_doc = dictionary.doc2bow(common_texts[0]) ->>> print(rp[some_doc]) ->>> rp.save('/tmp/foo.rp_model') - - -.. [1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis, - https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf +>>> from gensim.test.utils import common_texts, temporary_file +>>> +>>> dictionary = Dictionary(common_texts) # fit dictionary +>>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format +>>> +>>> model = RpModel(corpus, id2word=dictionary) # fit model +>>> result = model[corpus[3]] # apply model to document, result is vector in BoW format +>>> +>>> with temporary_file("model_file") as fname: +... model.save(fname) # save model to file +... loaded_model = RpModel.load(fname) # load model + + +References +---------- +.. [1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis, + https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf """ @@ -53,21 +48,17 @@ class RpModel(interfaces.TransformationABC): def __init__(self, corpus, id2word=None, num_topics=300): """ - `id2word` is a mapping from word ids (integers) to words (strings). It is - used to determine the vocabulary size, as well as for debugging and topic - printing. If not set, it will be determined from the corpus. - Parameters ---------- - corpus : :class:`~gensim.interfaces.CorpusABC` - Iterable of documents + corpus : iterable of iterable of (int, int) + Input corpus. 
- id2word : dict of (int, string) - Mapping from word ids (integers) to words (strings) + id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional + Mapping `token_id` -> `token`, will be determined from corpus if `id2word == None`. - num_topics : int - Number of topics + num_topics : int, optional + Number of topics. """ self.id2word = id2word @@ -79,14 +70,12 @@ def __str__(self): return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) def initialize(self, corpus): - """ - Initialize the random projection matrix. - + """Initialize the random projection matrix. Parameters ---------- - corpus : :class:`~interfaces.CorpusABC` - Iterable of documents + corpus : iterable of iterable of (int, int) + Input corpus. """ if self.id2word is None: @@ -109,24 +98,31 @@ def initialize(self, corpus): # are smarter and this is no longer needed? def __getitem__(self, bow): - """ - Return RP representation of the input vector and/or corpus. + """Get random-projection representation of the input vector or corpus. Parameters ---------- - bow : :class:`~interfaces.CorpusABC` - Input document is an iterable of documents or list of (int, int) + bow : {list of (int, int), iterable of list of (int, int)} + Input document or corpus. + + Returns + ------- + list of (int, float) + if `bow` is document OR + :class:`~gensim.interfaces.TransformedCorpus` + if `bow` is corpus. 
Examples ---------- >>> from gensim.models import RpModel >>> from gensim.corpora import Dictionary >>> from gensim.test.utils import common_texts - >>> dictionary = Dictionary(common_texts) - >>> corpus = [dictionary.doc2bow(text) for text in common_texts] - >>> rp = RpModel(corpus) - >>> some_doc = dictionary.doc2bow(common_texts[0]) - >>> print(rp[some_doc]) + >>> + >>> dictionary = Dictionary(common_texts) # fit dictionary + >>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format + >>> + >>> model = RpModel(corpus, id2word=dictionary) # fit model + >>> result = model[corpus[0]] # apply model to document, result is vector in BoW format, i.e. [(1, 0.3), ... ] """ # if the input vector is in fact a corpus, return a transformed corpus as result @@ -149,13 +145,13 @@ def __getitem__(self, bow): ] def __setstate__(self, state): - """ - Sets the internal state and updates freshly_loaded to True. Called when unpicked. + """Sets the internal state and updates freshly_loaded to True, called when unpickled. Parameters ---------- state : dict - State of the class + State of the class, + """ self.__dict__ = state self.freshly_loaded = True From a4b332ce0587754125299021ceefd4331ba80fc6 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 27 Dec 2017 11:23:36 +0500 Subject: [PATCH 13/14] fix docstrings --- gensim/models/rpmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index a7e6475595..edcbca8201 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -10,8 +10,8 @@ For theoretical background on Random Projections, see [1]_. 
-Examples: ---------- +Examples +-------- >>> from gensim.models import RpModel >>> from gensim.corpora import Dictionary >>> from gensim.test.utils import common_texts, temporary_file From e9a1a2438fc3414307475f1241732f63e89f8741 Mon Sep 17 00:00:00 2001 From: Menshikh Ivan Date: Wed, 27 Dec 2017 12:13:59 +0500 Subject: [PATCH 14/14] fix typo --- gensim/models/rpmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index edcbca8201..0826a7c359 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -150,7 +150,7 @@ def __setstate__(self, state): Parameters ---------- state : dict - State of the class, + State of the class. """ self.__dict__ = state