Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Changes in sklearn wrappers for LDA and LSI models #1398

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
173fe1f
updated LSI wrapper
chinmayapancholi13 Jun 7, 2017
d86d52d
updated LDA wrapper
chinmayapancholi13 Jun 7, 2017
1344aa1
correction in 'init' call in 'fit'
chinmayapancholi13 Jun 7, 2017
c8f8fd4
removed 'corpus' param from 'init' in LDA model
chinmayapancholi13 Jun 8, 2017
ff6aab1
removed 'corpus' param from 'init' in LSI model
chinmayapancholi13 Jun 8, 2017
61eb5cb
changed docstring for 'fit' method
chinmayapancholi13 Jun 8, 2017
3c0174c
refactored code for LDA and LSI wrappers
chinmayapancholi13 Jun 14, 2017
9e4d29b
replaced 'self.model' by 'self.gensim_model'
chinmayapancholi13 Jun 15, 2017
b13669a
updated 'testCSRMatrixConversion' test
chinmayapancholi13 Jun 15, 2017
7d557bf
updated 'self.__model' to 'self.gensim_model' for LSI wrapper
chinmayapancholi13 Jun 15, 2017
018acc0
fixed 'testTransform' test for LDA and LSI
chinmayapancholi13 Jun 15, 2017
1583645
updated 'transform' and 'partial_fit' functions
chinmayapancholi13 Jun 16, 2017
9ed7ac9
added 'testPersistence' and 'testModelNotFitted' tests
chinmayapancholi13 Jun 16, 2017
e303a38
added newline at end of files
chinmayapancholi13 Jun 16, 2017
7b05a61
added example for 'docs' for 'transform' function in docstring
chinmayapancholi13 Jun 16, 2017
04714b6
replaced 'text_lda' variable with 'text_lsi'
chinmayapancholi13 Jun 18, 2017
635d9a4
updated 'testPersistence' test for LDA and LSI models
chinmayapancholi13 Jun 19, 2017
9c8ac42
updated 'testPartialFit' tests
chinmayapancholi13 Jun 19, 2017
dd31014
set fixed seed for LDA and LSI model tests
chinmayapancholi13 Jun 19, 2017
520bd75
updated 'testPartialFit' test for LDA and LSI models
chinmayapancholi13 Jun 19, 2017
34a6d14
Merge branch 'develop' into lda_lsi_wrapper_changes
menshikh-iv Jun 20, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 33 additions & 39 deletions gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,37 @@
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
#

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
follows on scikit learn API conventions
"""

import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models
from gensim import matutils
from gensim.sklearn_integration import base_sklearn_wrapper
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator


class SklearnWrapperLdaModel(models.LdaModel, base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator):
class SklLdaModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator):
"""
Base LDA module
"""

def __init__(
self, corpus=None, num_topics=100, id2word=None,
self, num_topics=100, id2word=None,
chunksize=2000, passes=1, update_every=1,
alpha='symmetric', eta=None, decay=0.5, offset=1.0,
eval_every=10, iterations=50, gamma_threshold=0.001,
minimum_probability=0.01, random_state=None):
"""
Sklearn wrapper for LDA model. derived class for gensim.model.LdaModel .
"""
self.corpus = corpus
self.gensim_model = None
self.num_topics = num_topics
self.id2word = id2word
self.chunksize = chunksize
Expand All @@ -46,82 +48,66 @@ def __init__(
self.gamma_threshold = gamma_threshold
self.minimum_probability = minimum_probability
self.random_state = random_state
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove corpus parameter from constructor (you pass a corpus only for fit* methods) in both models.

# if no fit function is used , then corpus is given in init
if self.corpus:
models.LdaModel.__init__(
self, corpus=self.corpus, num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations,
gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability,
random_state=self.random_state)

def get_params(self, deep=True):
"""
Returns all parameters as dictionary.
"""
return {"corpus": self.corpus, "num_topics": self.num_topics, "id2word": self.id2word,
"chunksize": self.chunksize, "passes": self.passes,
"update_every": self.update_every, "alpha": self.alpha, "eta": self.eta, "decay": self.decay,
"offset": self.offset, "eval_every": self.eval_every, "iterations": self.iterations,
return {"num_topics": self.num_topics, "id2word": self.id2word, "chunksize": self.chunksize,
"passes": self.passes, "update_every": self.update_every, "alpha": self.alpha, "eta": self.eta,
"decay": self.decay, "offset": self.offset, "eval_every": self.eval_every, "iterations": self.iterations,
"gamma_threshold": self.gamma_threshold, "minimum_probability": self.minimum_probability,
"random_state": self.random_state}

def set_params(self, **parameters):
"""
Set all parameters.
"""
super(SklearnWrapperLdaModel, self).set_params(**parameters)
super(SklLdaModel, self).set_params(**parameters)

def fit(self, X, y=None):
"""
For fitting corpus into the class object.
Calls gensim.model.LdaModel:
>>> gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=passes, update_every=update_every, alpha=alpha, iterations=iterations, eta=eta, random_state=random_state)
Fit the model according to the given training data.
Calls gensim.models.LdaModel
"""
if sparse.issparse(X):
self.corpus = matutils.Sparse2Corpus(X)
corpus = matutils.Sparse2Corpus(X)
else:
self.corpus = X
corpus = X

models.LdaModel.__init__(
self, corpus=self.corpus, num_topics=self.num_topics, id2word=self.id2word,
self.gensim_model = models.LdaModel(corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations,
gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability,
random_state=self.random_state)
return self

def transform(self, docs, minimum_probability=None):
def transform(self, docs):
"""
Takes as an list of input a documents (documents).
Returns matrix of topic distribution for the given document bow, where a_ij
indicates (topic_i, topic_probability_j).
The input `docs` should be in BOW format and can be a list of documents like : [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ]
or a single document like : [(4, 1), (7, 1)]
"""
if self.gensim_model is None:
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for _ in range(0, len(docs))]

for k, v in enumerate(docs):
doc_topics = self.get_document_topics(v, minimum_probability=minimum_probability)
doc_topics = self.gensim_model[v]
probs_docs = list(map(lambda x: x[1], doc_topics))
# Everything should be equal in length
if len(probs_docs) != self.num_topics:
probs_docs.extend([1e-12]*(self.num_topics - len(probs_docs)))
X[k] = probs_docs
return np.reshape(np.array(X), (len(docs), self.num_topics))

def get_topic_dist(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
"""
Takes as an input a new document (bow).
Returns the topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
"""
return self.get_document_topics(
bow, minimum_probability=minimum_probability,
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)

def partial_fit(self, X):
"""
Train model over X.
Expand All @@ -134,4 +120,12 @@ def partial_fit(self, X):
if sparse.issparse(X):
X = matutils.Sparse2Corpus(X)

self.update(corpus=X)
if self.gensim_model is None:
self.gensim_model = models.LdaModel(num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold,
minimum_probability=self.minimum_probability, random_state=self.random_state)

self.gensim_model.update(corpus=X)
return self
51 changes: 29 additions & 22 deletions gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,33 @@
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
#

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
Follows scikit-learn API conventions
"""

import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models
from gensim import matutils
from gensim.sklearn_integration import base_sklearn_wrapper
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator


class SklearnWrapperLsiModel(models.LsiModel, base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator):
class SklLsiModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator):
"""
Base LSI module
"""

def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
def __init__(self, num_topics=200, id2word=None, chunksize=20000,
decay=1.0, onepass=True, power_iters=2, extra_samples=100):
"""
Sklearn wrapper for LSI model. Class derived from gensim.model.LsiModel.
"""
self.corpus = corpus
self.gensim_model = None
self.num_topics = num_topics
self.id2word = id2word
self.chunksize = chunksize
Expand All @@ -36,52 +38,51 @@ def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
self.extra_samples = extra_samples
self.power_iters = power_iters

# if 'fit' function is not used, then 'corpus' is given in init
if self.corpus:
models.LsiModel.__init__(self, corpus=self.corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)

def get_params(self, deep=True):
"""
Returns all parameters as dictionary.
"""
return {"corpus": self.corpus, "num_topics": self.num_topics, "id2word": self.id2word,
return {"num_topics": self.num_topics, "id2word": self.id2word,
"chunksize": self.chunksize, "decay": self.decay, "onepass": self.onepass,
"extra_samples": self.extra_samples, "power_iters": self.power_iters}

def set_params(self, **parameters):
"""
Set all parameters.
"""
super(SklearnWrapperLsiModel, self).set_params(**parameters)
super(SklLsiModel, self).set_params(**parameters)

def fit(self, X, y=None):
"""
For fitting corpus into the class object.
Calls gensim.model.LsiModel:
>>>gensim.models.LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word, chunksize=chunksize, decay=decay, onepass=onepass, power_iters=power_iters, extra_samples=extra_samples)
Fit the model according to the given training data.
Calls gensim.models.LsiModel
"""
if sparse.issparse(X):
self.corpus = matutils.Sparse2Corpus(X)
corpus = matutils.Sparse2Corpus(X)
else:
self.corpus = X
corpus = X

models.LsiModel.__init__(self, corpus=self.corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)
self.gensim_model = models.LsiModel(corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)
return self

def transform(self, docs):
"""
Takes a list of documents as input ('docs').
Returns a matrix of topic distribution for the given document bow, where a_ij
indicates (topic_i, topic_probability_j).
The input `docs` should be in BOW format and can be a list of documents like : [ [(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)] ]
or a single document like : [(4, 1), (7, 1)]
"""
if self.gensim_model is None:
raise NotFittedError("This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method.")

# The input as array of array
check = lambda x: [x] if isinstance(x[0], tuple) else x
docs = check(docs)
X = [[] for i in range(0,len(docs))];
for k,v in enumerate(docs):
doc_topics = self[v]
doc_topics = self.gensim_model[v]
probs_docs = list(map(lambda x: x[1], doc_topics))
# Everything should be equal in length
if len(probs_docs) != self.num_topics:
Expand All @@ -96,4 +97,10 @@ def partial_fit(self, X):
"""
if sparse.issparse(X):
X = matutils.Sparse2Corpus(X)
self.add_documents(corpus=X)

if self.gensim_model is None:
self.gensim_model = models.LsiModel(num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples)

self.gensim_model.add_documents(corpus=X)
return self
Loading