Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Adding sklearn wrapper for LDA code #932

Merged
merged 48 commits into from
Jan 29, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
08f417c
adding basic sklearn wrapper for LDA code
AadityaJ Oct 10, 2016
61a6f8c
updating changelog
AadityaJ Oct 11, 2016
66be324
adding test case,adding id2word,deleting showtopics
AadityaJ Oct 16, 2016
cffa95b
adding relevant ipynb
AadityaJ Oct 16, 2016
10badc6
adding transfrom and other get methods and modifying print_topics
AadityaJ Oct 20, 2016
62a4d2f
stylizing code to follow conventions
AadityaJ Oct 21, 2016
b7eff2d
removing redundant default argumen values
AadityaJ Oct 21, 2016
2a193fd
adding partial_fit
AadityaJ Oct 23, 2016
a32f8dc
adding a line in test_sklearn_integration
AadityaJ Dec 9, 2016
a048ddc
using LDAModel as Parent Class
AadityaJ Dec 14, 2016
ac1d28e
adding docs, modifying getparam
AadityaJ Dec 18, 2016
0d6cc0a
changing class name.Adding comments
AadityaJ Dec 19, 2016
5d8c1a6
adding test case for update and transform
AadityaJ Dec 24, 2016
894784c
adding init
AadityaJ Dec 24, 2016
7a5ca4b
updating changes,fixed typo and changing file name
AadityaJ Dec 26, 2016
b35baba
deleted base.py
AadityaJ Dec 26, 2016
13a136d
adding better testPartialFit method and minor changes due to change i…
AadityaJ Dec 26, 2016
682f045
change name of test class
AadityaJ Dec 30, 2016
9fda951
adding changes in classname to ipynb
AadityaJ Dec 30, 2016
380ea5f
Merge branch 'develop' into sklearn_lda
AadityaJ Dec 30, 2016
e2485d4
Updating CHANGELOG.md
AadityaJ Dec 31, 2016
3015896
Updated Main Model. Added fit_predict to class for example
AadityaJ Dec 31, 2016
a76eda4
added sklearn countvectorizer example to ipynb
AadityaJ Dec 31, 2016
97c1530
adding logistic regression example
AadityaJ Jan 4, 2017
20a63ac
adding if condition for csr_matrix to ldamodel
AadityaJ Jan 4, 2017
c0b2c5c
adding check for fit csrmatrix also stylizing code
AadityaJ Jan 4, 2017
bd656a8
Merge branch 'develop' into sklearn_lda
AadityaJ Jan 5, 2017
d749ba0
minor bug.solved, fit should convert X to corpus
AadityaJ Jan 5, 2017
21119c5
removing fit_predict.adding csr_matrix check for update
AadityaJ Jan 6, 2017
14f984b
minor updates in ipynb
AadityaJ Jan 6, 2017
a3895b5
adding rst file
AadityaJ Jan 6, 2017
f832737
removed "basic" , added rst update to log
AadityaJ Jan 6, 2017
bc352a0
changing indentation in texts
AadityaJ Jan 6, 2017
7cc39da
added file preamble, removed unnecessary space
AadityaJ Jan 6, 2017
0ba233c
following more pep8 conventions
AadityaJ Jan 6, 2017
e23a8a4
removing unnecessary comments
AadityaJ Jan 6, 2017
041a32e
changing isinstance csr_matrix to issparse
AadityaJ Jan 7, 2017
e7120f0
changed to hanging indentation
AadityaJ Jan 8, 2017
8a0950d
changing main filename
AadityaJ Jan 8, 2017
bd8bced
changing module name in test
AadityaJ Jan 8, 2017
bb5872b
updating ipynb with main filename
AadityaJ Jan 8, 2017
777576e
changed class name
AadityaJ Jan 8, 2017
e50c3f9
changed file name
AadityaJ Jan 8, 2017
e521269
fixing filename typo
AadityaJ Jan 8, 2017
51931fa
adding html file
AadityaJ Jan 8, 2017
7ba30d6
deleting html file
AadityaJ Jan 8, 2017
82d1fdc
vertical indentation fixes
AadityaJ Jan 8, 2017
4f3441e
adding file to apiref.rst
AadityaJ Jan 10, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 20 additions & 18 deletions gensim/sklearn_integration/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class LdaModel(object):
Base LDA module
"""
def __init__(self, n_topics=5, n_iter=2000, alpha=0.1, eta=0.01, random_state=None,
refresh=10,lda_model=None,id2word=None,passes=20,ex=None):
refresh=10, lda_model=None, id2word=None, passes=20, ex=None):
"""
base LDA code . Uses mapper function
n_topics : num_topics
Expand All @@ -45,8 +45,8 @@ def __init__(self, n_topics=5, n_iter=2000, alpha=0.1, eta=0.01, random_state=No
self.eta = eta
self.random_state = random_state
self.refresh = refresh
self.id2word=id2word
self.passes=passes
self.id2word = id2word
self.passes = passes
# use lda_model variable as object
self.lda_model = lda_model
# perform appropriate checks
Expand All @@ -57,14 +57,15 @@ def __init__(self, n_topics=5, n_iter=2000, alpha=0.1, eta=0.01, random_state=No

def get_params(self, deep=True):
    """
    Return the estimator parameters as a dict mapping parameter name to current value.

    deep : accepted for scikit-learn API compatibility. The same dict is returned for
           both True and False, so callers passing deep=False never receive None.
    """
    # NOTE(review): n_topics and refresh are constructor arguments but are not reported here;
    # confirm whether they should be included before relying on this for cloning.
    return {
        "alpha": self.alpha, "n_iter": self.n_iter, "eta": self.eta, "random_state": self.random_state,
        "lda_model": self.lda_model, "id2word": self.id2word, "passes": self.passes,
    }

def set_params(self, **parameters):
    """
    Set each given keyword argument as an attribute of this object.

    parameters : attribute name -> new value pairs.
    Returns self, so that calls can be chained.
    """
    for parameter, value in parameters.items():
        # plain objects have no `setattr` method; the builtin must be used
        setattr(self, parameter, value)
    return self

def fit(self,X,y=None):
def fit(self, X, y=None):
"""
call gensim.model.LdaModel from this
// todo: convert fit and relevant,corpus still requires gensim preprocessing
Expand All @@ -73,36 +74,37 @@ def fit(self,X,y=None):
"""
if X is None:
raise AttributeError("Corpus defined as none")
self.lda_model = gensim.models.LdaModel(corpus=X,num_topics=self.n_topics, id2word=self.id2word, passes=self.passes,
update_every=self.refresh,alpha=self.alpha, iterations=self.n_iter,
eta=self.eta,random_state=self.random_state)
return self.lda_model
self.lda_model = gensim.models.LdaModel(
corpus=X, num_topics=self.n_topics, id2word=self.id2word, passes=self.passes,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indent too large (should be a single level).

update_every=self.refresh, alpha=self.alpha, iterations=self.n_iter,
eta=self.eta, random_state=self.random_state)
return self.lda_model

def print_topics(self, n_topics=20, num_words=20, log=True):
    """
    Delegate to `show_topics` of the wrapped `lda_model` object and return its result.

    n_topics  : forwarded as `num_topics`
    num_words : forwarded as `num_words`
    log       : forwarded as `log`
    """
    topics = self.lda_model.show_topics(num_topics=n_topics, num_words=num_words, log=log)
    return topics

def transform(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
    """
    Pass a document (bow) to `get_document_topics` of the wrapped `lda_model` object
    and return whatever that call returns.

    The three optional arguments are forwarded unchanged as keyword arguments.
    """
    options = {
        "minimum_probability": minimum_probability,
        "minimum_phi_value": minimum_phi_value,
        "per_word_topics": per_word_topics,
    }
    return self.lda_model.get_document_topics(bow, **options)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No vertical indent in gensim; we use hanging indent (see PEP8 for examples).


def get_term_topics(self, wordid, minimum_probability=None):
    """
    Delegate to `get_term_topics` of the wrapped `lda_model` object and return its result.

    wordid              : forwarded positionally
    minimum_probability : forwarded as a keyword argument
    """
    model = self.lda_model
    return model.get_term_topics(wordid, minimum_probability=minimum_probability)

def get_topic_terms(self, topicid, topn=10):
    """
    Delegate to `get_topic_terms` of the wrapped `lda_model` object and return its result.

    topicid : forwarded as the `topicid` keyword argument
    topn    : forwarded as the `topn` keyword argument
    """
    model = self.lda_model
    return model.get_topic_terms(topicid=topicid, topn=topn)
4 changes: 2 additions & 2 deletions gensim/test/test_sklearn_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
corpus = [dictionary.doc2bow(text) for text in texts]


class TestLdaModel:
class TestLdaModel(object):
def __init__(self):
self.model=base.LdaModel(id2word=dictionary,n_topics=2,passes=100)
self.model.fit(corpus)
Expand All @@ -29,4 +29,4 @@ def testPrintTopic(self):
self.assertTrue(isinstance(v, float))

if __name__ == '__main__':
unittest.main()
unittest.main()