[WIP] Adding sklearn wrapper for LDA code #932
Merged
Commits (48)
All 48 commits are by AadityaJ:

- 08f417c  adding basic sklearn wrapper for LDA code
- 61a6f8c  updating changelog
- 66be324  adding test case,adding id2word,deleting showtopics
- cffa95b  adding relevant ipynb
- 10badc6  adding transfrom and other get methods and modifying print_topics
- 62a4d2f  stylizing code to follow conventions
- b7eff2d  removing redundant default argumen values
- 2a193fd  adding partial_fit
- a32f8dc  adding a line in test_sklearn_integration
- a048ddc  using LDAModel as Parent Class
- ac1d28e  adding docs, modifying getparam
- 0d6cc0a  changing class name.Adding comments
- 5d8c1a6  adding test case for update and transform
- 894784c  adding init
- 7a5ca4b  updating changes,fixed typo and changing file name
- b35baba  deleted base.py
- 13a136d  adding better testPartialFit method and minor changes due to change i…
- 682f045  change name of test class
- 9fda951  adding changes in classname to ipynb
- 380ea5f  Merge branch 'develop' into sklearn_lda
- e2485d4  Updating CHANGELOG.md
- 3015896  Updated Main Model. Added fit_predict to class for example
- a76eda4  added sklearn countvectorizer example to ipynb
- 97c1530  adding logistic regression example
- 20a63ac  adding if condition for csr_matrix to ldamodel
- c0b2c5c  adding check for fit csrmatrix also stylizing code
- bd656a8  Merge branch 'develop' into sklearn_lda
- d749ba0  minor bug.solved, fit should convert X to corpus
- 21119c5  removing fit_predict.adding csr_matrix check for update
- 14f984b  minor updates in ipynb
- a3895b5  adding rst file
- f832737  removed "basic" , added rst update to log
- bc352a0  changing indentation in texts
- 7cc39da  added file preamble, removed unnecessary space
- 0ba233c  following more pep8 conventions
- e23a8a4  removing unnecessary comments
- 041a32e  changing isinstance csr_matrix to issparse
- e7120f0  changed to hanging indentation
- 8a0950d  changing main filename
- bd8bced  changing module name in test
- bb5872b  updating ipynb with main filename
- 777576e  changed class name
- e50c3f9  changed file name
- e521269  fixing filename typo
- 51931fa  adding html file
- 7ba30d6  deleting html file
- 82d1fdc  vertical indentation fixes
- 4f3441e  adding file to apiref.rst
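Several of these commits deal with accepting scipy sparse input in `fit`/`update` ("adding if condition for csr_matrix to ldamodel", "adding check for fit csrmatrix", "changing isinstance csr_matrix to issparse", "minor bug.solved, fit should convert X to corpus"). Below is a minimal sketch of that kind of check, assuming sparse input comes from something like scikit-learn's `CountVectorizer` with documents as rows; it is an illustration only, not the PR's actual code, and the helper name is hypothetical.

```python
# Hypothetical helper illustrating the sparse-input handling described in the
# commit messages above; the name and layout assumptions are not from the PR.
from scipy.sparse import issparse
from gensim import matutils

def _to_gensim_corpus(X):
    """Accept a gensim corpus or a scipy sparse matrix and return a gensim corpus."""
    if issparse(X):
        # CountVectorizer puts documents in rows, while Sparse2Corpus expects
        # documents in columns by default, so transpose first.
        return matutils.Sparse2Corpus(X.transpose())
    return X
```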
Diff (changes from 1 commit):
@@ -26,7 +26,7 @@ class LdaModel(object):
     Base LDA module
     """
     def __init__(self, n_topics=5, n_iter=2000, alpha=0.1, eta=0.01, random_state=None,
-                 refresh=10,lda_model=None,id2word=None,passes=20,ex=None):
+                 refresh=10, lda_model=None, id2word=None, passes=20, ex=None):
         """
         base LDA code . Uses mapper function
         n_topics : num_topics
@@ -45,8 +45,8 @@ def __init__(self, n_topics=5, n_iter=2000, alpha=0.1, eta=0.01, random_state=No
         self.eta = eta
         self.random_state = random_state
         self.refresh = refresh
-        self.id2word=id2word
-        self.passes=passes
+        self.id2word = id2word
+        self.passes = passes
         # use lda_model variable as object
         self.lda_model = lda_model
         # perform appropriate checks
@@ -57,14 +57,15 @@ def __init__(self, n_topics=5, n_iter=2000, alpha=0.1, eta=0.01, random_state=No
 
     def get_params(self, deep=True):
         if deep:
-            return {"alpha": self.alpha, "n_iter": self.n_iter,"eta":self.eta,"random_state":self.random_state,"lda_model":self.lda_model,"id2word":self.id2word,"passes":self.passes}
+            return {"alpha": self.alpha, "n_iter": self.n_iter, "eta": self.eta, "random_state": self.random_state,
+                    "lda_model": self.lda_model, "id2word": self.id2word, "passes": self.passes}
 
     def set_params(self, **parameters):
         for parameter, value in parameters.items():
            self.setattr(parameter, value)
         return self
 
-    def fit(self,X,y=None):
+    def fit(self, X, y=None):
         """
         call gensim.model.LdaModel from this
         // todo: convert fit and relevant,corpus still requires gensim preprocessing
@@ -73,36 +74,37 @@ def fit(self,X,y=None):
         """
         if X is None:
             raise AttributeError("Corpus defined as none")
-        self.lda_model = gensim.models.LdaModel(corpus=X,num_topics=self.n_topics, id2word=self.id2word, passes=self.passes,
-                                                update_every=self.refresh,alpha=self.alpha, iterations=self.n_iter,
-                                                eta=self.eta,random_state=self.random_state)
-        return self.lda_model
+        self.lda_model = gensim.models.LdaModel(
+            corpus=X, num_topics=self.n_topics, id2word=self.id2word, passes=self.passes,
+            update_every=self.refresh, alpha=self.alpha, iterations=self.n_iter,
+            eta=self.eta, random_state=self.random_state)
+        return self.lda_model
 
-    def print_topics(self,n_topics=20,num_words=20,log=True):
+    def print_topics(self, n_topics=20, num_words=20, log=True):
         """
         print all the topics
         using the object lda_model
         """
-        return self.lda_model.show_topics(num_topics=n_topics,num_words=num_words,log=log)
+        return self.lda_model.show_topics(num_topics=n_topics, num_words=num_words, log=log)
 
     def transform(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
         """
         takes as an input a new document (bow) and
         Return topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
         """
-        return self.lda_model.get_document_topics(bow,minimum_probability=minimum_probability,minimum_phi_value=minimum_phi_value,
-                                                  per_word_topics=per_word_topics)
-    # might need to do more
-    def get_term_topics(self,wordid,minimum_probability=None):
+        return self.lda_model.get_document_topics(bow, minimum_probability=minimum_probability,
+                                                  minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics)
Review comment on the lines above: No vertical indent in gensim; we use hanging indent (see PEP8 for examples).
+
+    def get_term_topics(self, wordid, minimum_probability=None):
         """
         returns the most likely topic associated with a particular word
         use wordid or simply pass the word itself
         """
-        return self.lda_model.get_term_topics(wordid,minimum_probability=minimum_probability)
+        return self.lda_model.get_term_topics(wordid, minimum_probability=minimum_probability)
 
-    def get_topic_terms(self,topicid,topn=10):
+    def get_topic_terms(self, topicid, topn=10):
         """
         return a tuple of (wordid,probability) for given topic
         topn can be used to restrict
         """
-        return self.lda_model.get_topic_terms(topicid=topicid,topn=topn)
+        return self.lda_model.get_topic_terms(topicid=topicid, topn=topn)
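For orientation, here is a hypothetical usage sketch of the wrapper as it stands in this diff. The import path is left out because the module is renamed several times in this PR; the constructor arguments and method calls follow the code above.

```python
# Hypothetical usage sketch for the wrapper defined in the diff above.
# Assumes the wrapper class (called LdaModel, as in the diff) is already in
# scope; its final module path changes during this PR, so no import is shown.
from gensim.corpora import Dictionary

texts = [["human", "computer", "interaction"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

model = LdaModel(n_topics=2, n_iter=100, passes=10, id2word=id2word, random_state=1)
model.fit(corpus)                                    # builds a gensim.models.LdaModel internally
print(model.print_topics(n_topics=2, num_words=5))   # wraps show_topics()
print(model.transform(corpus[0]))                    # [(topic_id, probability), ...] for one document
print(model.get_topic_terms(topicid=0, topn=5))      # [(word_id, probability), ...] for one topic
```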
Review comment: Indent too large (should be a single level).
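The two indentation comments on this page point the same way: gensim's convention is a hanging indent, one level deep, rather than vertically aligned continuation lines. A small sketch of the difference, using a call like the one in fit() above (corpus and id2word as in the usage sketch earlier; illustration only):

```python
import gensim

# Vertically aligned continuation lines -- the style the reviewers flag:
lda = gensim.models.LdaModel(corpus=corpus, num_topics=5,
                             id2word=id2word, passes=20)

# Hanging indent, one level deep -- the style asked for (cf. PEP 8 examples):
lda = gensim.models.LdaModel(
    corpus=corpus, num_topics=5,
    id2word=id2word, passes=20)
```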