From 14c504967370f8bcf460b6d3549b75dbbaa10722 Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Thu, 23 Jun 2016 16:52:07 -0400 Subject: [PATCH 01/10] Added vowpalwabbit wrapper to notebook. --- docs/notebooks/topic_coherence_tutorial.ipynb | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb index b8487b2c30..6f8ae15519 100644 --- a/docs/notebooks/topic_coherence_tutorial.ipynb +++ b/docs/notebooks/topic_coherence_tutorial.ipynb @@ -38,6 +38,7 @@ "\n", "from gensim.models.coherencemodel import CoherenceModel\n", "from gensim.models.ldamodel import LdaModel\n", + "from gensim.models.wrappers import LdaVowpalWabbit\n", "from gensim.corpora.dictionary import Dictionary\n", "from numpy import array" ] @@ -632,6 +633,65 @@ "print badcm.get_coherence()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Support for wrappers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This API supports gensim's _ldavowpalwabbit_ wrapper as input parameter to `model`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model1 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=50)\n", + "model2 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "cm1 = CoherenceModel(model=model1, corpus=corpus, coherence='u_mass')\n", + "cm2 = CoherenceModel(model=model2, corpus=corpus, coherence='u_mass')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-14.075813889\n", + "-15.1740896045\n" + ] + } + ], + "source": [ + "print cm1.get_coherence()\n", + "print cm2.get_coherence()" + ] + }, { "cell_type": "markdown", "metadata": {}, From d746c907825b378318657fcf643714895407abf4 Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Fri, 24 Jun 2016 02:44:46 -0400 Subject: [PATCH 02/10] ldamallet support added. Notebook updated. --- docs/notebooks/topic_coherence_tutorial.ipynb | 49 ++++++++++++++++++- gensim/models/coherencemodel.py | 11 ++++- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb index 6f8ae15519..9cf4b19607 100644 --- a/docs/notebooks/topic_coherence_tutorial.ipynb +++ b/docs/notebooks/topic_coherence_tutorial.ipynb @@ -38,7 +38,7 @@ "\n", "from gensim.models.coherencemodel import CoherenceModel\n", "from gensim.models.ldamodel import LdaModel\n", - "from gensim.models.wrappers import LdaVowpalWabbit\n", + "from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet\n", "from gensim.corpora.dictionary import Dictionary\n", "from numpy import array" ] @@ -644,7 +644,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This API supports gensim's _ldavowpalwabbit_ wrapper as input parameter to `model`." + "This API supports gensim's _ldavowpalwabbit_ and _ldamallet_ wrappers as input parameter to `model`." ] }, { @@ -692,6 +692,51 @@ "print cm2.get_coherence()" ] }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model1 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=50)\n", + "model2 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "cm1 = CoherenceModel(model=model1, texts=texts, coherence='c_v')\n", + "cm2 = CoherenceModel(model=model2, texts=texts, coherence='c_v')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.581114877802\n", + "0.549865328265\n" + ] + } + ], + "source": [ + "print cm1.get_coherence()\n", + "print cm2.get_coherence()" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 8bfde8b082..7dfb406e4c 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -28,7 +28,7 @@ from gensim.matutils import argsort from gensim.utils import is_corpus from gensim.models.ldamodel import LdaModel -from gensim.models.wrappers import LdaVowpalWabbit +from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet logger = logging.getLogger(__name__) @@ -116,7 +116,7 @@ def __str__(self): def _get_topics(self): """Internal helper function to return topics from a trained topic model.""" - topics = [] # FIXME : Meant to work for LDAModel, LdaVowpalWabbit right now. Make it work for others. + topics = [] if isinstance(self.model, LdaModel): for topic in self.model.state.get_lambda(): bestn = argsort(topic, topn=10, reverse=True) @@ -125,6 +125,13 @@ def _get_topics(self): for topic in self.model._get_topics(): bestn = argsort(topic, topn=10, reverse=True) topics.append(bestn) + elif isinstance(self.model, LdaMallet): + for topic in self.model.wordtopics: + bestn = argsort(topic, topn=10, reverse=True) + topics.append(bestn) + else: + raise ValueError("This topic model is not currently supported. Supported topic models are" + "LdaModel, LdaVowpalWabbit and LdaMallet.") return topics def get_coherence(self): From 6fd15bcbd0d1d41e33f4821befdefc7dd6c13d42 Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Fri, 24 Jun 2016 12:58:32 -0400 Subject: [PATCH 03/10] made minor changes to documentation and code in coherencemodel. --- gensim/models/coherencemodel.py | 40 +++++++++---------- .../direct_confirmation_measure.py | 1 + .../indirect_confirmation_measure.py | 6 ++- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 7dfb406e4c..0170dbbbd8 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -26,7 +26,7 @@ aggregation) from gensim.corpora import Dictionary from gensim.matutils import argsort -from gensim.utils import is_corpus +from gensim.utils import is_corpus, FakeDict from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet @@ -52,45 +52,43 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c """ Args: ---- - model : Pre-trained topic model. + model : Pre-trained topic model. Should be provided irrespective of which coherence measure is being used. texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator. corpus : Gensim document corpus. - dictionary : Gensim dictionary mapping of id word to create corpus. + dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. + If both are provided, dictionary will be used. coherence : Coherence measure to be used. Supported values are: - u_mass - c_v + 'u_mass' + 'c_v' + For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary. + For 'c_v' texts should be provided. Corpus is not needed. """ if texts is None and corpus is None: raise ValueError("One of texts or corpus has to be provided.") + # Check if associated dictionary is provided. + if dictionary is None: + if isinstance(model.id2word, FakeDict): + raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model" + " should be set as the associated dictionary.") + else: + self.dictionary = model.id2word + else: + self.dictionary = dictionary + # Check for correct inputs for u_mass coherence measure. if coherence == 'u_mass': if is_corpus(corpus)[0]: - if dictionary is None: - if model.id2word[0] == 0: - raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model" - "should be set as the dictionary.") - else: - self.dictionary = model.id2word - else: - self.dictionary = dictionary self.corpus = corpus elif texts is not None: self.texts = texts - if dictionary is None: - self.dictionary = Dictionary(self.texts) - else: - self.dictionary = dictionary self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] else: raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence) - + # Check for correct inputs for c_v coherence measure. elif coherence == 'c_v': if texts is None: raise ValueError("'texts' should be provided for %s coherence." % coherence) else: self.texts = texts - self.dictionary = Dictionary(self.texts) - self.corpus = [self.dictionary.doc2bow(text) for text in self.texts] - else: raise ValueError("%s coherence is not currently supported." % coherence) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index eaa1b66841..9a783a472a 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -44,6 +44,7 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): def log_ratio_measure(segmented_topics, per_topic_postings, num_docs): """ + Popularly known as PMI. This function calculates the log-ratio-measure which is used by coherence measures such as c_v. This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index e41cb778f1..1af0dae8e8 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -59,10 +59,14 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam ---- topics : Topics obtained from the trained topic model. segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - per_topic_postings : per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. + per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics. measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). gamma : Gamma value for computing W', W* vectors. num_docs : Total number of documents in corresponding corpus. + + Returns: + ------- + s_cos_sim : array of cosine similarity of the context vectors for each segmentation """ if measure == 'nlr': measure = direct_confirmation_measure.normalized_log_ratio_measure From 8d9ad88cc98d71a5f2baf81bfb153fdc60ae728b Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Fri, 24 Jun 2016 17:19:04 -0400 Subject: [PATCH 04/10] Added test suite for coherencemodel and aggregation. --- gensim/test/test_aggregation.py | 28 +++++++++++ gensim/test/test_coherencemodel.py | 75 ++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 gensim/test/test_aggregation.py create mode 100644 gensim/test/test_coherencemodel.py diff --git a/gensim/test/test_aggregation.py b/gensim/test/test_aggregation.py new file mode 100644 index 0000000000..44e3d16f65 --- /dev/null +++ b/gensim/test/test_aggregation.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for checking transformation algorithms (the models package). +""" + +import logging +import unittest + +from gensim.topic_coherence import aggregation + +class TestAggregation(unittest.TestCase): + def setUp(self): + self.confirmed_measures = [1.1, 2.2, 3.3, 4.4] + + def testArithmeticMean(self): + """Test arithmetic_mean()""" + obtained = aggregation.arithmetic_mean(self.confirmed_measures) + expected = 2.75 + self.assertEqual(obtained, expected) + +if __name__ == '__main__': + logging.root.setLevel(logging.WARNING) + unittest.main() diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py new file mode 100644 index 0000000000..9291d33825 --- /dev/null +++ b/gensim/test/test_coherencemodel.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +""" +Automated tests for checking transformation algorithms (the models package). +""" + +import logging +import unittest +import os +import os.path +import tempfile + +import numpy as np + +from gensim.models.coherencemodel import CoherenceModel +from gensim.models.ldamodel import LdaModel +from gensim.models.wrappers import LdaMallet +from gensim.models.wrappers import LdaVowpalWabbit +from gensim.corpora.dictionary import Dictionary + +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + +# set up vars used in testing ("Deerwester" from the web tutorial) +texts = [['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey']] +dictionary = Dictionary(texts) +corpus = [dictionary.doc2bow(text) for text in texts] + + +def testfile(): + # temporary data will be stored to this file + return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') + +class TestCoherenceModel(unittest.TestCase): + def setUp(self): + np.random.seed(8) + self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1) # Bad lda model + self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50) # Good lda model + + def testUMassLdaModel(self): + """Test U_Mass topic coherence algorithm on LDA Model""" + cm1 = CoherenceModel(model=self.badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass') + cm2 = CoherenceModel(model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass') + self.assertTrue(cm1.get_coherence() < cm2.get_coherence()) + + def testCvLdaModel(self): + """Test C_v topic coherence algorithm on LDA Model""" + cm1 = CoherenceModel(model=self.badLdaModel, texts=texts, dictionary=dictionary, coherence='c_v') + cm2 = CoherenceModel(model=self.goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v') + self.assertTrue(cm1.get_coherence() < cm2.get_coherence()) + + def testErrors(self): + """Test if errors are raised on bad input""" + # not providing dictionary + self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, coherence='u_mass') + # not providing texts for c_v and instead providing corpus + self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='c_v') + # not providing corpus or texts for u_mass + self.assertRaises(ValueError, CoherenceModel, self.goodLdaModel, dictionary, 'u_mass') + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() From 45a5563ff1a6281fb2dbffe2df429704f7eb107e Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Tue, 28 Jun 2016 23:24:08 +0530 Subject: [PATCH 05/10] Added CHANGELOG entry. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30c025f67b..d20bd5efee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Changes - NOT BACKWARDS COMPATIBLE! * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113) * Implemented LsiModel.docs_processed attribute +* Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation. (@dsquareindia, #750) 0.13.1, 2016-06-22 From 779a9338c8f3a60f13c2d7903925a815b48eceb7 Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Thu, 30 Jun 2016 00:19:13 +0530 Subject: [PATCH 06/10] use random_state instead of seed. --- gensim/test/test_coherencemodel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 9291d33825..dba74bc9b4 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -45,9 +45,8 @@ def testfile(): class TestCoherenceModel(unittest.TestCase): def setUp(self): - np.random.seed(8) - self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1) # Bad lda model - self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50) # Good lda model + self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1, random_state=17) # Bad lda model + self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50, random_state=17) # Good lda model def testUMassLdaModel(self): """Test U_Mass topic coherence algorithm on LDA Model""" From 9f7e7ed4307be0413cccc6b81fab21fc4bfbfe7c Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Thu, 30 Jun 2016 10:04:41 +0530 Subject: [PATCH 07/10] added topics parameter to coherencemodel --- gensim/models/coherencemodel.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 0170dbbbd8..a9b3f5c973 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -30,6 +30,8 @@ from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet +import numpy as np + logger = logging.getLogger(__name__) @@ -48,11 +50,14 @@ class CoherenceModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c_v'): + def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, coherence='c_v'): """ Args: ---- - model : Pre-trained topic model. Should be provided irrespective of which coherence measure is being used. + model : Pre-trained topic model. Should be provided if topics is not provided. + topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. + eg. topics = [['human', 'machine', 'computer', 'interface'], + ['graph', 'trees', 'binary', 'widths']] texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator. corpus : Gensim document corpus. dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. @@ -63,6 +68,10 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary. For 'c_v' texts should be provided. Corpus is not needed. """ + if model is None and topics is None: + raise ValueError("One of model or topics has to be provided.") + elif topics is not None and dictionary is None: + raise ValueError("dictionary has to be provided if topics are to be used.") if texts is None and corpus is None: raise ValueError("One of texts or corpus has to be provided.") # Check if associated dictionary is provided. @@ -93,7 +102,15 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c raise ValueError("%s coherence is not currently supported." % coherence) self.model = model - self.topics = self._get_topics() + if model is not None: + self.topics = self._get_topics() + elif topics is not None: + self.topics = [] + for topic in topics: + t_i = [] + for t in range(len(topic)): + t_i.append(dictionary.token2id[topic[t]]) + self.topics.append(np.array(t_i)) self.coherence = coherence # Set pipeline parameters: if self.coherence == 'u_mass': From 4ca9a2c49a02c01944007b1cb4c044ee43fb4cae Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Thu, 30 Jun 2016 10:21:35 +0530 Subject: [PATCH 08/10] added parameter change to CHANGELOG --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d20bd5efee..b042efafe2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,8 @@ Changes - NOT BACKWARDS COMPATIBLE! * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113) * Implemented LsiModel.docs_processed attribute -* Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation. (@dsquareindia, #750) +* Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation. + Added `topics` parameter to coherencemodel. Can now provide tokenized topics to calculate coherence value (@dsquareindia, #750) 0.13.1, 2016-06-22 @@ -23,7 +24,7 @@ Changes * SparseMatrixSimilarity returns a sparse matrix if `maintain_sparsity` is True (@davechallis, #590) * added functionality for Topics of Words in document - i.e, dynamic topics. (@bhargavvader, #704) - also included tutorial which explains new functionalities, and document word-topic colring. -* Made normalization an explicit transformation. Added 'l1' norm support (@squareindia, #649) +* Made normalization an explicit transformation. Added 'l1' norm support (@dsquareindia, #649) * added term-topics API for most probable topic for word in vocab. (@bhargavvader, #706) * build_vocab takes progress_per parameter for smaller output (@zer0n, #624) * Control whether to use lowercase for computing word2vec accuracy. (@alantian, #607) From ad22da5219b3f14c8d604f0bc6bff0ece3b09cd7 Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Fri, 1 Jul 2016 16:26:04 +0530 Subject: [PATCH 09/10] Modified test. Changed wordtopics to word_topics. --- gensim/models/coherencemodel.py | 2 +- gensim/test/test_coherencemodel.py | 38 +++++++++++++++++------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index a9b3f5c973..2c60240b77 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -141,7 +141,7 @@ def _get_topics(self): bestn = argsort(topic, topn=10, reverse=True) topics.append(bestn) elif isinstance(self.model, LdaMallet): - for topic in self.model.wordtopics: + for topic in self.model.word_topics: bestn = argsort(topic, topn=10, reverse=True) topics.append(bestn) else: diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index dba74bc9b4..075ecd56d9 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -14,8 +14,6 @@ import os.path import tempfile -import numpy as np - from gensim.models.coherencemodel import CoherenceModel from gensim.models.ldamodel import LdaModel from gensim.models.wrappers import LdaMallet @@ -45,29 +43,35 @@ def testfile(): class TestCoherenceModel(unittest.TestCase): def setUp(self): - self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1, random_state=17) # Bad lda model - self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50, random_state=17) # Good lda model + # Suppose given below are the topics which two different LdaModels come up with. + # `topics1` is clearly better as it has a clear distinction between system-human + # interaction and graphs. Hence both the coherence measures for `topics1` should be + # greater. + self.topics1 = [['human', 'computer', 'system', 'interface'], + ['graph', 'minors', 'trees', 'eps']] + self.topics2 = [['user', 'graph', 'minors', 'system'], + ['time', 'graph', 'survey', 'minors']] - def testUMassLdaModel(self): - """Test U_Mass topic coherence algorithm on LDA Model""" - cm1 = CoherenceModel(model=self.badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass') - cm2 = CoherenceModel(model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass') - self.assertTrue(cm1.get_coherence() < cm2.get_coherence()) + def testUMass(self): + """Test U_Mass topic coherence algorithm on given topics""" + cm1 = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + cm2 = CoherenceModel(topics=self.topics2, corpus=corpus, dictionary=dictionary, coherence='u_mass') + self.assertTrue(cm1.get_coherence() > cm2.get_coherence()) - def testCvLdaModel(self): - """Test C_v topic coherence algorithm on LDA Model""" - cm1 = CoherenceModel(model=self.badLdaModel, texts=texts, dictionary=dictionary, coherence='c_v') - cm2 = CoherenceModel(model=self.goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v') - self.assertTrue(cm1.get_coherence() < cm2.get_coherence()) + def testCv(self): + """Test C_v topic coherence algorithm on given topics""" + cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_v') + cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v') + self.assertTrue(cm1.get_coherence() > cm2.get_coherence()) def testErrors(self): """Test if errors are raised on bad input""" # not providing dictionary - self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, coherence='u_mass') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, coherence='u_mass') # not providing texts for c_v and instead providing corpus - self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='c_v') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='c_v') # not providing corpus or texts for u_mass - self.assertRaises(ValueError, CoherenceModel, self.goodLdaModel, dictionary, 'u_mass') + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, dictionary=dictionary, coherence='u_mass') if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From 336e10c434cc620265975304fd69d6f1d5987f83 Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Fri, 1 Jul 2016 23:42:19 +0530 Subject: [PATCH 10/10] Added topics param documentation to coherencemodel. Full test suite for coherencemodel added. --- gensim/models/coherencemodel.py | 11 ++++- gensim/test/test_coherencemodel.py | 79 +++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 2c60240b77..615e4efacc 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -24,7 +24,6 @@ from gensim.topic_coherence import (segmentation, probability_estimation, direct_confirmation_measure, indirect_confirmation_measure, aggregation) -from gensim.corpora import Dictionary from gensim.matutils import argsort from gensim.utils import is_corpus, FakeDict from gensim.models.ldamodel import LdaModel @@ -45,9 +44,17 @@ class CoherenceModel(interfaces.TransformationABC): 1. constructor, which initializes the four stage pipeline by accepting a coherence measure, 2. the ``get_coherence()`` method, which returns the topic coherence. + One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly + provided if the model does not contain a dictionary already. >>> cm = CoherenceModel(model=tm, corpus=corpus, coherence='u_mass') # tm is the trained topic model >>> cm.get_coherence() + Another way of using this feature is through providing tokenized topics such as: + >>> topics = [['human', 'computer', 'system', 'interface'], + ['graph', 'minors', 'trees', 'eps']] + >>> cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass') # note that a dictionary has to be provided. + >>> cm.get_coherence() + Model persistency is achieved via its load/save methods. """ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, coherence='c_v'): @@ -57,7 +64,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= model : Pre-trained topic model. Should be provided if topics is not provided. topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. eg. topics = [['human', 'machine', 'computer', 'interface'], - ['graph', 'trees', 'binary', 'widths']] + ['graph', 'trees', 'binary', 'widths']] texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator. corpus : Gensim document corpus. dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed. diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 075ecd56d9..057f73d01d 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -20,7 +20,7 @@ from gensim.models.wrappers import LdaVowpalWabbit from gensim.corpora.dictionary import Dictionary -module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) # set up vars used in testing ("Deerwester" from the web tutorial) @@ -51,6 +51,19 @@ def setUp(self): ['graph', 'minors', 'trees', 'eps']] self.topics2 = [['user', 'graph', 'minors', 'system'], ['time', 'graph', 'survey', 'minors']] + self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=0, iterations=0) + mallet_home = os.environ.get('MALLET_HOME', None) + self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None + if self.mallet_path: + self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=corpus, id2word=dictionary, num_topics=2, iterations=0) + vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) + if not vw_path: + msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model" + logging.info(msg) + self.vw_path = None + else: + self.vw_path = vw_path + self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus, id2word=dictionary, num_topics=2, passes=0) def testUMass(self): """Test U_Mass topic coherence algorithm on given topics""" @@ -64,6 +77,56 @@ def testCv(self): cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v') self.assertTrue(cm1.get_coherence() > cm2.get_coherence()) + def testUMassLdaModel(self): + """Perform sanity check to see if u_mass coherence works with LDA Model""" + try: + cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass') + except: + raise + + def testCvLdaModel(self): + """Perform sanity check to see if c_v coherence works with LDA Model""" + try: + cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_v') + except: + raise + + def testUMassMalletModel(self): + """Perform sanity check to see if u_mass coherence works with LDA Mallet gensim wrapper""" + if not self.mallet_path: + return + try: + cm = CoherenceModel(model=self.malletmodel, corpus=corpus, coherence='u_mass') + except: + raise + + def testCvMalletModel(self): + """Perform sanity check to see if c_v coherence works with LDA Mallet gensim wrapper""" + if not self.mallet_path: + return + try: + cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_v') + except: + raise + + def testUMassVWModel(self): + """Perform sanity check to see if u_mass coherence works with LDA VW gensim wrapper""" + if not self.vw_path: + return + try: + cm = CoherenceModel(model=self.vwmodel, corpus=corpus, coherence='u_mass') + except: + raise + + def testCvVWModel(self): + """Perform sanity check to see if c_v coherence works with LDA VW gensim wrapper""" + if not self.vw_path: + return + try: + cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_v') + except: + raise + def testErrors(self): """Test if errors are raised on bad input""" # not providing dictionary @@ -73,6 +136,20 @@ def testErrors(self): # not providing corpus or texts for u_mass self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, dictionary=dictionary, coherence='u_mass') + def testPersistence(self): + fname = testfile() + model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + + def testPersistenceCompressed(self): + fname = testfile() + '.gz' + model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main()