From 14c504967370f8bcf460b6d3549b75dbbaa10722 Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Thu, 23 Jun 2016 16:52:07 -0400
Subject: [PATCH 01/10] Added vowpalwabbit wrapper to notebook.

---
 docs/notebooks/topic_coherence_tutorial.ipynb | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb
index b8487b2c30..6f8ae15519 100644
--- a/docs/notebooks/topic_coherence_tutorial.ipynb
+++ b/docs/notebooks/topic_coherence_tutorial.ipynb
@@ -38,6 +38,7 @@
     "\n",
     "from gensim.models.coherencemodel import CoherenceModel\n",
     "from gensim.models.ldamodel import LdaModel\n",
+    "from gensim.models.wrappers import LdaVowpalWabbit\n",
     "from gensim.corpora.dictionary import Dictionary\n",
     "from numpy import array"
    ]
@@ -632,6 +633,65 @@
     "print badcm.get_coherence()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Support for wrappers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This API supports gensim's _ldavowpalwabbit_ wrapper as input parameter to `model`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model1 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=50)\n",
+    "model2 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "cm1 = CoherenceModel(model=model1, corpus=corpus, coherence='u_mass')\n",
+    "cm2 = CoherenceModel(model=model2, corpus=corpus, coherence='u_mass')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-14.075813889\n",
+      "-15.1740896045\n"
+     ]
+    }
+   ],
+   "source": [
+    "print cm1.get_coherence()\n",
+    "print cm2.get_coherence()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

From d746c907825b378318657fcf643714895407abf4 Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Fri, 24 Jun 2016 02:44:46 -0400
Subject: [PATCH 02/10] ldamallet support added. Notebook updated.

---
 docs/notebooks/topic_coherence_tutorial.ipynb | 49 ++++++++++++++++++-
 gensim/models/coherencemodel.py               | 11 ++++-
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb
index 6f8ae15519..9cf4b19607 100644
--- a/docs/notebooks/topic_coherence_tutorial.ipynb
+++ b/docs/notebooks/topic_coherence_tutorial.ipynb
@@ -38,7 +38,7 @@
     "\n",
     "from gensim.models.coherencemodel import CoherenceModel\n",
     "from gensim.models.ldamodel import LdaModel\n",
-    "from gensim.models.wrappers import LdaVowpalWabbit\n",
+    "from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet\n",
     "from gensim.corpora.dictionary import Dictionary\n",
     "from numpy import array"
    ]
@@ -644,7 +644,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This API supports gensim's _ldavowpalwabbit_ wrapper as input parameter to `model`."
+    "This API supports gensim's _ldavowpalwabbit_ and _ldamallet_ wrappers as input to the `model` parameter."
    ]
   },
   {
@@ -692,6 +692,51 @@
     "print cm2.get_coherence()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model1 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=50)\n",
+    "model2 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "cm1 = CoherenceModel(model=model1, texts=texts, coherence='c_v')\n",
+    "cm2 = CoherenceModel(model=model2, texts=texts, coherence='c_v')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.581114877802\n",
+      "0.549865328265\n"
+     ]
+    }
+   ],
+   "source": [
+    "print cm1.get_coherence()\n",
+    "print cm2.get_coherence()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index 8bfde8b082..7dfb406e4c 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -28,7 +28,7 @@
 from gensim.matutils import argsort
 from gensim.utils import is_corpus
 from gensim.models.ldamodel import LdaModel
-from gensim.models.wrappers import LdaVowpalWabbit
+from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
 
 logger = logging.getLogger(__name__)
 
@@ -116,7 +116,7 @@ def __str__(self):
 
     def _get_topics(self):
         """Internal helper function to return topics from a trained topic model."""
-        topics = []  # FIXME : Meant to work for LDAModel, LdaVowpalWabbit right now. Make it work for others.
+        topics = []
         if isinstance(self.model, LdaModel):
             for topic in self.model.state.get_lambda():
                 bestn = argsort(topic, topn=10, reverse=True)
@@ -125,6 +125,13 @@ def _get_topics(self):
             for topic in self.model._get_topics():
                 bestn = argsort(topic, topn=10, reverse=True)
                 topics.append(bestn)
+        elif isinstance(self.model, LdaMallet):
+            for topic in self.model.wordtopics:
+                bestn = argsort(topic, topn=10, reverse=True)
+                topics.append(bestn)
+        else:
+            raise ValueError("This topic model is not currently supported. Supported topic models are "
+                             "LdaModel, LdaVowpalWabbit and LdaMallet.")
         return topics
 
     def get_coherence(self):

From 6fd15bcbd0d1d41e33f4821befdefc7dd6c13d42 Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Fri, 24 Jun 2016 12:58:32 -0400
Subject: [PATCH 03/10] made minor changes to documentation and code in
 coherencemodel.

---
 gensim/models/coherencemodel.py               | 40 +++++++++----------
 .../direct_confirmation_measure.py            |  1 +
 .../indirect_confirmation_measure.py          |  6 ++-
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index 7dfb406e4c..0170dbbbd8 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -26,7 +26,7 @@
                                     aggregation)
 from gensim.corpora import Dictionary
 from gensim.matutils import argsort
-from gensim.utils import is_corpus
+from gensim.utils import is_corpus, FakeDict
 from gensim.models.ldamodel import LdaModel
 from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
 
@@ -52,45 +52,43 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c
         """
         Args:
         ----
-        model : Pre-trained topic model.
+        model : Pre-trained topic model. Should be provided irrespective of which coherence measure is being used.
         texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
         corpus : Gensim document corpus.
-        dictionary : Gensim dictionary mapping of id word to create corpus.
+        dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed.
+                     If both are provided, dictionary will be used.
         coherence : Coherence measure to be used. Supported values are:
-                    u_mass
-                    c_v
+                    'u_mass'
+                    'c_v'
+                    For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
+                    For 'c_v' texts should be provided. Corpus is not needed.
         """
         if texts is None and corpus is None:
             raise ValueError("One of texts or corpus has to be provided.")
+        # Check if associated dictionary is provided.
+        if dictionary is None:
+            if isinstance(model.id2word, FakeDict):
+                raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
+                                 " should be set as the associated dictionary.")
+            else:
+                self.dictionary = model.id2word
+        else:
+            self.dictionary = dictionary
+        # Check for correct inputs for u_mass coherence measure.
         if coherence == 'u_mass':
             if is_corpus(corpus)[0]:
-                if dictionary is None:
-                    if model.id2word[0] == 0:
-                        raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
-                                         "should be set as the dictionary.")
-                    else:
-                        self.dictionary = model.id2word
-                else:
-                    self.dictionary = dictionary
                 self.corpus = corpus
             elif texts is not None:
                 self.texts = texts
-                if dictionary is None:
-                    self.dictionary = Dictionary(self.texts)
-                else:
-                    self.dictionary = dictionary
                 self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
             else:
                 raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)
-
+        # Check for correct inputs for c_v coherence measure.
         elif coherence == 'c_v':
             if texts is None:
                 raise ValueError("'texts' should be provided for %s coherence." % coherence)
             else:
                 self.texts = texts
-                self.dictionary = Dictionary(self.texts)
-                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
-
         else:
             raise ValueError("%s coherence is not currently supported." % coherence)
 
diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py
index eaa1b66841..9a783a472a 100644
--- a/gensim/topic_coherence/direct_confirmation_measure.py
+++ b/gensim/topic_coherence/direct_confirmation_measure.py
@@ -44,6 +44,7 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
 
 def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
     """
+    Popularly known as PMI (pointwise mutual information).
     This function calculates the log-ratio-measure which is used by
     coherence measures such as c_v.
     This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py
index e41cb778f1..1af0dae8e8 100644
--- a/gensim/topic_coherence/indirect_confirmation_measure.py
+++ b/gensim/topic_coherence/indirect_confirmation_measure.py
@@ -59,10 +59,14 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam
     ----
     topics : Topics obtained from the trained topic model.
     segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
-    per_topic_postings : per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
+    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
     measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
     gamma : Gamma value for computing W', W* vectors.
     num_docs : Total number of documents in corresponding corpus.
+
+    Returns:
+    -------
+    s_cos_sim : array of cosine similarity of the context vectors for each segmentation
     """
     if measure == 'nlr':
         measure = direct_confirmation_measure.normalized_log_ratio_measure

From 8d9ad88cc98d71a5f2baf81bfb153fdc60ae728b Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Fri, 24 Jun 2016 17:19:04 -0400
Subject: [PATCH 04/10] Added test suite for coherencemodel and aggregation.

---
 gensim/test/test_aggregation.py    | 28 +++++++++++
 gensim/test/test_coherencemodel.py | 75 ++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 gensim/test/test_aggregation.py
 create mode 100644 gensim/test/test_coherencemodel.py

diff --git a/gensim/test/test_aggregation.py b/gensim/test/test_aggregation.py
new file mode 100644
index 0000000000..44e3d16f65
--- /dev/null
+++ b/gensim/test/test_aggregation.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking the aggregation functions of the topic_coherence package.
+"""
+
+import logging
+import unittest
+
+from gensim.topic_coherence import aggregation
+
+class TestAggregation(unittest.TestCase):
+    def setUp(self):
+        self.confirmed_measures = [1.1, 2.2, 3.3, 4.4]
+
+    def testArithmeticMean(self):
+        """Test arithmetic_mean()"""
+        obtained = aggregation.arithmetic_mean(self.confirmed_measures)
+        expected = 2.75
+        self.assertEqual(obtained, expected)
+
+if __name__ == '__main__':
+    logging.root.setLevel(logging.WARNING)
+    unittest.main()
diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
new file mode 100644
index 0000000000..9291d33825
--- /dev/null
+++ b/gensim/test/test_coherencemodel.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking the topic coherence model (gensim.models.coherencemodel).
+"""
+
+import logging
+import unittest
+import os
+import os.path
+import tempfile
+
+import numpy as np
+
+from gensim.models.coherencemodel import CoherenceModel
+from gensim.models.ldamodel import LdaModel
+from gensim.models.wrappers import LdaMallet
+from gensim.models.wrappers import LdaVowpalWabbit
+from gensim.corpora.dictionary import Dictionary
+
+module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
+datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+# set up vars used in testing ("Deerwester" from the web tutorial)
+texts = [['human', 'interface', 'computer'],
+         ['survey', 'user', 'computer', 'system', 'response', 'time'],
+         ['eps', 'user', 'interface', 'system'],
+         ['system', 'human', 'system', 'eps'],
+         ['user', 'response', 'time'],
+         ['trees'],
+         ['graph', 'trees'],
+         ['graph', 'minors', 'trees'],
+         ['graph', 'minors', 'survey']]
+dictionary = Dictionary(texts)
+corpus = [dictionary.doc2bow(text) for text in texts]
+
+
+def testfile():
+    # temporary data will be stored to this file
+    return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
+
+class TestCoherenceModel(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(8)
+        self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1)  # Bad lda model
+        self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50)  # Good lda model
+
+    def testUMassLdaModel(self):
+        """Test U_Mass topic coherence algorithm on LDA Model"""
+        cm1 = CoherenceModel(model=self.badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
+        cm2 = CoherenceModel(model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
+        self.assertTrue(cm1.get_coherence() < cm2.get_coherence())
+
+    def testCvLdaModel(self):
+        """Test C_v topic coherence algorithm on LDA Model"""
+        cm1 = CoherenceModel(model=self.badLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')
+        cm2 = CoherenceModel(model=self.goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')
+        self.assertTrue(cm1.get_coherence() < cm2.get_coherence())
+
+    def testErrors(self):
+        """Test if errors are raised on bad input"""
+        # not providing dictionary
+        self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, coherence='u_mass')
+        # not providing texts for c_v and instead providing corpus
+        self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='c_v')
+        # not providing corpus or texts for u_mass
+        self.assertRaises(ValueError, CoherenceModel, self.goodLdaModel, dictionary, 'u_mass')
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()

From 45a5563ff1a6281fb2dbffe2df429704f7eb107e Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Tue, 28 Jun 2016 23:24:08 +0530
Subject: [PATCH 05/10] Added CHANGELOG entry.

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30c025f67b..d20bd5efee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ Changes
   - NOT BACKWARDS COMPATIBLE!
 * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)
 * Implemented LsiModel.docs_processed attribute
+* Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation. (@dsquareindia, #750)
 
 0.13.1, 2016-06-22
 

From 779a9338c8f3a60f13c2d7903925a815b48eceb7 Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Thu, 30 Jun 2016 00:19:13 +0530
Subject: [PATCH 06/10] use random_state instead of seed.

---
 gensim/test/test_coherencemodel.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
index 9291d33825..dba74bc9b4 100644
--- a/gensim/test/test_coherencemodel.py
+++ b/gensim/test/test_coherencemodel.py
@@ -45,9 +45,8 @@ def testfile():
 
 class TestCoherenceModel(unittest.TestCase):
     def setUp(self):
-        np.random.seed(8)
-        self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1)  # Bad lda model
-        self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50)  # Good lda model
+        self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1, random_state=17)  # Bad lda model
+        self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50, random_state=17)  # Good lda model
 
     def testUMassLdaModel(self):
         """Test U_Mass topic coherence algorithm on LDA Model"""

From 9f7e7ed4307be0413cccc6b81fab21fc4bfbfe7c Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Thu, 30 Jun 2016 10:04:41 +0530
Subject: [PATCH 07/10] added topics parameter to coherencemodel

---
 gensim/models/coherencemodel.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index 0170dbbbd8..a9b3f5c973 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -30,6 +30,8 @@
 from gensim.models.ldamodel import LdaModel
 from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
 
+import numpy as np
+
 logger = logging.getLogger(__name__)
 
 
@@ -48,11 +50,14 @@ class CoherenceModel(interfaces.TransformationABC):
 
     Model persistency is achieved via its load/save methods.
     """
-    def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c_v'):
+    def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, coherence='c_v'):
         """
         Args:
         ----
-        model : Pre-trained topic model. Should be provided irrespective of which coherence measure is being used.
+        model : Pre-trained topic model. Should be provided if topics is not provided.
+        topics : List of tokenized topics. If this is preferred over model, dictionary should be provided.
+                 eg. topics = [['human', 'machine', 'computer', 'interface'],
+                                ['graph', 'trees', 'binary', 'widths']]
         texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
         corpus : Gensim document corpus.
         dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed.
@@ -63,6 +68,10 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c
                     For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
                     For 'c_v' texts should be provided. Corpus is not needed.
         """
+        if model is None and topics is None:
+            raise ValueError("One of model or topics has to be provided.")
+        elif topics is not None and dictionary is None:
+            raise ValueError("dictionary has to be provided if topics are to be used.")
         if texts is None and corpus is None:
             raise ValueError("One of texts or corpus has to be provided.")
         # Check if associated dictionary is provided.
@@ -93,7 +102,15 @@ def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c
             raise ValueError("%s coherence is not currently supported." % coherence)
 
         self.model = model
-        self.topics = self._get_topics()
+        if model is not None:
+            self.topics = self._get_topics()
+        elif topics is not None:
+            self.topics = []
+            for topic in topics:
+                t_i = []
+                for t in range(len(topic)):
+                    t_i.append(dictionary.token2id[topic[t]])
+                self.topics.append(np.array(t_i))
         self.coherence = coherence
         # Set pipeline parameters:
         if self.coherence == 'u_mass':

From 4ca9a2c49a02c01944007b1cb4c044ee43fb4cae Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Thu, 30 Jun 2016 10:21:35 +0530
Subject: [PATCH 08/10] added parameter change to CHANGELOG

---
 CHANGELOG.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d20bd5efee..b042efafe2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,8 @@ Changes
   - NOT BACKWARDS COMPATIBLE!
 * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)
 * Implemented LsiModel.docs_processed attribute
-* Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation. (@dsquareindia, #750)
+* Added LdaMallet support to CoherenceModel. Added LdaVowpalWabbit and LdaMallet examples to the topic coherence notebook. Added test suites for coherencemodel and aggregation.
+  Added `topics` parameter to CoherenceModel, so tokenized topics can be provided directly to calculate the coherence value (@dsquareindia, #750)
 
 0.13.1, 2016-06-22
 
@@ -23,7 +24,7 @@ Changes
 * SparseMatrixSimilarity returns a sparse matrix if `maintain_sparsity` is True (@davechallis, #590)
 * added functionality for Topics of Words in document - i.e, dynamic topics. (@bhargavvader, #704)
   - also included tutorial which explains new functionalities, and document word-topic colring.
-* Made normalization an explicit transformation. Added 'l1' norm support (@squareindia, #649)
+* Made normalization an explicit transformation. Added 'l1' norm support (@dsquareindia, #649)
 * added term-topics API for most probable topic for word in vocab. (@bhargavvader, #706)
 * build_vocab takes progress_per parameter for smaller output (@zer0n, #624)
 * Control whether to use lowercase for computing word2vec accuracy. (@alantian, #607)

From ad22da5219b3f14c8d604f0bc6bff0ece3b09cd7 Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Fri, 1 Jul 2016 16:26:04 +0530
Subject: [PATCH 09/10] Modified test. Changed wordtopics to word_topics.

---
 gensim/models/coherencemodel.py    |  2 +-
 gensim/test/test_coherencemodel.py | 38 +++++++++++++++++-------------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index a9b3f5c973..2c60240b77 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -141,7 +141,7 @@ def _get_topics(self):
                 bestn = argsort(topic, topn=10, reverse=True)
                 topics.append(bestn)
         elif isinstance(self.model, LdaMallet):
-            for topic in self.model.wordtopics:
+            for topic in self.model.word_topics:
                 bestn = argsort(topic, topn=10, reverse=True)
                 topics.append(bestn)
         else:
diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
index dba74bc9b4..075ecd56d9 100644
--- a/gensim/test/test_coherencemodel.py
+++ b/gensim/test/test_coherencemodel.py
@@ -14,8 +14,6 @@
 import os.path
 import tempfile
 
-import numpy as np
-
 from gensim.models.coherencemodel import CoherenceModel
 from gensim.models.ldamodel import LdaModel
 from gensim.models.wrappers import LdaMallet
@@ -45,29 +43,35 @@ def testfile():
 
 class TestCoherenceModel(unittest.TestCase):
     def setUp(self):
-        self.badLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=1, random_state=17)  # Bad lda model
-        self.goodLdaModel = LdaModel(corpus=corpus, num_topics=2, passes=50, random_state=17)  # Good lda model
+        # Suppose given below are the topics which two different LdaModels come up with.
+        # `topics1` is clearly better as it has a clear distinction between system-human
+        # interaction and graphs. Hence both the coherence measures for `topics1` should be
+        # greater.
+        self.topics1 = [['human', 'computer', 'system', 'interface'],
+                        ['graph', 'minors', 'trees', 'eps']]
+        self.topics2 = [['user', 'graph', 'minors', 'system'],
+                        ['time', 'graph', 'survey', 'minors']]
 
-    def testUMassLdaModel(self):
-        """Test U_Mass topic coherence algorithm on LDA Model"""
-        cm1 = CoherenceModel(model=self.badLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
-        cm2 = CoherenceModel(model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
-        self.assertTrue(cm1.get_coherence() < cm2.get_coherence())
+    def testUMass(self):
+        """Test U_Mass topic coherence algorithm on given topics"""
+        cm1 = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass')
+        cm2 = CoherenceModel(topics=self.topics2, corpus=corpus, dictionary=dictionary, coherence='u_mass')
+        self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
 
-    def testCvLdaModel(self):
-        """Test C_v topic coherence algorithm on LDA Model"""
-        cm1 = CoherenceModel(model=self.badLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')
-        cm2 = CoherenceModel(model=self.goodLdaModel, texts=texts, dictionary=dictionary, coherence='c_v')
-        self.assertTrue(cm1.get_coherence() < cm2.get_coherence())
+    def testCv(self):
+        """Test C_v topic coherence algorithm on given topics"""
+        cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_v')
+        cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v')
+        self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
 
     def testErrors(self):
         """Test if errors are raised on bad input"""
         # not providing dictionary
-        self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, coherence='u_mass')
+        self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, coherence='u_mass')
         # not providing texts for c_v and instead providing corpus
-        self.assertRaises(ValueError, CoherenceModel, model=self.goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='c_v')
+        self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='c_v')
         # not providing corpus or texts for u_mass
-        self.assertRaises(ValueError, CoherenceModel, self.goodLdaModel, dictionary, 'u_mass')
+        self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, dictionary=dictionary, coherence='u_mass')
 
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

From 336e10c434cc620265975304fd69d6f1d5987f83 Mon Sep 17 00:00:00 2001
From: dsquareindia <ashu.9412@gmail.com>
Date: Fri, 1 Jul 2016 23:42:19 +0530
Subject: [PATCH 10/10] Added topics param documentation to coherencemodel.
 Full test suite for coherencemodel added.

---
 gensim/models/coherencemodel.py    | 11 ++++-
 gensim/test/test_coherencemodel.py | 79 +++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index 2c60240b77..615e4efacc 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -24,7 +24,6 @@
 from gensim.topic_coherence import (segmentation, probability_estimation,
                                     direct_confirmation_measure, indirect_confirmation_measure,
                                     aggregation)
-from gensim.corpora import Dictionary
 from gensim.matutils import argsort
 from gensim.utils import is_corpus, FakeDict
 from gensim.models.ldamodel import LdaModel
@@ -45,9 +44,17 @@ class CoherenceModel(interfaces.TransformationABC):
     1. constructor, which initializes the four stage pipeline by accepting a coherence measure,
     2. the ``get_coherence()`` method, which returns the topic coherence.
 
+    One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly
+    provided if the model does not contain a dictionary already.
     >>> cm = CoherenceModel(model=tm, corpus=corpus, coherence='u_mass')  # tm is the trained topic model
     >>> cm.get_coherence()
 
+    Another way of using this feature is through providing tokenized topics such as:
+    >>> topics = [['human', 'computer', 'system', 'interface'],
+                  ['graph', 'minors', 'trees', 'eps']]
+    >>> cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass') # note that a dictionary has to be provided.
+    >>> cm.get_coherence()
+
     Model persistency is achieved via its load/save methods.
     """
     def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, coherence='c_v'):
@@ -57,7 +64,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
         model : Pre-trained topic model. Should be provided if topics is not provided.
         topics : List of tokenized topics. If this is preferred over model, dictionary should be provided.
                  eg. topics = [['human', 'machine', 'computer', 'interface'],
-                                ['graph', 'trees', 'binary', 'widths']]
+                               ['graph', 'trees', 'binary', 'widths']]
         texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
         corpus : Gensim document corpus.
         dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed.
diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
index 075ecd56d9..057f73d01d 100644
--- a/gensim/test/test_coherencemodel.py
+++ b/gensim/test/test_coherencemodel.py
@@ -20,7 +20,7 @@
 from gensim.models.wrappers import LdaVowpalWabbit
 from gensim.corpora.dictionary import Dictionary
 
-module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
+module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
 datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
 
 # set up vars used in testing ("Deerwester" from the web tutorial)
@@ -51,6 +51,19 @@ def setUp(self):
                         ['graph', 'minors', 'trees', 'eps']]
         self.topics2 = [['user', 'graph', 'minors', 'system'],
                         ['time', 'graph', 'survey', 'minors']]
+        self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=0, iterations=0)
+        mallet_home = os.environ.get('MALLET_HOME', None)
+        self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None
+        if self.mallet_path:
+            self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=corpus, id2word=dictionary, num_topics=2, iterations=0)
+        vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
+        if not vw_path:
+            msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
+            logging.info(msg)
+            self.vw_path = None
+        else:
+            self.vw_path = vw_path
+            self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus, id2word=dictionary, num_topics=2, passes=0)
 
     def testUMass(self):
         """Test U_Mass topic coherence algorithm on given topics"""
@@ -64,6 +77,56 @@ def testCv(self):
         cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v')
         self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
 
+    def testUMassLdaModel(self):
+        """Perform sanity check to see if u_mass coherence works with LDA Model"""
+        # The model from setUp is built with passes=0 and iterations=0, so no coherence
+        # value is asserted here. Construction is the whole check: any exception
+        # propagates and fails the test, so a re-raising try/except adds nothing.
+        CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass')
+
+    def testCvLdaModel(self):
+        """Perform sanity check to see if c_v coherence works with LDA Model"""
+        # The model from setUp is built with passes=0 and iterations=0, so no coherence
+        # value is asserted here. Construction is the whole check: any exception
+        # propagates and fails the test, so a re-raising try/except adds nothing.
+        CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_v')
+
+    def testUMassMalletModel(self):
+        """Perform sanity check to see if u_mass coherence works with LDA Mallet gensim wrapper"""
+        if not self.mallet_path:
+            # MALLET_HOME was unset, so setUp created no self.malletmodel; nothing to check.
+            return
+        # Construction is the whole check: any exception propagates and fails the
+        # test, so a re-raising try/except adds nothing.
+        CoherenceModel(model=self.malletmodel, corpus=corpus, coherence='u_mass')
+
+    def testCvMalletModel(self):
+        """Perform sanity check to see if c_v coherence works with LDA Mallet gensim wrapper"""
+        if not self.mallet_path:
+            # MALLET_HOME was unset, so setUp created no self.malletmodel; nothing to check.
+            return
+        # Construction is the whole check: any exception propagates and fails the
+        # test, so a re-raising try/except adds nothing.
+        CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_v')
+
+    def testUMassVWModel(self):
+        """Perform sanity check to see if u_mass coherence works with LDA VW gensim wrapper"""
+        if not self.vw_path:
+            # VOWPAL_WABBIT_PATH was unset, so setUp created no self.vwmodel; nothing to check.
+            return
+        # Construction is the whole check: any exception propagates and fails the
+        # test, so a re-raising try/except adds nothing.
+        CoherenceModel(model=self.vwmodel, corpus=corpus, coherence='u_mass')
+
+    def testCvVWModel(self):
+        """Perform sanity check to see if c_v coherence works with LDA VW gensim wrapper"""
+        if not self.vw_path:
+            # VOWPAL_WABBIT_PATH was unset, so setUp created no self.vwmodel; nothing to check.
+            return
+        # Construction is the whole check: any exception propagates and fails the
+        # test, so a re-raising try/except adds nothing.
+        CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_v')
+
     def testErrors(self):
         """Test if errors are raised on bad input"""
         # not providing dictionary
@@ -73,6 +136,20 @@ def testErrors(self):
         # not providing corpus or texts for u_mass
         self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, dictionary=dictionary, coherence='u_mass')
 
+    def testPersistence(self):
+        """Test that a model saved to disk and loaded back gives the same u_mass coherence"""
+        fname = testfile()
+        model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass')
+        model.save(fname)
+        self.assertEqual(model.get_coherence(), CoherenceModel.load(fname).get_coherence())
+
+    def testPersistenceCompressed(self):
+        """Test that a model saved to a '.gz' path and loaded back gives the same u_mass coherence"""
+        fname = testfile() + '.gz'
+        model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass')
+        model.save(fname)
+        self.assertEqual(model.get_coherence(), CoherenceModel.load(fname).get_coherence())
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()