From 3160deab494eb63de9665efe682db7fd068d17a0 Mon Sep 17 00:00:00 2001 From: Pete Date: Thu, 23 Nov 2017 15:28:08 +0000 Subject: [PATCH 01/26] Added Montemurro and Zanette's entropy-based keyword extraction algorithm --- gensim/summarization/__init__.py | 3 +- gensim/summarization/mz_entropy.py | 69 ++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 gensim/summarization/mz_entropy.py diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py index 57c9a7c815..851a7ba975 100644 --- a/gensim/summarization/__init__.py +++ b/gensim/summarization/__init__.py @@ -1,4 +1,5 @@ # bring model classes directly into package namespace, to save some typing from .summarizer import summarize, summarize_corpus -from .keywords import keywords \ No newline at end of file +from .keywords import keywords +from .mz_entropy import mz_entropy \ No newline at end of file diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py new file mode 100644 index 0000000000..a40688f5b9 --- /dev/null +++ b/gensim/summarization/mz_entropy.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + + +from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word +from gensim.utils import to_unicode +import numpy +import scipy + +def mz_keywords(text,blocksize=1024,scores=False,split=False,weighted=True,threshold=0.0): + """Extract keywords from text using the Montemurro and Zanette entropy algorithm. 
+ https://arxiv.org/abs/0907.1558 + :param text: str (document to summarize) + :param blocksize: int (size of blocks to use in analysis) + :params scores: bool (return score with keywords) + :params split: bool (return results as list) + :params weighted: bool (weight scores by word frequency) + :params threshold: float or 'auto' (minimum score for returned keywords)""" + text=to_unicode(text) + words=_tokenize_by_word(text) + vocab=sorted(set(words)) + wordcounts=numpy.array([[words[i:i+blocksize].count(word) for word in vocab] + for i in range(0,len(words),blocksize)]) + nblocks=wordcounts.shape[0] + totals=wordcounts.sum(axis=0) + nwords=totals.sum() + p=wordcounts/totals + logp=numpy.nan_to_num(numpy.log2(p),0.0) + H=logp.sum(axis=0) + + def log_combinations(n,m): + """Calculates the logarithm of n!/m!(n-m)!""" + return -(numpy.log(n+1)+scipy.special.betaln(n-m+1,m+1)) + + def marginal_prob(n,m): + """Marginal probability of a word that occurs n times in the document + occurring m times in a given block""" + return numpy.exp(log_combinations(n,m) + +log_combinations(nwords-n,blocksize-m) + -log_combinations(nwords,blocksize)) + + marginal=numpy.frompyfunc(marginal_prob,2,1) + + def analytic_entropy(n): + """Predicted entropy for a word that occurs n times in the document""" + m=numpy.arange(1,min(blocksize,n)+1) + p=m/n + elements=p*numpy.nan_to_num(numpy.log2(p))*marginal(n,m) + return -nblocks*elements.sum() + + analytic=numpy.frompyfunc(analytic_entropy,1,1) + + H+=analytic(totals) + if weighted: + H*=totals/nwords + if threshold=='auto': + threshold=nblocks/(nblocks+1.0) + weights=[(word,score) + for (word,score) in zip(vocab,H) + if score>threshold] + weights.sort(key=lambda x:-x[1]) + result= weights if scores else [word for (word,score) in weights] + if not (scores or split): + result='\n'.join(result) + return result + + \ No newline at end of file From 0550651764eebdfb7fbdcb84e983524e0e777b5e Mon Sep 17 00:00:00 2001 From: Pete Date: Thu, 23 Nov 2017 
15:44:57 +0000 Subject: [PATCH 02/26] Improved Docstrings --- gensim/summarization/mz_entropy.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index a40688f5b9..11ec3b563d 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -16,8 +16,12 @@ def mz_keywords(text,blocksize=1024,scores=False,split=False,weighted=True,thres :param blocksize: int (size of blocks to use in analysis) :params scores: bool (return score with keywords) :params split: bool (return results as list) - :params weighted: bool (weight scores by word frequency) - :params threshold: float or 'auto' (minimum score for returned keywords)""" + :params weighted: bool (weight scores by word frequency. + False is useful for shorter texts) + :params threshold: float or 'auto' (minimum score for returned keywords + 'auto' calculates the threshold as + nblocks/(nblocks+1.0) + Use 'auto' with weighted=False)""" text=to_unicode(text) words=_tokenize_by_word(text) vocab=sorted(set(words)) From a072f9373e871cf73d71a3d29e3c51c94431353e Mon Sep 17 00:00:00 2001 From: Pete Date: Thu, 23 Nov 2017 17:15:19 +0000 Subject: [PATCH 03/26] Fixed numerical bugs due to zero frequencies --- gensim/summarization/__init__.py | 2 +- gensim/summarization/mz_entropy.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/gensim/summarization/__init__.py b/gensim/summarization/__init__.py index 851a7ba975..1313f95402 100644 --- a/gensim/summarization/__init__.py +++ b/gensim/summarization/__init__.py @@ -2,4 +2,4 @@ # bring model classes directly into package namespace, to save some typing from .summarizer import summarize, summarize_corpus from .keywords import keywords -from .mz_entropy import mz_entropy \ No newline at end of file +from .mz_entropy import mz_keywords \ No newline at end of file diff --git a/gensim/summarization/mz_entropy.py 
b/gensim/summarization/mz_entropy.py index 11ec3b563d..8f990b68e6 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -23,16 +23,16 @@ def mz_keywords(text,blocksize=1024,scores=False,split=False,weighted=True,thres nblocks/(nblocks+1.0) Use 'auto' with weighted=False)""" text=to_unicode(text) - words=_tokenize_by_word(text) + words=[word for word in _tokenize_by_word(text)] vocab=sorted(set(words)) wordcounts=numpy.array([[words[i:i+blocksize].count(word) for word in vocab] - for i in range(0,len(words),blocksize)]) + for i in range(0,len(words),blocksize)]).astype('d') nblocks=wordcounts.shape[0] totals=wordcounts.sum(axis=0) nwords=totals.sum() p=wordcounts/totals - logp=numpy.nan_to_num(numpy.log2(p),0.0) - H=logp.sum(axis=0) + logp=numpy.log2(p) + H=numpy.nan_to_num((p*logp),0.0).sum(axis=0) def log_combinations(n,m): """Calculates the logarithm of n!/m!(n-m)!""" @@ -49,14 +49,13 @@ def marginal_prob(n,m): def analytic_entropy(n): """Predicted entropy for a word that occurs n times in the document""" - m=numpy.arange(1,min(blocksize,n)+1) + m=numpy.arange(1,min(blocksize,n)+1).astype('d') p=m/n - elements=p*numpy.nan_to_num(numpy.log2(p))*marginal(n,m) + elements=p*numpy.nan_to_num(numpy.log2(p),0.0)*marginal(n,m) return -nblocks*elements.sum() analytic=numpy.frompyfunc(analytic_entropy,1,1) - - H+=analytic(totals) + H+=analytic(totals).astype('d') if weighted: H*=totals/nwords if threshold=='auto': From c8a3792e76881eec7fd0e7be8150ed80a8e05834 Mon Sep 17 00:00:00 2001 From: Pete Date: Mon, 27 Nov 2017 10:46:38 +0000 Subject: [PATCH 04/26] Coding style changes, test and tutorial --- .../summarization_tutorial-checkpoint.ipynb | 405 +++ docs/notebooks/summarization_tutorial.ipynb | 2377 ++++++++++++++--- gensim/summarization/mz_entropy.py | 155 +- gensim/test/test_summarization.py | 18 +- 4 files changed, 2537 insertions(+), 418 deletions(-) create mode 100644 
docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb diff --git a/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb new file mode 100644 index 0000000000..3e4c3f1302 --- /dev/null +++ b/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Tutorial: automatic summarization using Gensim

\n", + "\n", + "This module automatically summarizes the given text, by extracting one or more important sentences from the text. In a similar way, it can also extract keywords. This tutorial will teach you to use this summarization module via some examples. First, we will try a small example, then we will try two larger ones, and then we will review the performance of the summarizer in terms of speed.\n", + "\n", + "This summarizer is based on the \"TextRank\" algorithm, from an [article](http://web.eecs.umich.edu/%7Emihalcea/papers/mihalcea.emnlp04.pdf) by Mihalcea et al. This algorithm was later improved upon by Barrios et al. in another [article](https://raw.githubusercontent.com/summanlp/docs/master/articulo/articulo-en.pdf), by introducing something called a \"BM25 ranking function\". \n", + "\n", + "This tutorial assumes that you are familiar with Python and have [installed Gensim](http://radimrehurek.com/gensim/install.html).\n", + "\n", + "Note: Gensim's summarization only works for English for now, because the text is pre-processed so that stopwords are removed and the words are stemmed, and these processes are language-dependent.\n", + "\n", + "\n", + "

Small example

\n", + "\n", + "First of all, we import the function \"summarize\"." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", + "\n", + "from gensim.summarization import summarize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input text:\n", + "Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion. \n" + ] + } + ], + "source": [ + "text = \"Thomas A. Anderson is a man living two lives. By day he is an \" + \\\n", + " \"average computer programmer and by night a hacker known as \" + \\\n", + " \"Neo. Neo has always questioned his reality, but the truth is \" + \\\n", + " \"far beyond his imagination. 
Neo finds himself targeted by the \" + \\\n", + " \"police when he is contacted by Morpheus, a legendary computer \" + \\\n", + " \"hacker branded a terrorist by the government. Morpheus awakens \" + \\\n", + " \"Neo to the real world, a ravaged wasteland where most of \" + \\\n", + " \"humanity have been captured by a race of machines that live \" + \\\n", + " \"off of the humans' body heat and electrochemical energy and \" + \\\n", + " \"who imprison their minds within an artificial reality known as \" + \\\n", + " \"the Matrix. As a rebel against the machines, Neo must return to \" + \\\n", + " \"the Matrix and confront the agents: super-powerful computer \" + \\\n", + " \"programs devoted to snuffing out Neo and the entire human \" + \\\n", + " \"rebellion. \"\n", + "\n", + "print 'Input text:'\n", + "print text" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To summarize this text, we pass the raw string data as input to the function \"summarize\", and it will return a summary.\n", + "\n", + "Note: make sure that the string does not contain any newlines where the line breaks in a sentence. A sentence with a newline in it (i.e. a carriage return, \"\\n\") will be treated as two sentences." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.\n" + ] + } + ], + "source": [ + "print 'Summary:'\n", + "print summarize(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the \"split\" option if you want a list of strings instead of a single string." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.']\n" + ] + } + ], + "source": [ + "print summarize(text, split=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can adjust how much text the summarizer outputs via the \"ratio\" parameter or the \"word_count\" parameter. Using the \"ratio\" parameter, you specify what fraction of sentences in the original text should be returned as output. Below we specify that we want 50% of the original text (the default is 20%)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.\n", + "Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government.\n", + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" + ] + } + ], + "source": [ + "print 'Summary:'\n", + "print summarize(text, ratio=0.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the \"word_count\" parameter, we specify the maximum amount of words we want in the summary. Below we have specified that we want no more than 50 words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.\n" + ] + } + ], + "source": [ + "print 'Summary:'\n", + "print summarize(text, word_count=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As mentioned earlier, this module also supports keyword extraction. Keyword extraction works in the same way as summary generation (i.e. sentence extraction), in that the algorithm tries to find words that are important or seem representative of the entire text. The keywords are not always single words; in the case of multi-word keywords, they are typically all nouns." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Keywords:\n", + "humanity\n", + "human\n", + "neo\n", + "humans body\n", + "super\n", + "hacker\n", + "reality\n" + ] + } + ], + "source": [ + "from gensim.summarization import keywords\n", + "\n", + "print 'Keywords:'\n", + "print keywords(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Larger example

\n", + "\n", + "Let us try an example with a larger piece of text. We will be using a synopsis of the movie \"The Matrix\", which we have taken from [this](http://www.imdb.com/title/tt0133093/synopsis?ref_=ttpl_pl_syn) IMDb page.\n", + "\n", + "In the code below, we read the text file directly from a web-page using \"requests\". Then we produce a summary and some keywords." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "Anderson, a software engineer for a Metacortex, the other life as Neo, a computer hacker \"guilty of virtually every computer crime we have a law for.\" Agent Smith asks him to help them capture Morpheus, a dangerous terrorist, in exchange for amnesty.\n", + "Morpheus explains that he's been searching for Neo his entire life and asks if Neo feels like \"Alice in Wonderland, falling down the rabbit hole.\" He explains to Neo that they exist in the Matrix, a false reality that has been constructed for humans to hide the truth.\n", + "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse (Matt Doran).\n", + "Cypher cuts up a juicy steak and ruminates that he knows the steak is merely the simulation telling his brain that it is delicious and juicy, but after nine years he has discovered that \"ignorance is bliss.\" He strikes a deal for the machines to reinsert his body into a power plant, reinsert him into the Matrix, and he'll help the Agents.\n", + "\n", + "Keywords:\n", + "neo\n", + "morpheus\n", + "trinity\n", + "cypher\n", + "agents\n", + "agent\n", + "smith\n", + "tank\n", + "says\n", + "saying\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "text = 
requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", + "\n", + "print 'Summary:'\n", + "print summarize(text, ratio=0.01)\n", + "\n", + "print '\\nKeywords:'\n", + "print keywords(text, ratio=0.01)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you know this movie, you see that this summary is actually quite good. We also see that some of the most important characters (Neo, Morpheus, Trinity) were extracted as keywords.\n", + "\n", + "

Another example

\n", + "\n", + "Let's try an example similar to the one above. This time, we will use the [IMDb synopsis](http://www.imdb.com/title/tt0118715/synopsis?ref_=tt_stry_pl) of \"The Big Lebowski\".\n", + "\n", + "Again, we download the text and produce a summary and some keywords." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "The answering machine records a woman introducing herself as Maude Lebowski and saying that she is the one who took his rug and has sent a car to pick Dude up at his apartment.\n", + "As he climbs out of bed to make a White Russian, Maude asks about the apartment and Dude explains that Treehorn's thugs most likely vandalized it looking for Lebowski's money.\n", + "\n", + "Keywords:\n", + "dude\n", + "dudes\n", + "walter\n", + "lebowski\n", + "brandt\n", + "maude\n", + "donny\n", + "bunny\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\n", + "\n", + "print 'Summary:'\n", + "print summarize(text, ratio=0.01)\n", + "\n", + "print '\\nKeywords:'\n", + "print keywords(text, ratio=0.01)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time around, the summary is not of high quality, as it does not tell us much about the movie. In a way, this might not be the algorithm's fault; rather, this text simply doesn't contain one or two sentences that capture the essence of the text as in \"The Matrix\" synopsis.\n", + "\n", + "The keywords, however, managed to find some of the main characters.\n", + "\n", + "

Performance

\n", + "\n", + "We will test how the speed of the summarizer scales with the size of the dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 processor. Note that the summarizer does not support multithreading (parallel processing).\n", + "\n", + "The tests were run on the book \"Honest Abe\" by Alonzo Rothschild. Download the book in plain-text here. \n", + "\n", + "In the plot below, we see the running times together with the sizes of the datasets. To create datasets of different sizes, we have simply taken prefixes of text; in other words we take the first n characters of the book. The algorithm seems to be quadratic in time, so one needs to be careful before plugging a large dataset into the summarizer.\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "

Text-content dependent running times

\n", + "\n", + "The running time is not only dependent on the size of the dataset. For example, summarizing \"The Matrix\" synopsis (about 36,000 characters) takes about 3.1 seconds, while summarizing 35,000 characters of this book takes about 8.5 seconds. So the former is more than twice as fast. \n", + "\n", + "One reason for this difference in running times is the data structure that is used. The algorithm represents the data using a graph, where vertices (nodes) are sentences, and then constructs weighted edges between the vertices that represent how the sentences relate to each other. This means that every piece of text will have a different graph, thus making the running times different. The size of this data structure is quadratic in the worst case (the worst case is when each vertex has an edge to every other vertex).\n", + "\n", + "Another possible reason for the difference in running times is that the problems converge at different rates, meaning that the error drops slower for some datasets than for others.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Montemurro and Zanette's Entropy-based keyword algorithm\n", + "[This algorithm](https://arxiv.org/abs/0907.1558) finds keywords based on their contribution to the structure of the document on large scales. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/notebooks/summarization_tutorial.ipynb b/docs/notebooks/summarization_tutorial.ipynb index 6ef81218bf..4cd3f47017 100644 --- a/docs/notebooks/summarization_tutorial.ipynb +++ b/docs/notebooks/summarization_tutorial.ipynb @@ -1,406 +1,2057 @@ { - "metadata": { - "name": "", - "signature": "sha256:6b9b76544213a02f8bf906cdada222aa43d1d502664b11cd363728bc96c21b5f" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ + "cells": [ { - "cells": [ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Tutorial: automatic summarization using Gensim

\n", + "\n", + "This module automatically summarizes the given text, by extracting one or more important sentences from the text. In a similar way, it can also extract keywords. This tutorial will teach you to use this summarization module via some examples. First, we will try a small example, then we will try two larger ones, and then we will review the performance of the summarizer in terms of speed.\n", + "\n", + "This summarizer is based on the \"TextRank\" algorithm, from an [article](http://web.eecs.umich.edu/%7Emihalcea/papers/mihalcea.emnlp04.pdf) by Mihalcea et al. This algorithm was later improved upon by Barrios et al. in another [article](https://raw.githubusercontent.com/summanlp/docs/master/articulo/articulo-en.pdf), by introducing something called a \"BM25 ranking function\". \n", + "\n", + "This tutorial assumes that you are familiar with Python and have [installed Gensim](http://radimrehurek.com/gensim/install.html).\n", + "\n", + "Note: Gensim's summarization only works for English for now, because the text is pre-processed so that stopwords are removed and the words are stemmed, and these processes are language-dependent.\n", + "\n", + "\n", + "

Small example

\n", + "\n", + "First of all, we import the function \"summarize\"." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "

Tutorial: automatic summarization using Gensim

\n", - "\n", - "This module automatically summarizes the given text, by extracting one or more important sentences from the text. In a similar way, it can also extract keywords. This tutorial will teach you to use this summarization module via some examples. First, we will try a small example, then we will try two larger ones, and then we will review the performance of the summarizer in terms of speed.\n", - "\n", - "This summarizer is based on the \"TextRank\" algorithm, from an [article](http://web.eecs.umich.edu/%7Emihalcea/papers/mihalcea.emnlp04.pdf) by Mihalcea et al. This algorithm was later improved upon by Barrios et al. in another [article](https://raw.githubusercontent.com/summanlp/docs/master/articulo/articulo-en.pdf), by introducing something called a \"BM25 ranking function\". \n", - "\n", - "This tutorial assumes that you are familiar with Python and have [installed Gensim](http://radimrehurek.com/gensim/install.html).\n", - "\n", - "Note: Gensim's summarization only works for English for now, because the text is pre-processed so that stopwords are removed and the words are stemmed, and these processes are language-dependent.\n", - "\n", - "\n", - "

Small example

\n", - "\n", - "First of all, we import the function \"summarize\"." + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-27 10:32:21,357 : INFO : 'pattern' package not found; tag filters are not available for English\n" ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import logging\n", - "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", - "\n", - "from gensim.summarization import summarize" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 4 - }, + } + ], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", + "\n", + "from gensim.summarization import summarize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example." + "name": "stdout", + "output_type": "stream", + "text": [ + "Input text:\n", + "Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. 
Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion. \n" ] - }, + } + ], + "source": [ + "text = \"Thomas A. Anderson is a man living two lives. By day he is an \" + \\\n", + " \"average computer programmer and by night a hacker known as \" + \\\n", + " \"Neo. Neo has always questioned his reality, but the truth is \" + \\\n", + " \"far beyond his imagination. Neo finds himself targeted by the \" + \\\n", + " \"police when he is contacted by Morpheus, a legendary computer \" + \\\n", + " \"hacker branded a terrorist by the government. Morpheus awakens \" + \\\n", + " \"Neo to the real world, a ravaged wasteland where most of \" + \\\n", + " \"humanity have been captured by a race of machines that live \" + \\\n", + " \"off of the humans' body heat and electrochemical energy and \" + \\\n", + " \"who imprison their minds within an artificial reality known as \" + \\\n", + " \"the Matrix. As a rebel against the machines, Neo must return to \" + \\\n", + " \"the Matrix and confront the agents: super-powerful computer \" + \\\n", + " \"programs devoted to snuffing out Neo and the entire human \" + \\\n", + " \"rebellion. \"\n", + "\n", + "print 'Input text:'\n", + "print text" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To summarize this text, we pass the raw string data as input to the function \"summarize\", and it will return a summary.\n", + "\n", + "Note: make sure that the string does not contain any newlines where the line breaks in a sentence. A sentence with a newline in it (i.e. 
a carriage return, \"\\n\") will be treated as two sentences." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "code", - "collapsed": false, - "input": [ - "text = \"Thomas A. Anderson is a man living two lives. By day he is an \" + \\\n", - " \"average computer programmer and by night a hacker known as \" + \\\n", - " \"Neo. Neo has always questioned his reality, but the truth is \" + \\\n", - " \"far beyond his imagination. Neo finds himself targeted by the \" + \\\n", - " \"police when he is contacted by Morpheus, a legendary computer \" + \\\n", - " \"hacker branded a terrorist by the government. Morpheus awakens \" + \\\n", - " \"Neo to the real world, a ravaged wasteland where most of \" + \\\n", - " \"humanity have been captured by a race of machines that live \" + \\\n", - " \"off of the humans' body heat and electrochemical energy and \" + \\\n", - " \"who imprison their minds within an artificial reality known as \" + \\\n", - " \"the Matrix. As a rebel against the machines, Neo must return to \" + \\\n", - " \"the Matrix and confront the agents: super-powerful computer \" + \\\n", - " \"programs devoted to snuffing out Neo and the entire human \" + \\\n", - " \"rebellion. \"\n", - "\n", - "print 'Input text:'\n", - "print text" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Input text:\n", - "Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. 
Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion. \n" - ] - } - ], - "prompt_number": 5 + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-27 10:32:21,382 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-27 10:32:21,382 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-27 10:32:21,383 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-11-27 10:32:21,384 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To summarize this text, we pass the raw string data as input to the function \"summarize\", and it will return a summary.\n", - "\n", - "Note: make sure that the string does not contain any newlines where the line breaks in a sentence. A sentence with a newline in it (i.e. a carriage return, \"\\n\") will be treated as two sentences." 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" ] - }, + } + ], + "source": [ + "print 'Summary:'\n", + "print summarize(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the \"split\" option if you want a list of strings instead of a single string." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "code", - "collapsed": false, - "input": [ - "print 'Summary:'\n", - "print summarize(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Summary:\n", - "By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.\n" - ] - } - ], - "prompt_number": 6 + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-27 10:32:21,401 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-27 10:32:21,401 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-27 10:32:21,402 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-11-27 10:32:21,403 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the \"split\" option if you want a list of strings instead of a single string." 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "[\"Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\"]\n" ] - }, + } + ], + "source": [ + "print summarize(text, split=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can adjust how much text the summarizer outputs via the \"ratio\" parameter or the \"word_count\" parameter. Using the \"ratio\" parameter, you specify what fraction of sentences in the original text should be returned as output. Below we specify that we want 50% of the original text (the default is 20%)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "code", - "collapsed": false, - "input": [ - "print summarize(text, split=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "['By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.']\n" - ] - } - ], - "prompt_number": 6 + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-27 10:32:21,413 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-27 10:32:21,413 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-27 10:32:21,414 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) 
from 6 documents (total 68 corpus positions)\n", + "2017-11-27 10:32:21,415 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can adjust how much text the summarizer outputs via the \"ratio\" parameter or the \"word_count\" parameter. Using the \"ratio\" parameter, you specify what fraction of sentences in the original text should be returned as output. Below we specify that we want 50% of the original text (the default is 20%)." + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.\n", + "Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government.\n", + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" ] - }, + } + ], + "source": [ + "print 'Summary:'\n", + "print summarize(text, ratio=0.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the \"word_count\" parameter, we specify the maximum amount of words we want in the summary. Below we have specified that we want no more than 50 words." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "code", - "collapsed": false, - "input": [ - "print 'Summary:'\n", - "print summarize(text, ratio=0.5)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Summary:\n", - "By day he is an average computer programmer and by night a hacker known as Neo. 
Neo has always questioned his reality, but the truth is far beyond his imagination.\n", - "Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government.\n", - "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" - ] - } - ], - "prompt_number": 8 + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-27 10:32:21,422 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-27 10:32:21,423 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-27 10:32:21,424 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-11-27 10:32:21,424 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the \"word_count\" parameter, we specify the maximum amount of words we want in the summary. Below we have specified that we want no more than 50 words." + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" ] - }, + } + ], + "source": [ + "print 'Summary:'\n", + "print summarize(text, word_count=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As mentioned earlier, this module also supports keyword extraction. Keyword extraction works in the same way as summary generation (i.e. 
sentence extraction), in that the algorithm tries to find words that are important or seem representative of the entire text. The keywords are not always single words; in the case of multi-word keywords, they are typically all nouns." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "code", - "collapsed": false, - "input": [ - "print 'Summary:'\n", - "print summarize(text, word_count=50)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Summary:\n", - "By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.\n" - ] - } - ], - "prompt_number": 9 - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Keywords:\n", + "humanity\n", + "human\n", + "neo\n", + "humans body\n", + "super\n", + "hacker\n", + "reality\n" + ] + } + ], + "source": [ + "from gensim.summarization import keywords\n", + "\n", + "print 'Keywords:'\n", + "print keywords(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Larger example

\n", + "\n", + "Let us try an example with a larger piece of text. We will be using a synopsis of the movie \"The Matrix\", which we have taken from [this](http://www.imdb.com/title/tt0133093/synopsis?ref_=ttpl_pl_syn) IMDb page.\n", + "\n", + "In the code below, we read the text file directly from a web-page using \"requests\". Then we produce a summary and some keywords." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned earlier, this module also supports keyword extraction. Keyword extraction works in the same way as summary generation (i.e. sentence extraction), in that the algorithm tries to find words that are important or seem representative of the entire text. They keywords are not always single words; in the case of multi-word keywords, they are typically all nouns." + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-27 10:32:21,950 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-27 10:32:21,955 : INFO : built Dictionary(1093 unique tokens: [u'code', u'squiddi', u'relai', u'dinosaur', u'electron']...) 
from 416 documents (total 2985 corpus positions)\n" ] }, { - "cell_type": "code", - "collapsed": false, - "input": [ - "from gensim.summarization import keywords\n", + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "Anderson, a software engineer for a Metacortex, the other life as Neo, a computer hacker \"guilty of virtually every computer crime we have a law for.\" Agent Smith asks him to help them capture Morpheus, a dangerous terrorist, in exchange for amnesty.\n", + "Morpheus explains that he's been searching for Neo his entire life and asks if Neo feels like \"Alice in Wonderland, falling down the rabbit hole.\" He explains to Neo that they exist in the Matrix, a false reality that has been constructed for humans to hide the truth.\n", + "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse (Matt Doran).\n", + "Cypher cuts up a juicy steak and ruminates that he knows the steak is merely the simulation telling his brain that it is delicious and juicy, but after nine years he has discovered that \"ignorance is bliss.\" He strikes a deal for the machines to reinsert his body into a power plant, reinsert him into the Matrix, and he'll help the Agents.\n", "\n", - "print 'Keywords:'\n", - "print keywords(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Keywords:\n", - "humanity\n", - "human\n", - "neo\n", - "humans body\n", - "super\n", - "hacker\n", - "reality\n" - ] - } - ], - "prompt_number": 10 - }, + "Keywords:\n", + "neo\n", + "morpheus\n", + "trinity\n", + "cypher\n", + "agents\n", + "agent\n", + "smith\n", + "tank\n", + "says\n", + "saying\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "text = 
requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", + "\n", + "print 'Summary:'\n", + "print summarize(text, ratio=0.01)\n", + "\n", + "print '\\nKeywords:'\n", + "print keywords(text, ratio=0.01)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you know this movie, you see that this summary is actually quite good. We also see that some of the most important characters (Neo, Morpheus, Trinity) were extracted as keywords.\n", + "\n", + "

Another example

\n", + "\n", + "Let's try an example similar to the one above. This time, we will use the [IMDb synopsis](http://www.imdb.com/title/tt0118715/synopsis?ref_=tt_stry_pl) of \"The Big Lebowski\".\n", + "\n", + "Again, we download the text and produce a summary and some keywords." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "

Larger example

\n", - "\n", - "Let us try an example with a larger piece of text. We will be using a synopsis of the movie \"The Matrix\", which we have taken from [this](http://www.imdb.com/title/tt0133093/synopsis?ref_=ttpl_pl_syn) IMDb page.\n", - "\n", - "In the code below, we read the text file directly from a web-page using \"requests\". Then we produce a summary and some keywords." + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-27 10:32:24,360 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-27 10:32:24,365 : INFO : built Dictionary(1054 unique tokens: [u'fawn', u'windi', u'concept', u'doctor', u'gant']...) from 227 documents (total 2434 corpus positions)\n" ] }, { - "cell_type": "code", - "collapsed": false, - "input": [ - "import requests\n", + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "The answering machine records a woman introducing herself as Maude Lebowski and saying that she is the one who took his rug and has sent a car to pick Dude up at his apartment.\n", + "As he climbs out of bed to make a White Russian, Maude asks about the apartment and Dude explains that Treehorn's thugs most likely vandalized it looking for Lebowski's money.\n", "\n", - "text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", - "\n", - "print 'Summary:'\n", - "print summarize(text, ratio=0.01)\n", - "\n", - "print '\\nKeywords:'\n", - "print keywords(text, ratio=0.01)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Summary:\n", - "Anderson, a software engineer for a Metacortex, the other life as Neo, a computer hacker \"guilty of virtually every computer crime we have a law for.\" Agent Smith asks him to help them capture Morpheus, a dangerous terrorist, in exchange for amnesty.\n", - "Morpheus explains that he's been searching for Neo his entire life and asks if Neo feels like \"Alice in 
Wonderland, falling down the rabbit hole.\" He explains to Neo that they exist in the Matrix, a false reality that has been constructed for humans to hide the truth.\n", - "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse (Matt Doran).\n", - "Cypher cuts up a juicy steak and ruminates that he knows the steak is merely the simulation telling his brain that it is delicious and juicy, but after nine years he has discovered that \"ignorance is bliss.\" He strikes a deal for the machines to reinsert his body into a power plant, reinsert him into the Matrix, and he'll help the Agents." - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Keywords:\n", - "neo\n", - "morpheus\n", - "trinity\n", - "cypher\n", - "agents\n", - "agent\n", - "smith\n", - "tank\n", - "says\n", - "saying" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n" - ] - } - ], - "prompt_number": 12 - }, + "Keywords:\n", + "dude\n", + "dudes\n", + "walter\n", + "lebowski\n", + "brandt\n", + "maude\n", + "donny\n", + "bunny\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\n", + "\n", + "print 'Summary:'\n", + "print summarize(text, ratio=0.01)\n", + "\n", + "print '\\nKeywords:'\n", + "print keywords(text, ratio=0.01)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time around, the summary is not of high quality, as it does not tell us much about the movie. 
In a way, this might not be the algorithm's fault; rather, this text simply doesn't contain one or two sentences that capture the essence of the text as in \"The Matrix\" synopsis.\n", + "\n", + "The keywords, however, managed to find some of the main characters.\n", + "\n", + "

Performance

\n", + "\n", + "We will test how the speed of the summarizer scales with the size of the dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 processor. Note that the summarizer does not support multithreading (parallel processing).\n", + "\n", + "The tests were run on the book \"Honest Abe\" by Alonzo Rothschild. Download the book in plain-text here. \n", + "\n", + "In the plot below, we see the running times together with the sizes of the datasets. To create datasets of different sizes, we have simply taken prefixes of text; in other words we take the first n characters of the book. The algorithm seems to be quadratic in time, so one needs to be careful before plugging a large dataset into the summarizer.\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "

Text-content dependent running times

\n", + "\n", + "The running time is not only dependent on the size of the dataset. For example, summarizing \"The Matrix\" synopsis (about 36,000 characters) takes about 3.1 seconds, while summarizing 35,000 characters of this book takes about 8.5 seconds. So the former is more than twice as fast. \n", + "\n", + "One reason for this difference in running times is the data structure that is used. The algorithm represents the data using a graph, where vertices (nodes) are sentences, and then constructs weighted edges between the vertices that represent how the sentences relate to each other. This means that every piece of text will have a different graph, thus making the running times different. The size of this data structure is quadratic in the worst case (the worst case is when each vertex has an edge to every other vertex).\n", + "\n", + "Another possible reason for the difference in running times is that the problems converge at different rates, meaning that the error drops slower for some datasets than for others.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Montemurro and Zanette's Entropy-based keyword algorithm\n", + "[This algorithm](https://arxiv.org/abs/0907.1558) finds keywords based on their contribution to the structure of the document on large scales. It does so by dividing the document into blocks of around 1000 words, calculating the entropy of a word's distribution over the blocks, and comparing this to the expected entropy if the word were randomly distributed. To illustrate the algorithm, we will download \"Honest Abe\", and extract some keywords." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import urllib2\n", + "from gensim.summarization import mz_keywords" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you know this movie, you see that this summary is actually quite good. We also see that some of the most important characters (Neo, Morpheus, Trinity) were extracted as keywords.\n", - "\n", - "

Another example

\n", - "\n", - "Let's try an example similar to the one above. This time, we will use the [IMDb synopsis](http://www.imdb.com/title/tt0118715/synopsis?ref_=tt_stry_pl) of \"The Big Lebowski\".\n", - "\n", - "Again, we download the text and produce a summary and some keywords." + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/pete/gensim/gensim/summarization/mz_entropy.py:78: RuntimeWarning: divide by zero encountered in log2\n", + " logp = numpy.log2(p)\n", + "/home/pete/gensim/gensim/summarization/mz_entropy.py:79: RuntimeWarning: invalid value encountered in multiply\n", + " H = numpy.nan_to_num((p * logp), 0.0).sum(axis=0)\n" ] }, { - "cell_type": "code", - "collapsed": false, - "input": [ - "import requests\n", - "\n", - "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\n", - "\n", - "print 'Summary:'\n", - "print summarize(text, ratio=0.01)\n", - "\n", - "print '\\nKeywords:'\n", - "print keywords(text, ratio=0.01)" - ], - "language": "python", + "data": { + "text/plain": [ + "[(u'lincoln', 0.0056009079527401728),\n", + " (u'i', 0.0048480807199453163),\n", + " (u'gutenberg', 0.0033118705607652456),\n", + " (u'you', 0.0033044241876850882),\n", + " (u'the', 0.003184223100952537),\n", + " (u'project', 0.0030400432599562814),\n", + " (u'v', 0.0029892072316233462),\n", + " (u's', 0.0027479946846166391),\n", + " (u'he', 0.0026405628272363011),\n", + " (u'iv', 0.0025895621076850355),\n", + " (u'ii', 0.0025019507619403148),\n", + " (u'by', 0.0022277723676676691),\n", + " (u'abraham', 0.0021168707666022494),\n", + " (u'or', 0.0020858843371172162),\n", + " (u'iii', 0.002071167621155823),\n", + " (u'tm', 0.0019565820396828327),\n", + " (u'was', 0.0018954215033062955),\n", + " (u'his', 0.0018126024538229718),\n", + " (u'work', 0.0017646814365061972),\n", + " (u'co', 0.0017416964820475558),\n", + " (u'case', 0.001661734006946057),\n", + " (u'new', 0.0016558607106467698),\n", + " (u'york', 0.0015861543846297651),\n", 
+ " (u'court', 0.0014488333654852606),\n", + " (u'a', 0.0013369063978456374),\n", + " (u'it', 0.0013221654971075282),\n", + " (u'had', 0.0012652752682645698),\n", + " (u'on', 0.0012621040038518136),\n", + " (u'their', 0.0012449891448184512),\n", + " (u'herndon', 0.0012402952190743249),\n", + " (u'life', 0.00123104152062403),\n", + " (u'my', 0.0011741303053317792),\n", + " (u'_works_', 0.0010832651550141503),\n", + " (u'we', 0.0010768294653523067),\n", + " (u'money', 0.0010191083741917691),\n", + " (u'father', 0.0010168268194887184)]" + ] + }, + "execution_count": 11, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "Summary:\n", - "The answering machine records a woman introducing herself as Maude Lebowski and saying that she is the one who took his rug and has sent a car to pick Dude up at his apartment.\n", - "As he climbs out of bed to make a White Russian, Maude asks about the apartment and Dude explains that Treehorn's thugs most likely vandalized it looking for Lebowski's money." - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n", - "\n", - "Keywords:\n", - "dude\n", - "dudes\n", - "walter\n", - "lebowski\n", - "brandt\n", - "maude\n", - "donny\n", - "bunny" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n" - ] - } - ], - "prompt_number": 13 - }, + "output_type": "execute_result" + } + ], + "source": [ + "book = urllib2.urlopen(\"http://www.gutenberg.org/files/49679/49679-0.txt\")\n", + "text = book.read()\n", + "mz_keywords(text,scores=True,threshold=0.001)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, this algorithm weights keywords by their overall frequency of occurrence in the text. 
It's possible to turn the weighting off, as below" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "cell_type": "markdown", + "data": { + "text/plain": [ + "[(u'gutenberg', 3.7766363961259684),\n", + " (u'tm', 3.6403066998316511),\n", + " (u'project', 3.5428530523255342),\n", + " (u'co', 3.2983688146004528),\n", + " (u'donations', 2.8613536046553563),\n", + " (u'electronic', 2.8210861922674084),\n", + " (u'access', 2.7810662866642568),\n", + " (u'refund', 2.7810662866642568),\n", + " (u'foundation', 2.7234464816769872),\n", + " (u'foxboro', 2.5477601487545121),\n", + " (u'gloves', 2.5281337853661761),\n", + " (u'e', 2.4036269322210768),\n", + " (u'york', 2.3692008259770594),\n", + " (u'edited', 2.361641829495754),\n", + " (u'_works_', 2.3445174072327686),\n", + " (u'works', 2.3426500474551113),\n", + " (u'dogskin', 2.3425994588269479),\n", + " (u'ragsdale', 2.2931552327841351),\n", + " (u'replacement', 2.2931552327841351),\n", + " (u'trunks', 2.2931552327841351),\n", + " (u'iv', 2.2510299269025058),\n", + " (u'iii', 2.2186807817292546),\n", + " (u'v', 2.2168420707754368),\n", + " (u'brokaw', 2.1699176369612583),\n", + " (u'coon', 2.1699176369612583),\n", + " (u'bonds', 2.1343080503770544),\n", + " (u'license', 2.1009287665795293),\n", + " (u'ii', 2.0892470886183649),\n", + " (u'agreement', 2.0779209847210556),\n", + " (u'almanac', 2.0060727272918055),\n", + " (u'_weekly_', 1.9794475925140163),\n", + " (u'bounded', 1.9794475925140163),\n", + " (u'format', 1.9794475925140163),\n", + " (u'millions', 1.9794475925140163),\n", + " (u'oxen', 1.9794475925140163),\n", + " (u'specie', 1.9794475925140163),\n", + " (u'archive', 1.9682995275030786),\n", + " (u'barrett', 1.9422319940872796),\n", + " (u'reminiscences', 1.9330537427622287),\n", + " (u'ebooks', 1.8984698469769548),\n", + " (u'forquer', 1.8843080503770544),\n", + " (u'parker', 1.8843080503770544),\n", + " (u'pglaf', 1.8843080503770544),\n", + 
" (u'ebook', 1.8838775575675983),\n", + " (u'trademark', 1.8838775575675983),\n", + " (u'paragraph', 1.8301079379685583),\n", + " (u'hardin', 1.7669683658081703),\n", + " (u'work', 1.7328354724344326),\n", + " (u'rothschild', 1.7275730939964973),\n", + " (u'org', 1.7211393195188851),\n", + " (u'attitude', 1.716230650790012),\n", + " (u'london', 1.6791112857988695),\n", + " (u'boston', 1.6754810009833907),\n", + " (u'xvi', 1.66018729770736),\n", + " (u'news', 1.6601872977073597),\n", + " (u'biographical', 1.6294643147000225),\n", + " (u'green', 1.6254512602292723),\n", + " (u'delegates', 1.6127555612626692),\n", + " (u'medium', 1.6127555612626692),\n", + " (u'scripps', 1.6127555612626692),\n", + " (u'volunteers', 1.6127555612626692),\n", + " (u'lamon', 1.6001560607245646),\n", + " (u'tarbell', 1.5897346234235084),\n", + " (u'volumes', 1.5819481863246514),\n", + " (u'bank', 1.5744728128489647),\n", + " (u'copyright', 1.5731550611734115),\n", + " (u'_via_', 1.5722781569106761),\n", + " (u'admissibility', 1.5722781569106761),\n", + " (u'advertisers', 1.5722781569106761),\n", + " (u'applicable', 1.5722781569106761),\n", + " (u'attire', 1.5722781569106761),\n", + " (u'bags', 1.5722781569106761),\n", + " (u'berries', 1.5722781569106761),\n", + " (u'breeches', 1.5722781569106761),\n", + " (u'cline', 1.5722781569106761),\n", + " (u'continuance', 1.5722781569106761),\n", + " (u'currents', 1.5722781569106761),\n", + " (u'daguerreotype', 1.5722781569106761),\n", + " (u'disclaimer', 1.5722781569106761),\n", + " (u'email', 1.5722781569106761),\n", + " (u'enrolled', 1.5722781569106761),\n", + " (u'fool', 1.5722781569106761),\n", + " (u'guineas', 1.5722781569106761),\n", + " (u'hatchet', 1.5722781569106761),\n", + " (u'instruct', 1.5722781569106761),\n", + " (u'liability', 1.5722781569106761),\n", + " (u'lonny', 1.5722781569106761),\n", + " (u'paullin', 1.5722781569106761),\n", + " (u'performing', 1.5722781569106761),\n", + " (u'plow', 1.5722781569106761),\n", + " (u'polite', 
1.5722781569106761),\n", + " (u'puffs', 1.5722781569106761),\n", + " (u'rulings', 1.5722781569106761),\n", + " (u'scammon', 1.5722781569106761),\n", + " (u'tilda', 1.5722781569106761),\n", + " (u'wake', 1.5722781569106761),\n", + " (u'warranties', 1.5722781569106761),\n", + " (u'america', 1.5712271378967728),\n", + " (u'clair', 1.5712271378967728),\n", + " (u'displaying', 1.5712271378967728),\n", + " (u'forgery', 1.5712271378967728),\n", + " (u'holder', 1.5712271378967728),\n", + " (u'posted', 1.5712271378967728),\n", + " (u'sketches', 1.5712271378967728),\n", + " (u'snow', 1.5712271378967728),\n", + " (u'wore', 1.5712271378967728),\n", + " (u'http', 1.5645865830262038),\n", + " (u'journalism', 1.5399471126066209),\n", + " (u'copy', 1.5258495075146912),\n", + " (u'_early', 1.5202411939312348),\n", + " (u'armstrong', 1.5106440743450187),\n", + " (u'railroad', 1.4938165623572677),\n", + " (u'ross', 1.489097832809857),\n", + " (u'pair', 1.4791112857988695),\n", + " (u'banks', 1.4791112857988693),\n", + " (u'irelan', 1.4791112857988693),\n", + " (u'scott', 1.4791112857988693),\n", + " (u'browne', 1.4764336408243595),\n", + " (u'abraham', 1.4577679329151634),\n", + " (u'publication', 1.4490612388306794),\n", + " (u'provide', 1.4490612388306792),\n", + " (u'chiniquy', 1.4275140308616106),\n", + " (u'literary', 1.4150354420715021),\n", + " (u'rr', 1.4070491486733681),\n", + " (u'axe', 1.3967912341407889),\n", + " (u'fence', 1.3967912341407889),\n", + " (u'genuine', 1.3967912341407889),\n", + " (u'life_', 1.3941370904272503),\n", + " (u'she', 1.3923582867044937),\n", + " (u'copper', 1.3828069220574104),\n", + " (u'distributing', 1.3828069220574104),\n", + " (u'saddle', 1.3828069220574104),\n", + " (u'sons', 1.3828069220574104),\n", + " (u'_life_', 1.373910241709706),\n", + " (u'calhoun', 1.373910241709706),\n", + " (u'mother', 1.3728688332198922),\n", + " (u'college', 1.3697302821858961),\n", + " (u'nicolay', 1.3633245760231363),\n", + " (u'whitney', 
1.3627575629840512),\n", + " (u'philadelphia', 1.3540886863558637),\n", + " (u'sarah', 1.3540886863558634),\n", + " (u'vi', 1.3540886863558634),\n", + " (u'harrison', 1.3476159735283106),\n", + " (u'terms', 1.3426509824683515),\n", + " (u'herndon', 1.3421892681433798),\n", + " (u'improvement', 1.329344333012155),\n", + " (u'buckskin', 1.3222046383294666),\n", + " (u'sham', 1.3222046383294666),\n", + " (u'fee', 1.3158554460066139),\n", + " (u'generosity', 1.3144503596878891),\n", + " (u'moore', 1.3144503596878887),\n", + " (u'copies', 1.3127747798184011),\n", + " (u'p', 1.309088202039181),\n", + " (u'compliance', 1.2961309813666892),\n", + " (u'constable', 1.2961309813666892),\n", + " (u'currency', 1.2961309813666892),\n", + " (u'distribution', 1.2961309813666892),\n", + " (u'harvey', 1.2961309813666892),\n", + " (u'individual', 1.2961309813666892),\n", + " (u'revolutionary', 1.2961309813666892),\n", + " (u'brooks', 1.286562189794501),\n", + " (u'chicago', 1.2700186510810929),\n", + " (u'weems', 1.2659709073661847),\n", + " (u'february', 1.2574199029295277),\n", + " (u'information', 1.2487001310514776),\n", + " (u'bridge', 1.2326416539256813),\n", + " (u'resolution', 1.2268390166084573),\n", + " (u'stoddard', 1.2268390166084573),\n", + " (u'father', 1.2254034208363418),\n", + " (u'cartwright', 1.2157428532629155),\n", + " (u'houghton', 1.2157428532629155),\n", + " (u'publishing', 1.2157428532629155),\n", + " (u'describes', 1.2157428532629153),\n", + " (u'j', 1.2115310804189017),\n", + " (u'_stories_', 1.2049337080807629),\n", + " (u'september', 1.2030636155192291),\n", + " (u'boys', 1.1974364414369618),\n", + " (u'defendants', 1.1955861748361873),\n", + " (u'per', 1.1955861748361873),\n", + " (u'permission', 1.1955861748361873),\n", + " (u'uncle', 1.1955861748361873),\n", + " (u'thomas', 1.1924565577943991),\n", + " (u'trade', 1.1918333507609624),\n", + " (u'f', 1.1915163381561049),\n", + " (u'store', 1.189052998865439),\n", + " (u'notes', 1.1850922942502753),\n", + 
" (u'baker', 1.1828856976412236),\n", + " (u'baddeley', 1.1681694680548835),\n", + " (u'cogdal', 1.1681694680548835),\n", + " (u'copying', 1.1681694680548835),\n", + " (u'crafton', 1.1681694680548835),\n", + " (u'defect', 1.1681694680548835),\n", + " (u'donate', 1.1681694680548835),\n", + " (u'easier', 1.1681694680548835),\n", + " (u'editions', 1.1681694680548835),\n", + " (u'hawley', 1.1681694680548835),\n", + " (u'hitchcock', 1.1681694680548835),\n", + " (u'jake', 1.1681694680548835),\n", + " (u'jewelry', 1.1681694680548835),\n", + " (u'jurors', 1.1681694680548835),\n", + " (u'lightning', 1.1681694680548835),\n", + " (u'machine', 1.1681694680548835),\n", + " (u'paragraphs', 1.1681694680548835),\n", + " (u'pg', 1.1681694680548835),\n", + " (u'pork', 1.1681694680548835),\n", + " (u'retains', 1.1681694680548835),\n", + " (u'rod', 1.1681694680548835),\n", + " (u'securities', 1.1681694680548835),\n", + " (u'status', 1.1681694680548835),\n", + " (u'trousers', 1.1681694680548835),\n", + " (u'unpublished', 1.1681694680548835),\n", + " (u'berry', 1.1644932670010606),\n", + " (u'pp', 1.1608077284905565),\n", + " (u'hanks', 1.1587285139891437),\n", + " (u'mcclure', 1.1537352404836496),\n", + " (u'her', 1.1531891574151381),\n", + " (u'hamlin', 1.1529222466025137),\n", + " (u'speeches', 1.1437050469373577),\n", + " (u'kentucky', 1.1401563236722736),\n", + " (u'johnston', 1.1368073989967304),\n", + " (u'offutt', 1.1345503657246403),\n", + " (u'dress', 1.1343080503770544),\n", + " (u'german', 1.1343080503770544),\n", + " (u'matheney', 1.1343080503770544),\n", + " (u'company', 1.1298148326748745),\n", + " (u'g', 1.128517881924167),\n", + " (u'votes', 1.1187730676938106),\n", + " (u'nine', 1.113374076177045),\n", + " (u'charles', 1.1065580194728426),\n", + " (u'note', 1.0974655406391749),\n", + " (u'deed', 1.0970926363431248),\n", + " (u'east', 1.0970926363431248),\n", + " (u'spurious', 1.0970926363431248),\n", + " (u'atkinson', 1.0970926363431244),\n", + " (u'comply', 
1.0970926363431244),\n", + " (u'jewelers', 1.0970926363431244),\n", + " (u'leland', 1.0970926363431244),\n", + " (u'priest', 1.0970926363431244),\n", + " (u'soldier', 1.0970926363431244),\n", + " (u'd', 1.0936709970367389),\n", + " (u'tax', 1.0890978328098568),\n", + " (u'colonel', 1.0886122317272675),\n", + " (u'pitcher', 1.0886122317272675),\n", + " (u'spink', 1.0886122317272675),\n", + " (u'charter', 1.0886122317272673),\n", + " (u'clock', 1.0886122317272673),\n", + " (u'distribute', 1.0886122317272673),\n", + " (u'fisher', 1.0886122317272673),\n", + " (u'convention', 1.0842245322470756),\n", + " (u'plaintiff', 1.0813648643938589),\n", + " (u'island', 1.0791112857988696),\n", + " (u'voyage', 1.0772490318253176),\n", + " (u'you', 1.0716742799027257),\n", + " (u'road', 1.0587290524017576),\n", + " (u'holland', 1.05373524048365),\n", + " (u'trailor', 1.0479900750043671),\n", + " (u'limited', 1.0447190713617185),\n", + " (u'domain', 1.0399471126066209),\n", + " (u'grandfather', 1.0399471126066209),\n", + " (u'voted', 1.0399471126066209),\n", + " (u'agree', 1.0367857078081339),\n", + " (u'including', 1.0367857078081339),\n", + " (u'life', 1.0279778291629844),\n", + " (u'witness', 1.0249646422762066),\n", + " (u'james', 1.0153080476245506),\n", + " (u'stuart', 1.0149104889383316),\n", + " (u'dungee', 1.0102738780733427),\n", + " (u'john', 1.0074378828094916),\n", + " (u'surveyor', 1.0071083505332288),\n", + " (u'cross', 1.0008479040802145),\n", + " (u'dollars', 1.0002448365299736),\n", + " (u'president', 0.99828026284480487),\n", + " (u'_amount_', 0.99450922395310026),\n", + " (u'_black', 0.99450922395310026),\n", + " (u'_commercial', 0.99450922395310026),\n", + " (u'_magazine', 0.99450922395310026),\n", + " (u'_nicolay', 0.99450922395310026),\n", + " (u'_north', 0.99450922395310026),\n", + " (u'_sun_', 0.99450922395310026),\n", + " (u'accompanies', 0.99450922395310026),\n", + " (u'accordance', 0.99450922395310026),\n", + " (u'adjourning', 0.99450922395310026),\n", + 
" (u'advertiser', 0.99450922395310026),\n", + " (u'advertiser_', 0.99450922395310026),\n", + " (u'agnosticism', 0.99450922395310026),\n", + " (u'almanacs', 0.99450922395310026),\n", + " (u'animals', 0.99450922395310026),\n", + " (u'apparel', 0.99450922395310026),\n", + " (u'appoints', 0.99450922395310026),\n", + " (u'arbitrations', 0.99450922395310026),\n", + " (u'ascii', 0.99450922395310026),\n", + " (u'asks', 0.99450922395310026),\n", + " (u'aspirants', 0.99450922395310026),\n", + " (u'atrocious', 0.99450922395310026),\n", + " (u'attachment', 0.99450922395310026),\n", + " (u'authors', 0.99450922395310026),\n", + " (u'band', 0.99450922395310026),\n", + " (u'bargained', 0.99450922395310026),\n", + " (u'bets', 0.99450922395310026),\n", + " (u'bleeding', 0.99450922395310026),\n", + " (u'boats', 0.99450922395310026),\n", + " (u'book_', 0.99450922395310026),\n", + " (u'boss', 0.99450922395310026),\n", + " (u'bourgeois', 0.99450922395310026),\n", + " (u'bull', 0.99450922395310026),\n", + " (u'calf', 0.99450922395310026),\n", + " (u'chase', 0.99450922395310026),\n", + " (u'chicanery', 0.99450922395310026),\n", + " (u'coach', 0.99450922395310026),\n", + " (u'coins', 0.99450922395310026),\n", + " (u'comet', 0.99450922395310026),\n", + " (u'computer', 0.99450922395310026),\n", + " (u'computers', 0.99450922395310026),\n", + " (u'concentration', 0.99450922395310026),\n", + " (u'conquering', 0.99450922395310026),\n", + " (u'conservator', 0.99450922395310026),\n", + " (u'contentedly', 0.99450922395310026),\n", + " (u'copied', 0.99450922395310026),\n", + " (u'cord', 0.99450922395310026),\n", + " (u'cornell', 0.99450922395310026),\n", + " (u'countenance', 0.99450922395310026),\n", + " (u'counting', 0.99450922395310026),\n", + " (u'countryman', 0.99450922395310026),\n", + " (u'creeks', 0.99450922395310026),\n", + " (u'davy', 0.99450922395310026),\n", + " (u'deer', 0.99450922395310026),\n", + " (u'def', 0.99450922395310026),\n", + " (u'delegations', 0.99450922395310026),\n", + " 
(u'deliveries', 0.99450922395310026),\n", + " (u'demurrer', 0.99450922395310026),\n", + " (u'desires', 0.99450922395310026),\n", + " (u'detriment', 0.99450922395310026),\n", + " (u'directors', 0.99450922395310026),\n", + " (u'disallows', 0.99450922395310026),\n", + " (u'disgracing', 0.99450922395310026),\n", + " (u'doctoring', 0.99450922395310026),\n", + " (u'effectively', 0.99450922395310026),\n", + " (u'elections', 0.99450922395310026),\n", + " (u'electronically', 0.99450922395310026),\n", + " (u'enrolling', 0.99450922395310026),\n", + " (u'exempt', 0.99450922395310026),\n", + " (u'faded', 0.99450922395310026),\n", + " (u'fares', 0.99450922395310026),\n", + " (u'ff', 0.99450922395310026),\n", + " (u'fights', 0.99450922395310026),\n", + " (u'flatboat', 0.99450922395310026),\n", + " (u'founded', 0.99450922395310026),\n", + " (u'generals', 0.99450922395310026),\n", + " (u'goose', 0.99450922395310026),\n", + " (u'greed', 0.99450922395310026),\n", + " (u'groomsman', 0.99450922395310026),\n", + " (u'hagerty', 0.99450922395310026),\n", + " (u'hans', 0.99450922395310026),\n", + " (u'harvard', 0.99450922395310026),\n", + " (u'haute', 0.99450922395310026),\n", + " (u'heel', 0.99450922395310026),\n", + " (u'history_', 0.99450922395310026),\n", + " (u'homeliest', 0.99450922395310026),\n", + " (u'howard', 0.99450922395310026),\n", + " (u'hut', 0.99450922395310026),\n", + " (u'ice', 0.99450922395310026),\n", + " (u'ida', 0.99450922395310026),\n", + " (u'identical', 0.99450922395310026),\n", + " (u'imperialist', 0.99450922395310026),\n", + " (u'independent', 0.99450922395310026),\n", + " (u'invalid', 0.99450922395310026),\n", + " (u'irons', 0.99450922395310026),\n", + " (u'janet', 0.99450922395310026),\n", + " (u'justification', 0.99450922395310026),\n", + " (u'lamborn', 0.99450922395310026),\n", + " (u'lambs', 0.99450922395310026),\n", + " (u'larceny', 0.99450922395310026),\n", + " (u'latin', 0.99450922395310026),\n", + " (u'linen', 0.99450922395310026),\n", + " (u'locations', 
0.99450922395310026),\n", + " (u'louder', 0.99450922395310026),\n", + " (u'mad', 0.99450922395310026),\n", + " (u'magruder', 0.99450922395310026),\n", + " (u'maid', 0.99450922395310026),\n", + " (u'metaphysical', 0.99450922395310026),\n", + " (u'mit', 0.99450922395310026),\n", + " (u'monthlies', 0.99450922395310026),\n", + " (u'nest', 0.99450922395310026),\n", + " (u'nigger', 0.99450922395310026),\n", + " (u'package', 0.99450922395310026),\n", + " (u'pan', 0.99450922395310026),\n", + " (u'parentage', 0.99450922395310026),\n", + " (u'partial', 0.99450922395310026),\n", + " (u'partly', 0.99450922395310026),\n", + " (u'passengers', 0.99450922395310026),\n", + " (u'pension', 0.99450922395310026),\n", + " (u'pl', 0.99450922395310026),\n", + " (u'playful', 0.99450922395310026),\n", + " (u'population', 0.99450922395310026),\n", + " (u'postponed', 0.99450922395310026),\n", + " (u'postponement', 0.99450922395310026),\n", + " (u'premise', 0.99450922395310026),\n", + " (u'pressure', 0.99450922395310026),\n", + " (u'presumption', 0.99450922395310026),\n", + " (u'preventing', 0.99450922395310026),\n", + " (u'quart', 0.99450922395310026),\n", + " (u'quincy', 0.99450922395310026),\n", + " (u'quorum', 0.99450922395310026),\n", + " (u'redistribution', 0.99450922395310026),\n", + " (u'rejoicing', 0.99450922395310026),\n", + " (u'remit', 0.99450922395310026),\n", + " (u'rifle', 0.99450922395310026),\n", + " (u'romance', 0.99450922395310026),\n", + " (u'rothschild_', 0.99450922395310026),\n", + " (u'row', 0.99450922395310026),\n", + " (u'rubbish', 0.99450922395310026),\n", + " (u'sacrifices', 0.99450922395310026),\n", + " (u'scroll', 0.99450922395310026),\n", + " (u'shade', 0.99450922395310026),\n", + " (u'shed', 0.99450922395310026),\n", + " (u'sigh', 0.99450922395310026),\n", + " (u'silk', 0.99450922395310026),\n", + " (u'sinewy', 0.99450922395310026),\n", + " (u'sock', 0.99450922395310026),\n", + " (u'solicit', 0.99450922395310026),\n", + " (u'solvent', 0.99450922395310026),\n", + 
" (u'sonny', 0.99450922395310026),\n", + " (u'startling', 0.99450922395310026),\n", + " (u'steals', 0.99450922395310026),\n", + " (u'steamer', 0.99450922395310026),\n", + " (u'stevenson', 0.99450922395310026),\n", + " (u'subp\\u0153naed', 0.99450922395310026),\n", + " (u'tanned', 0.99450922395310026),\n", + " (u'tea', 0.99450922395310026),\n", + " (u'terre', 0.99450922395310026),\n", + " (u'theosophy', 0.99450922395310026),\n", + " (u'tight', 0.99450922395310026),\n", + " (u'tis', 0.99450922395310026),\n", + " (u'tour', 0.99450922395310026),\n", + " (u'vanilla', 0.99450922395310026),\n", + " (u'vol', 0.99450922395310026),\n", + " (u'warfare', 0.99450922395310026),\n", + " (u'warranty', 0.99450922395310026),\n", + " (u'wayne', 0.99450922395310026),\n", + " (u'whip', 0.99450922395310026),\n", + " (u'woodcut', 0.99450922395310026),\n", + " (u'wright', 0.99450922395310026),\n", + " (u'new', 0.99212250974463601),\n", + " (u'swett', 0.98946460737046715),\n", + " (u'vote', 0.98946460737046715),\n", + " (u'argument', 0.98558697994489508),\n", + " (u'whig', 0.98356004695062138),\n", + " (u'bros', 0.97944759251401625),\n", + " (u'harper', 0.97944759251401625),\n", + " (u'jeremiah', 0.97944759251401625),\n", + " (u'metzker', 0.97944759251401625),\n", + " (u'clay', 0.96981219799970608),\n", + " (u'_the', 0.96570796937579129),\n", + " (u'h', 0.95765441503007809),\n", + " (u'benjamin', 0.94868978259839132),\n", + " (u'captain', 0.94868978259839132),\n", + " (u'crawford', 0.94558617483618734),\n", + " (u'web', 0.94558617483618734),\n", + " (u'cents', 0.93351879101119639),\n", + " (u'physical', 0.92813378536617597),\n", + " (u'alonzo', 0.92486723054286335),\n", + " (u'april', 0.92486723054286335),\n", + " (u'barrel', 0.92220463832946642),\n", + " (u'butler', 0.92220463832946642),\n", + " (u'cabinet', 0.92220463832946642),\n", + " (u'cincinnati', 0.92220463832946642),\n", + " (u'committees', 0.92220463832946642),\n", + " (u'containing', 0.92220463832946642),\n", + " (u'corner', 
0.92220463832946642),\n", + " (u'daily', 0.92220463832946642),\n", + " (u'defective', 0.92220463832946642),\n", + " (u'greeley', 0.92220463832946642),\n", + " (u'inches', 0.92220463832946642),\n", + " (u'johnson', 0.92220463832946642),\n", + " (u'limitation', 0.92220463832946642),\n", + " (u'site', 0.92220463832946642),\n", + " (u'steamboat', 0.92220463832946642),\n", + " (u'suffice', 0.92220463832946642),\n", + " (u'trips', 0.92220463832946642),\n", + " (u'university', 0.92220463832946642),\n", + " (u'vulnerable', 0.92220463832946642),\n", + " (u'william', 0.92182669138259055),\n", + " (u'buys', 0.91921942236647736),\n", + " (u'mifflin', 0.91921942236647736),\n", + " (u'morris', 0.91921942236647736),\n", + " (u'refuses', 0.91921942236647736),\n", + " (u'sells', 0.91921942236647736),\n", + " (u'tom', 0.91921942236647736),\n", + " (u'widow', 0.91921942236647736),\n", + " (u'sold', 0.91058236515865065),\n", + " (u'gentry', 0.90964424191141902),\n", + " (u'sound', 0.90964424191141902),\n", + " (u'march', 0.90184519263849161),\n", + " (u'anecdotes', 0.89647288909731682),\n", + " (u'trading', 0.89300683234257683),\n", + " (u'lack', 0.87911128579886944),\n", + " (u'orleans', 0.87911128579886944),\n", + " (u'sketch', 0.87911128579886944),\n", + " (u'laws', 0.87819506388264168),\n", + " (u'court', 0.87301265479021239),\n", + " (u'fees', 0.87255211331288773),\n", + " (u'introduction', 0.86973028218589565),\n", + " (u'boy', 0.85605385441424486),\n", + " (u'reports', 0.85227169920605483),\n", + " (u'witnesses', 0.85227169920605483),\n", + " (u'b', 0.8493617351165339),\n", + " (u'abe', 0.84568544478024776),\n", + " (u'magazine_', 0.84481219799970608),\n", + " (u'henry', 0.84251319517689804),\n", + " (u'speed', 0.84212114395694027),\n", + " (u'anecdote', 0.83889967717920211),\n", + " (u'society', 0.83631295591025312),\n", + " (u'judgment', 0.83278478699937608),\n", + " (u'creek', 0.83070878764523437),\n", + " (u'slander', 0.82845569625516102),\n", + " (u'r', 
0.82845569625516058),\n", + " (u'earn', 0.82576785278214482),\n", + " (u'client', 0.82347269328712169),\n", + " (u'case', 0.81966768916575017),\n", + " (u'menard', 0.81137835062883923),\n", + " (u'ohio', 0.81137835062883923),\n", + " (u'w', 0.80428968429945247),\n", + " (u'married', 0.80362253558252261),\n", + " (u'write', 0.80362253558252261),\n", + " (u'study', 0.79632966848691966),\n", + " (u'_', 0.79325323203556808),\n", + " (u'party', 0.79159811445620765),\n", + " (u'contained', 0.78994711260662109),\n", + " (u'handling', 0.78994711260662109),\n", + " (u'judd', 0.78994711260662109),\n", + " (u'obscure', 0.78994711260662109),\n", + " (u'pratt', 0.78994711260662109),\n", + " (u'rolling', 0.78994711260662109),\n", + " (u'van', 0.78994711260662109),\n", + " (u'hay', 0.78877059308059705),\n", + " (u'care', 0.78490422698683959),\n", + " (u'supreme', 0.78462528298567236),\n", + " (u'letter', 0.77973386004105638),\n", + " (u'me', 0.77433431986501056),\n", + " (u'money', 0.771981872795795),\n", + " (u'circular', 0.771227137896773),\n", + " (u'contributions', 0.771227137896773),\n", + " (u'elizabethtown', 0.771227137896773),\n", + " (u'fails', 0.771227137896773),\n", + " (u'knows', 0.771227137896773),\n", + " (u'located', 0.771227137896773),\n", + " (u'pennsylvania', 0.771227137896773),\n", + " (u'pledged', 0.771227137896773),\n", + " (u'providing', 0.771227137896773),\n", + " (u'sentence', 0.771227137896773),\n", + " (u'shirt', 0.771227137896773),\n", + " (u'size', 0.771227137896773),\n", + " (u'sleeve', 0.771227137896773),\n", + " (u'jury', 0.77018773870139601),\n", + " (u'justice', 0.76953789330855393),\n", + " (u'stories', 0.76708395277153052),\n", + " (u'political', 0.76430322684478202),\n", + " (u'mrs', 0.76262458115983867),\n", + " (u'gentlemen', 0.76084736419494403),\n", + " (u'plea', 0.76084736419494403),\n", + " (u'honest', 0.7601545702334116),\n", + " (u'archibald', 0.75527889839393425),\n", + " (u'argued', 0.75527889839393425),\n", + " (u'coin', 
0.75527889839393425),\n", + " (u'creditor', 0.75527889839393425),\n", + " (u'endorses', 0.75527889839393425),\n", + " (u'floyd', 0.75527889839393425),\n", + " (u'fraud', 0.75527889839393425),\n", + " (u'frederick', 0.75527889839393425),\n", + " (u'hale', 0.75527889839393425),\n", + " (u'heels', 0.75527889839393425),\n", + " (u'legally', 0.75527889839393425),\n", + " (u'maker', 0.75527889839393425),\n", + " (u'mchenry', 0.75527889839393425),\n", + " (u'memorial', 0.75527889839393425),\n", + " (u'partisan', 0.75527889839393425),\n", + " (u'personally', 0.75527889839393425),\n", + " (u'race', 0.75527889839393425),\n", + " (u'stevens', 0.75527889839393425),\n", + " (u'submit', 0.75527889839393425),\n", + " (u'admitted', 0.75527889839393403),\n", + " (u'file', 0.75527889839393403),\n", + " (u'hardships', 0.75527889839393403),\n", + " (u'music', 0.75527889839393403),\n", + " (u'refusal', 0.75527889839393403),\n", + " (u'sangamon', 0.75131617260654426),\n", + " (u'january', 0.75014968948543359),\n", + " (u'hill', 0.74969894675964133),\n", + " (u'truth', 0.74926914745020134),\n", + " (u'hon', 0.74391569849198458),\n", + " (u'douglas', 0.74391569849198413),\n", + " (u'earned', 0.74391569849198413),\n", + " (u'guilty', 0.74391569849198413),\n", + " (u'campaign', 0.73772110820519954),\n", + " (u'log', 0.7346246403825254),\n", + " (u'coffin', 0.73455036572464039),\n", + " (u'arnold', 0.72909340011582602),\n", + " (u'expenses', 0.72909340011582602),\n", + " (u'suit', 0.72889466428426264),\n", + " (u'sir', 0.72813378536617623),\n", + " (u'damages', 0.72813378536617579),\n", + " (u'george', 0.72713812318435789),\n", + " (u'owner', 0.72605378988157332),\n", + " (u'poverty', 0.71981219799970608),\n", + " (u'representatives', 0.71981219799970608),\n", + " (u'river', 0.71959767653412277),\n", + " (u'gridley', 0.71526365099729405),\n", + " (u'letters', 0.71513361836592892),\n", + " (u'rock', 0.71513361836592892),\n", + " (u'use', 0.71454647320175191),\n", + " (u'washington', 
0.71171778456659318),\n", + " (u'central', 0.7099478124556069),\n", + " (u'election', 0.70657488299084736),\n", + " (u'tell', 0.70626746228172177),\n", + " (u'salem', 0.70613716823020756),\n", + " (u'section', 0.70306361551922913),\n", + " (u'radford', 0.69558617483618734),\n", + " (u'www', 0.69558617483618734),\n", + " (u'congress', 0.69556362076674905),\n", + " (u'candidate', 0.69184503181253953),\n", + " (u'hundred', 0.69164202681964948),\n", + " (u'counterfeit', 0.68742201968919669),\n", + " (u'ethical', 0.68742201968919669),\n", + " (u'hoblit', 0.68742201968919669),\n", + " (u'jones', 0.68742201968919669),\n", + " (u'silver', 0.68742201968919669),\n", + " (u'land', 0.68722988386024664),\n", + " (u'committee', 0.68422453224707569),\n", + " (u'logan', 0.68405655526716558),\n", + " (u'democrats', 0.67765948196948589),\n", + " (u'see', 0.67486874148436105),\n", + " (u'integrity', 0.67102513071863612),\n", + " (u'debt', 0.66934835459409747),\n", + " (u'session', 0.66560926895684913),\n", + " (u'paper', 0.65789112937712302),\n", + " (u'american', 0.65765491292160583),\n", + " (u'_atlantic', 0.65398232285618652),\n", + " (u'adjournment', 0.65398232285618652),\n", + " (u'advertising', 0.65398232285618652),\n", + " (u'affidavit', 0.65398232285618652),\n", + " (u'afraid', 0.65398232285618652),\n", + " (u'albany', 0.65398232285618652),\n", + " (u'altered', 0.65398232285618652),\n", + " (u'anti', 0.65398232285618652),\n", + " (u'armed', 0.65398232285618652),\n", + " (u'aspiration', 0.65398232285618652),\n", + " (u'ate', 0.65398232285618652),\n", + " (u'athens', 0.65398232285618652),\n", + " (u'baron', 0.65398232285618652),\n", + " (u'borrows', 0.65398232285618652),\n", + " (u'breaking', 0.65398232285618652),\n", + " (u'car', 0.65398232285618652),\n", + " (u'cargo', 0.65398232285618652),\n", + " (u'cedar', 0.65398232285618652),\n", + " (u'cheap', 0.65398232285618652),\n", + " (u'chew', 0.65398232285618652),\n", + " (u'clarke', 0.65398232285618652),\n", + " (u'coles', 
0.65398232285618652),\n", + " (u'combat', 0.65398232285618652),\n", + " (u'constitution', 0.65398232285618652),\n", + " (u'crippled', 0.65398232285618652),\n", + " (u'customer', 0.65398232285618652),\n", + " (u'dared', 0.65398232285618652),\n", + " (u'darkness', 0.65398232285618652),\n", + " (u'deftly', 0.65398232285618652),\n", + " (u'derivative', 0.65398232285618652),\n", + " (u'doubleday', 0.65398232285618652),\n", + " (u'dresser', 0.65398232285618652),\n", + " (u'eleanor', 0.65398232285618652),\n", + " (u'emulation', 0.65398232285618652),\n", + " (u'entity', 0.65398232285618652),\n", + " (u'equipment', 0.65398232285618652),\n", + " (u'estimation', 0.65398232285618652),\n", + " (u'exceptional', 0.65398232285618652),\n", + " (u'exert', 0.65398232285618652),\n", + " (u'expedition', 0.65398232285618652),\n", + " (u'fortified', 0.65398232285618652),\n", + " (u'frail', 0.65398232285618652),\n", + " (u'francis', 0.65398232285618652),\n", + " (u'fun', 0.65398232285618652),\n", + " (u'generations', 0.65398232285618652),\n", + " (u'gov', 0.65398232285618652),\n", + " (u'grip', 0.65398232285618652),\n", + " (u'grudge', 0.65398232285618652),\n", + " (u'hartford', 0.65398232285618652),\n", + " (u'hawk_', 0.65398232285618652),\n", + " (u'heirs', 0.65398232285618652),\n", + " (u'hicks', 0.65398232285618652),\n", + " (u'hit', 0.65398232285618652),\n", + " (u'hog', 0.65398232285618652),\n", + " (u'ignorant', 0.65398232285618652),\n", + " (u'instinct', 0.65398232285618652),\n", + " (u'irwin', 0.65398232285618652),\n", + " (u'kankakee', 0.65398232285618652),\n", + " (u'keckley', 0.65398232285618652),\n", + " (u'ketcham', 0.65398232285618652),\n", + " (u'killed', 0.65398232285618652),\n", + " (u'kingsbury', 0.65398232285618652),\n", + " (u'lasted', 0.65398232285618652),\n", + " (u'licensed', 0.65398232285618652),\n", + " (u'links', 0.65398232285618652),\n", + " (u'lippincott', 0.65398232285618652),\n", + " (u'load', 0.65398232285618652),\n", + " (u'mania', 
0.65398232285618652),\n", + " (u'marries', 0.65398232285618652),\n", + " (u'mcclurg', 0.65398232285618652),\n", + " (u'md', 0.65398232285618652),\n", + " (u'mile', 0.65398232285618652),\n", + " (u'miller', 0.65398232285618652),\n", + " (u'monthly_', 0.65398232285618652),\n", + " (u'morally', 0.65398232285618652),\n", + " (u'necessity', 0.65398232285618652),\n", + " (u'norris', 0.65398232285618652),\n", + " (u'norton', 0.65398232285618652),\n", + " (u'opening', 0.65398232285618652),\n", + " (u'orr', 0.65398232285618652),\n", + " (u'ossian', 0.65398232285618652),\n", + " (u'overheard', 0.65398232285618652),\n", + " (u'parent', 0.65398232285618652),\n", + " (u'peachy', 0.65398232285618652),\n", + " (u'petition', 0.65398232285618652),\n", + " (u'pinching', 0.65398232285618652),\n", + " (u'plunged', 0.65398232285618652),\n", + " (u'plutarch', 0.65398232285618652),\n", + " (u'policies', 0.65398232285618652),\n", + " (u'pound', 0.65398232285618652),\n", + " (u'presenting', 0.65398232285618652),\n", + " (u'press_', 0.65398232285618652),\n", + " (u'primm', 0.65398232285618652),\n", + " (u'procured', 0.65398232285618652),\n", + " (u'profaned', 0.65398232285618652),\n", + " (u'prominently', 0.65398232285618652),\n", + " (u'pushing', 0.65398232285618652),\n", + " (u'qualified', 0.65398232285618652),\n", + " (u'questionable', 0.65398232285618652),\n", + " (u'ramsay', 0.65398232285618652),\n", + " (u'rapids', 0.65398232285618652),\n", + " (u'recollection', 0.65398232285618652),\n", + " (u'religion', 0.65398232285618652),\n", + " (u'reluctantly', 0.65398232285618652),\n", + " (u'render', 0.65398232285618652),\n", + " (u'repeal', 0.65398232285618652),\n", + " (u'represent', 0.65398232285618652),\n", + " (u'resourcefulness', 0.65398232285618652),\n", + " (u'responsible', 0.65398232285618652),\n", + " (u'retaining', 0.65398232285618652),\n", + " (u'review_', 0.65398232285618652),\n", + " (u'revolution', 0.65398232285618652),\n", + " (u'ridiculous', 0.65398232285618652),\n", + " 
(u'rightful', 0.65398232285618652),\n", + " (u'ruled', 0.65398232285618652),\n", + " (u'rural', 0.65398232285618652),\n", + " (u'satisfactory', 0.65398232285618652),\n", + " (u'sayings', 0.65398232285618652),\n", + " (u'scarcely', 0.65398232285618652),\n", + " (u'score', 0.65398232285618652),\n", + " (u'sheep', 0.65398232285618652),\n", + " (u'shuffling', 0.65398232285618652),\n", + " (u'sleeves', 0.65398232285618652),\n", + " (u'sources', 0.65398232285618652),\n", + " (u'street', 0.65398232285618652),\n", + " (u'sues', 0.65398232285618652),\n", + " (u'suing', 0.65398232285618652),\n", + " (u'sumner', 0.65398232285618652),\n", + " (u'sundry', 0.65398232285618652),\n", + " (u'suspicious', 0.65398232285618652),\n", + " (u'taylor', 0.65398232285618652),\n", + " (u'torch', 0.65398232285618652),\n", + " (u'trent', 0.65398232285618652),\n", + " (u'umbrella', 0.65398232285618652),\n", + " (u'unassuming', 0.65398232285618652),\n", + " (u'ungainly', 0.65398232285618652),\n", + " (u'user', 0.65398232285618652),\n", + " (u'victim', 0.65398232285618652),\n", + " (u'vs', 0.65398232285618652),\n", + " (u'wildcat', 0.65398232285618652),\n", + " (u'xxxvii', 0.65398232285618652),\n", + " (u'opponent', 0.65154286586887178),\n", + " (u'trial', 0.64758856417750366),\n", + " (u'edition', 0.64663751443697892),\n", + " (u'onstot', 0.64663751443697892),\n", + " (u'robert', 0.64050211515912991),\n", + " (u'october', 0.6401563236722736),\n", + " (u'also', 0.6385436730575007),\n", + " (u'davis', 0.63634793477555185),\n", + " (u'additional', 0.63350513665219177),\n", + " (u'enemies', 0.63350513665219177),\n", + " (u'pleas', 0.63350513665219177),\n", + " (u'provided', 0.63350513665219177),\n", + " (u'rev', 0.63350513665219177),\n", + " (u'stranger', 0.63350513665219177),\n", + " (u'_versus_', 0.63350513665219133),\n", + " (u'allen', 0.63350513665219133),\n", + " (u'brockett', 0.63350513665219133),\n", + " (u'editorial', 0.63350513665219133),\n", + " (u'emerson', 0.63350513665219133),\n", + " 
(u'enlarged', 0.63350513665219133),\n", + " (u'manuscript', 0.63350513665219133),\n", + " (u'massachusetts', 0.63350513665219133),\n", + " (u'patterson', 0.63350513665219133),\n", + " (u'raymond', 0.63350513665219133),\n", + " (u'smoot', 0.63350513665219133),\n", + " (u'weekly_', 0.63350513665219133),\n", + " (u'states', 0.63133846043671316),\n", + " (u'l', 0.62992122927427241),\n", + " (u'advised', 0.62946431470002273),\n", + " (u'agent', 0.62946431470002273),\n", + " (u'bunn', 0.62946431470002273),\n", + " (u'controversy', 0.62946431470002273),\n", + " (u'josiah', 0.62946431470002273),\n", + " (u'legislative', 0.62946431470002273),\n", + " (u'online', 0.62946431470002273),\n", + " (u'pigeon', 0.62946431470002273),\n", + " (u'spencer', 0.62946431470002273),\n", + " (u'adjourned', 0.62946431470002251),\n", + " (u'inside', 0.62946431470002251),\n", + " (u'jesse', 0.62946431470002251),\n", + " (u'jurymen', 0.62946431470002251),\n", + " (u'million', 0.62946431470002251),\n", + " (u'moon', 0.62946431470002251),\n", + " (u'preacher', 0.62946431470002251),\n", + " (u'using', 0.62946431470002251),\n", + " (u'or', 0.62843784044209627),\n", + " (u'litigation', 0.62545126022927233),\n", + " (u'newspaper', 0.62545126022927233),\n", + " (u'defendant', 0.62290734657589919),\n", + " (u'my', 0.62258846505370258),\n", + " (u'associated', 0.62098413888097026),\n", + " (u'counsel', 0.61859284602665277),\n", + " (u'toward', 0.61454730339646257),\n", + " (u'history', 0.61124124826162474),\n", + " (u'quoted', 0.6084865654230116),\n", + " (u'issue', 0.60729254662829124),\n", + " (u'dollar', 0.60729254662829035),\n", + " (u'major', 0.60362253558252288),\n", + " (u'prairie', 0.60362253558252288),\n", + " (u'experiences', 0.60362253558252243),\n", + " (u'legislation', 0.60362253558252243),\n", + " (u'_vs', 0.6035456305599225),\n", + " (u'historical', 0.6035456305599225),\n", + " (u'produced', 0.6035456305599225),\n", + " (u'bergen', 0.60354563055992205),\n", + " (u'sheriff', 
0.60354563055992205),\n", + " (u'springfield', 0.60331495800961132),\n", + " (u'i', 0.60133778763850376),\n", + " (u'horse', 0.60121703239130708),\n", + " (u'century', 0.60105378988157332),\n", + " (u'clients', 0.59743816921816073),\n", + " (u'goes', 0.59710003352803476),\n", + " (u'charge', 0.59649450823617922),\n", + " (u'murder', 0.59481219799970608),\n", + " (u'plain', 0.58605571895428143),\n", + " (u'us', 0.58221475738787731),\n", + " (u'town', 0.58012788024267969),\n", + " (u'house', 0.57887700145220933),\n", + " (u'indiana', 0.57824649026730324),\n", + " (u'politicians', 0.57824649026730324),\n", + " (u'evidence', 0.5773563125339356),\n", + " (u'united', 0.5773563125339356),\n", + " (u'_lincoln_', 0.57724903182531806),\n", + " (u'dear', 0.57724903182531762),\n", + " (u'we', 0.57676182641453266),\n", + " (u'based', 0.57524724626967227),\n", + " (u'farmer', 0.57524724626967227),\n", + " (u'statute', 0.57524724626967227),\n", + " (u'your', 0.57484862840063666),\n", + " (u'm', 0.56962689226253538),\n", + " (u'testimony', 0.56654837085837473),\n", + " (u't', 0.56572841285799047),\n", + " (u'why', 0.56025440339254207),\n", + " (u'judge', 0.55838252725925042),\n", + " (u'questions', 0.55586577584685459),\n", + " (u'politician', 0.55230686834859721),\n", + " (u'poor', 0.55230686834859721),\n", + " (u'mr', 0.55178896100960007),\n", + " (u'illustrated', 0.55089119891374239),\n", + " (u'south', 0.55089119891374239),\n", + " (u'bad', 0.55041258498620138),\n", + " (u'wrong', 0.54883327250182479),\n", + " (u'attorneys', 0.54775614656095595),\n", + " (u'sure', 0.54763172911448921),\n", + " (u'nomination', 0.5437356699215532),\n", + " (u'coat', 0.53994711260662109),\n", + " (u'corporation', 0.53994711260662109),\n", + " (u'hapgood', 0.53994711260662109),\n", + " (u'larger', 0.53994711260662109),\n", + " (u'match', 0.53994711260662109),\n", + " (u'matteson', 0.53994711260662109),\n", + " (u'requirements', 0.53994711260662109),\n", + " (u'team', 0.53994711260662109),\n", + " 
(u'understanding', 0.53994711260662109),\n", + " (u'speech', 0.53442414980620789),\n", + " (u'carpenter', 0.5334454691791124),\n", + " (u'dennis', 0.5334454691791124),\n", + " (u'funds', 0.5334454691791124),\n", + " (u'papers', 0.5334454691791124),\n", + " (u'since', 0.5334454691791124),\n", + " (u'stage', 0.5334454691791124),\n", + " (u'jackson', 0.53344546917911195),\n", + " (u'duff', 0.52813378536617561),\n", + " (u'hat', 0.52813378536617561),\n", + " (u'wants', 0.52813378536617561),\n", + " (u'services', 0.52723560020577054),\n", + " (u'_mcclure', 0.5256640649145532),\n", + " (u'borrowed', 0.5256640649145532),\n", + " (u'bush', 0.5256640649145532),\n", + " (u'collect', 0.5256640649145532),\n", + " (u'demand', 0.5256640649145532),\n", + " (u'examined', 0.5256640649145532),\n", + " (u'majority', 0.5256640649145532),\n", + " (u'marshall', 0.5256640649145532),\n", + " (u'mean', 0.5256640649145532),\n", + " (u'n', 0.5256640649145532),\n", + " (u'partners', 0.5256640649145532),\n", + " (u'thompson', 0.5256640649145532),\n", + " (u'voting', 0.5256640649145532),\n", + " (u'worn', 0.5256640649145532),\n", + " (u'night', 0.52052348222604294),\n", + " (u'prove', 0.5183559396399664),\n", + " (u'records', 0.5183559396399664),\n", + " (u'woman', 0.5183559396399664),\n", + " (u'honesty', 0.51731221687018003),\n", + " (u'cent', 0.51717897687248238),\n", + " (u'curtis', 0.51717897687248238),\n", + " (u'daniel', 0.51717897687248238),\n", + " (u'examination', 0.51717897687248238),\n", + " (u'harris', 0.51717897687248238),\n", + " (u'st', 0.51717897687248238),\n", + " (u'o', 0.51599086537547656),\n", + " (u'white', 0.51014351178890438),\n", + " (u'law', 0.5078144592914029),\n", + " (u'our', 0.5025203678086001),\n", + " (u'cast', 0.50045126022927233),\n", + " (u'thousand', 0.50045126022927233),\n", + " (u'assembly', 0.49933052273546652),\n", + " (u'dr', 0.49933052273546652),\n", + " (u'recollections', 0.49933052273546652),\n", + " (u'whigs', 0.49933052273546652),\n", + " 
(u'capital', 0.49847712589366155),\n", + " (u'black', 0.49381656235726812),\n", + " (u'attorney', 0.49089323167751253),\n", + " (u'verdict', 0.48882938154224975),\n", + " (u'home', 0.4875980689579178),\n", + " (u'cause', 0.48750152519474277),\n", + " (u'_boy', 0.47944759251401625),\n", + " (u'_ibid', 0.47944759251401625),\n", + " (u'_times_', 0.47944759251401625),\n", + " (u'_woman', 0.47944759251401625),\n", + " (u'angel', 0.47944759251401625),\n", + " (u'anticipated', 0.47944759251401625),\n", + " (u'anxiety', 0.47944759251401625),\n", + " (u'argue', 0.47944759251401625),\n", + " (u'arguments', 0.47944759251401625),\n", + " (u'atlantic', 0.47944759251401625),\n", + " (u'average', 0.47944759251401625),\n", + " (u'aversion', 0.47944759251401625),\n", + " (u'bankruptcy', 0.47944759251401625),\n", + " (u'beale', 0.47944759251401625),\n", + " (u'blackwell', 0.47944759251401625),\n", + " (u'breach', 0.47944759251401625),\n", + " (u'brougham', 0.47944759251401625),\n", + " (u'browning', 0.47944759251401625),\n", + " (u'careful', 0.47944759251401625),\n", + " (u'celebrated', 0.47944759251401625),\n", + " (u'civic', 0.47944759251401625),\n", + " (u'cloak', 0.47944759251401625),\n", + " (u'club', 0.47944759251401625),\n", + " (u'companion_', 0.47944759251401625),\n", + " (u'conception', 0.47944759251401625),\n", + " (u'conor', 0.47944759251401625),\n", + " (u'constituents', 0.47944759251401625),\n", + " (u'convincing', 0.47944759251401625),\n", + " (u'differed', 0.47944759251401625),\n", + " (u'discredited', 0.47944759251401625),\n", + " (u'dispute', 0.47944759251401625),\n", + " (u'dissolution', 0.47944759251401625),\n", + " (u'double', 0.47944759251401625),\n", + " (u'draft', 0.47944759251401625),\n", + " (u'employer', 0.47944759251401625),\n", + " (u'essay', 0.47944759251401625),\n", + " (u'etc', 0.47944759251401625),\n", + " (u'exercised', 0.47944759251401625),\n", + " (u'existed', 0.47944759251401625),\n", + " (u'farming', 0.47944759251401625),\n", + " (u'fellows', 
0.47944759251401625),\n", + " (u'forge', 0.47944759251401625),\n", + " (u'fuller', 0.47944759251401625),\n", + " (u'gen', 0.47944759251401625),\n", + " (u'generation', 0.47944759251401625),\n", + " (u'giant', 0.47944759251401625),\n", + " (u'girl', 0.47944759251401625),\n", + " (u'glance', 0.47944759251401625),\n", + " (u'graham', 0.47944759251401625),\n", + " (u'grand', 0.47944759251401625),\n", + " (u'hall', 0.47944759251401625),\n", + " (u'hammond', 0.47944759251401625),\n", + " (u'handkerchief', 0.47944759251401625),\n", + " (u'hannah', 0.47944759251401625),\n", + " (u'happens', 0.47944759251401625),\n", + " (u'harding', 0.47944759251401625),\n", + " ...]" + ] + }, + "execution_count": 13, "metadata": {}, - "source": [ - "This time around, the summary is not of high quality, as it does not tell us much about the movie. In a way, this might not be the algorithms fault, rather this text simply doesn't contain one or two sentences that capture the essence of the text as in \"The Matrix\" synopsis.\n", - "\n", - "The keywords, however, managed to find some of the main characters.\n", - "\n", - "

Performance

\n", - "\n", - "We will test how the speed of the summarizer scales with the size of the dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 processor. Note that the summarizer does not support multithreading (parallel processing).\n", - "\n", - "The tests were run on the book \"Honest Abe\" by Alonzo Rothschild. Download the book in plain-text here. \n", - "\n", - "In the plot below, we see the running times together with the sizes of the datasets. To create datasets of different sizes, we have simply taken prefixes of text; in other words we take the first n characters of the book. The algorithm seems to be quadratic in time, so one needs to be careful before plugging a large dataset into the summarizer.\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - "

Text-content dependent running times

\n", - "\n", - "The running time is not only dependent on the size of the dataset. For example, summarizing \"The Matrix\" synopsis (about 36,000 characters) takes about 3.1 seconds, while summarizing 35,000 characters of this book takes about 8.5 seconds. So the former is more than twice as fast. \n", - "\n", - "One reason for this difference in running times is the data structure that is used. The algorithm represents the data using a graph, where vertices (nodes) are sentences, and then constructs weighted edges between the vertices that represent how the sentences relate to each other. This means that every piece of text will have a different graph, thus making the running times different. The size of this data structure is quadratic in the worst case (the worst case is when each vertex has an edge to every other vertex).\n", - "\n", - "Another possible reason for the difference in running times is that the problems converge at different rates, meaning that the error drops slower for some datasets than for others.\n" - ] + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "mz_keywords(text,scores=True,weighted=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When this option is used, it's possible to calculate a threshold automatically from the number of blocks. 
This is likely to be most useful when the number of blocks is fairly small (<10)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(u'gutenberg', 3.7766363961259684),\n", + " (u'tm', 3.6403066998316511),\n", + " (u'project', 3.5428530523255342),\n", + " (u'co', 3.2983688146004528),\n", + " (u'donations', 2.8613536046553563),\n", + " (u'electronic', 2.8210861922674084),\n", + " (u'access', 2.7810662866642568),\n", + " (u'refund', 2.7810662866642568),\n", + " (u'foundation', 2.7234464816769872),\n", + " (u'foxboro', 2.5477601487545121),\n", + " (u'gloves', 2.5281337853661761),\n", + " (u'e', 2.4036269322210768),\n", + " (u'york', 2.3692008259770594),\n", + " (u'edited', 2.361641829495754),\n", + " (u'_works_', 2.3445174072327686),\n", + " (u'works', 2.3426500474551113),\n", + " (u'dogskin', 2.3425994588269479),\n", + " (u'ragsdale', 2.2931552327841351),\n", + " (u'replacement', 2.2931552327841351),\n", + " (u'trunks', 2.2931552327841351),\n", + " (u'iv', 2.2510299269025058),\n", + " (u'iii', 2.2186807817292546),\n", + " (u'v', 2.2168420707754368),\n", + " (u'brokaw', 2.1699176369612583),\n", + " (u'coon', 2.1699176369612583),\n", + " (u'bonds', 2.1343080503770544),\n", + " (u'license', 2.1009287665795293),\n", + " (u'ii', 2.0892470886183649),\n", + " (u'agreement', 2.0779209847210556),\n", + " (u'almanac', 2.0060727272918055),\n", + " (u'_weekly_', 1.9794475925140163),\n", + " (u'bounded', 1.9794475925140163),\n", + " (u'format', 1.9794475925140163),\n", + " (u'millions', 1.9794475925140163),\n", + " (u'oxen', 1.9794475925140163),\n", + " (u'specie', 1.9794475925140163),\n", + " (u'archive', 1.9682995275030786),\n", + " (u'barrett', 1.9422319940872796),\n", + " (u'reminiscences', 1.9330537427622287),\n", + " (u'ebooks', 1.8984698469769548),\n", + " (u'forquer', 1.8843080503770544),\n", + " (u'parker', 1.8843080503770544),\n", + " (u'pglaf', 
1.8843080503770544),\n", + " (u'ebook', 1.8838775575675983),\n", + " (u'trademark', 1.8838775575675983),\n", + " (u'paragraph', 1.8301079379685583),\n", + " (u'hardin', 1.7669683658081703),\n", + " (u'work', 1.7328354724344326),\n", + " (u'rothschild', 1.7275730939964973),\n", + " (u'org', 1.7211393195188851),\n", + " (u'attitude', 1.716230650790012),\n", + " (u'london', 1.6791112857988695),\n", + " (u'boston', 1.6754810009833907),\n", + " (u'xvi', 1.66018729770736),\n", + " (u'news', 1.6601872977073597),\n", + " (u'biographical', 1.6294643147000225),\n", + " (u'green', 1.6254512602292723),\n", + " (u'delegates', 1.6127555612626692),\n", + " (u'medium', 1.6127555612626692),\n", + " (u'scripps', 1.6127555612626692),\n", + " (u'volunteers', 1.6127555612626692),\n", + " (u'lamon', 1.6001560607245646),\n", + " (u'tarbell', 1.5897346234235084),\n", + " (u'volumes', 1.5819481863246514),\n", + " (u'bank', 1.5744728128489647),\n", + " (u'copyright', 1.5731550611734115),\n", + " (u'_via_', 1.5722781569106761),\n", + " (u'admissibility', 1.5722781569106761),\n", + " (u'advertisers', 1.5722781569106761),\n", + " (u'applicable', 1.5722781569106761),\n", + " (u'attire', 1.5722781569106761),\n", + " (u'bags', 1.5722781569106761),\n", + " (u'berries', 1.5722781569106761),\n", + " (u'breeches', 1.5722781569106761),\n", + " (u'cline', 1.5722781569106761),\n", + " (u'continuance', 1.5722781569106761),\n", + " (u'currents', 1.5722781569106761),\n", + " (u'daguerreotype', 1.5722781569106761),\n", + " (u'disclaimer', 1.5722781569106761),\n", + " (u'email', 1.5722781569106761),\n", + " (u'enrolled', 1.5722781569106761),\n", + " (u'fool', 1.5722781569106761),\n", + " (u'guineas', 1.5722781569106761),\n", + " (u'hatchet', 1.5722781569106761),\n", + " (u'instruct', 1.5722781569106761),\n", + " (u'liability', 1.5722781569106761),\n", + " (u'lonny', 1.5722781569106761),\n", + " (u'paullin', 1.5722781569106761),\n", + " (u'performing', 1.5722781569106761),\n", + " (u'plow', 
1.5722781569106761),\n", + " (u'polite', 1.5722781569106761),\n", + " (u'puffs', 1.5722781569106761),\n", + " (u'rulings', 1.5722781569106761),\n", + " (u'scammon', 1.5722781569106761),\n", + " (u'tilda', 1.5722781569106761),\n", + " (u'wake', 1.5722781569106761),\n", + " (u'warranties', 1.5722781569106761),\n", + " (u'america', 1.5712271378967728),\n", + " (u'clair', 1.5712271378967728),\n", + " (u'displaying', 1.5712271378967728),\n", + " (u'forgery', 1.5712271378967728),\n", + " (u'holder', 1.5712271378967728),\n", + " (u'posted', 1.5712271378967728),\n", + " (u'sketches', 1.5712271378967728),\n", + " (u'snow', 1.5712271378967728),\n", + " (u'wore', 1.5712271378967728),\n", + " (u'http', 1.5645865830262038),\n", + " (u'journalism', 1.5399471126066209),\n", + " (u'copy', 1.5258495075146912),\n", + " (u'_early', 1.5202411939312348),\n", + " (u'armstrong', 1.5106440743450187),\n", + " (u'railroad', 1.4938165623572677),\n", + " (u'ross', 1.489097832809857),\n", + " (u'pair', 1.4791112857988695),\n", + " (u'banks', 1.4791112857988693),\n", + " (u'irelan', 1.4791112857988693),\n", + " (u'scott', 1.4791112857988693),\n", + " (u'browne', 1.4764336408243595),\n", + " (u'abraham', 1.4577679329151634),\n", + " (u'publication', 1.4490612388306794),\n", + " (u'provide', 1.4490612388306792),\n", + " (u'chiniquy', 1.4275140308616106),\n", + " (u'literary', 1.4150354420715021),\n", + " (u'rr', 1.4070491486733681),\n", + " (u'axe', 1.3967912341407889),\n", + " (u'fence', 1.3967912341407889),\n", + " (u'genuine', 1.3967912341407889),\n", + " (u'life_', 1.3941370904272503),\n", + " (u'she', 1.3923582867044937),\n", + " (u'copper', 1.3828069220574104),\n", + " (u'distributing', 1.3828069220574104),\n", + " (u'saddle', 1.3828069220574104),\n", + " (u'sons', 1.3828069220574104),\n", + " (u'_life_', 1.373910241709706),\n", + " (u'calhoun', 1.373910241709706),\n", + " (u'mother', 1.3728688332198922),\n", + " (u'college', 1.3697302821858961),\n", + " (u'nicolay', 
1.3633245760231363),\n", + " (u'whitney', 1.3627575629840512),\n", + " (u'philadelphia', 1.3540886863558637),\n", + " (u'sarah', 1.3540886863558634),\n", + " (u'vi', 1.3540886863558634),\n", + " (u'harrison', 1.3476159735283106),\n", + " (u'terms', 1.3426509824683515),\n", + " (u'herndon', 1.3421892681433798),\n", + " (u'improvement', 1.329344333012155),\n", + " (u'buckskin', 1.3222046383294666),\n", + " (u'sham', 1.3222046383294666),\n", + " (u'fee', 1.3158554460066139),\n", + " (u'generosity', 1.3144503596878891),\n", + " (u'moore', 1.3144503596878887),\n", + " (u'copies', 1.3127747798184011),\n", + " (u'p', 1.309088202039181),\n", + " (u'compliance', 1.2961309813666892),\n", + " (u'constable', 1.2961309813666892),\n", + " (u'currency', 1.2961309813666892),\n", + " (u'distribution', 1.2961309813666892),\n", + " (u'harvey', 1.2961309813666892),\n", + " (u'individual', 1.2961309813666892),\n", + " (u'revolutionary', 1.2961309813666892),\n", + " (u'brooks', 1.286562189794501),\n", + " (u'chicago', 1.2700186510810929),\n", + " (u'weems', 1.2659709073661847),\n", + " (u'february', 1.2574199029295277),\n", + " (u'information', 1.2487001310514776),\n", + " (u'bridge', 1.2326416539256813),\n", + " (u'resolution', 1.2268390166084573),\n", + " (u'stoddard', 1.2268390166084573),\n", + " (u'father', 1.2254034208363418),\n", + " (u'cartwright', 1.2157428532629155),\n", + " (u'houghton', 1.2157428532629155),\n", + " (u'publishing', 1.2157428532629155),\n", + " (u'describes', 1.2157428532629153),\n", + " (u'j', 1.2115310804189017),\n", + " (u'_stories_', 1.2049337080807629),\n", + " (u'september', 1.2030636155192291),\n", + " (u'boys', 1.1974364414369618),\n", + " (u'defendants', 1.1955861748361873),\n", + " (u'per', 1.1955861748361873),\n", + " (u'permission', 1.1955861748361873),\n", + " (u'uncle', 1.1955861748361873),\n", + " (u'thomas', 1.1924565577943991),\n", + " (u'trade', 1.1918333507609624),\n", + " (u'f', 1.1915163381561049),\n", + " (u'store', 1.189052998865439),\n", 
+ " (u'notes', 1.1850922942502753),\n", + " (u'baker', 1.1828856976412236),\n", + " (u'baddeley', 1.1681694680548835),\n", + " (u'cogdal', 1.1681694680548835),\n", + " (u'copying', 1.1681694680548835),\n", + " (u'crafton', 1.1681694680548835),\n", + " (u'defect', 1.1681694680548835),\n", + " (u'donate', 1.1681694680548835),\n", + " (u'easier', 1.1681694680548835),\n", + " (u'editions', 1.1681694680548835),\n", + " (u'hawley', 1.1681694680548835),\n", + " (u'hitchcock', 1.1681694680548835),\n", + " (u'jake', 1.1681694680548835),\n", + " (u'jewelry', 1.1681694680548835),\n", + " (u'jurors', 1.1681694680548835),\n", + " (u'lightning', 1.1681694680548835),\n", + " (u'machine', 1.1681694680548835),\n", + " (u'paragraphs', 1.1681694680548835),\n", + " (u'pg', 1.1681694680548835),\n", + " (u'pork', 1.1681694680548835),\n", + " (u'retains', 1.1681694680548835),\n", + " (u'rod', 1.1681694680548835),\n", + " (u'securities', 1.1681694680548835),\n", + " (u'status', 1.1681694680548835),\n", + " (u'trousers', 1.1681694680548835),\n", + " (u'unpublished', 1.1681694680548835),\n", + " (u'berry', 1.1644932670010606),\n", + " (u'pp', 1.1608077284905565),\n", + " (u'hanks', 1.1587285139891437),\n", + " (u'mcclure', 1.1537352404836496),\n", + " (u'her', 1.1531891574151381),\n", + " (u'hamlin', 1.1529222466025137),\n", + " (u'speeches', 1.1437050469373577),\n", + " (u'kentucky', 1.1401563236722736),\n", + " (u'johnston', 1.1368073989967304),\n", + " (u'offutt', 1.1345503657246403),\n", + " (u'dress', 1.1343080503770544),\n", + " (u'german', 1.1343080503770544),\n", + " (u'matheney', 1.1343080503770544),\n", + " (u'company', 1.1298148326748745),\n", + " (u'g', 1.128517881924167),\n", + " (u'votes', 1.1187730676938106),\n", + " (u'nine', 1.113374076177045),\n", + " (u'charles', 1.1065580194728426),\n", + " (u'note', 1.0974655406391749),\n", + " (u'deed', 1.0970926363431248),\n", + " (u'east', 1.0970926363431248),\n", + " (u'spurious', 1.0970926363431248),\n", + " (u'atkinson', 
1.0970926363431244),\n", + " (u'comply', 1.0970926363431244),\n", + " (u'jewelers', 1.0970926363431244),\n", + " (u'leland', 1.0970926363431244),\n", + " (u'priest', 1.0970926363431244),\n", + " (u'soldier', 1.0970926363431244),\n", + " (u'd', 1.0936709970367389),\n", + " (u'tax', 1.0890978328098568),\n", + " (u'colonel', 1.0886122317272675),\n", + " (u'pitcher', 1.0886122317272675),\n", + " (u'spink', 1.0886122317272675),\n", + " (u'charter', 1.0886122317272673),\n", + " (u'clock', 1.0886122317272673),\n", + " (u'distribute', 1.0886122317272673),\n", + " (u'fisher', 1.0886122317272673),\n", + " (u'convention', 1.0842245322470756),\n", + " (u'plaintiff', 1.0813648643938589),\n", + " (u'island', 1.0791112857988696),\n", + " (u'voyage', 1.0772490318253176),\n", + " (u'you', 1.0716742799027257),\n", + " (u'road', 1.0587290524017576),\n", + " (u'holland', 1.05373524048365),\n", + " (u'trailor', 1.0479900750043671),\n", + " (u'limited', 1.0447190713617185),\n", + " (u'domain', 1.0399471126066209),\n", + " (u'grandfather', 1.0399471126066209),\n", + " (u'voted', 1.0399471126066209),\n", + " (u'agree', 1.0367857078081339),\n", + " (u'including', 1.0367857078081339),\n", + " (u'life', 1.0279778291629844),\n", + " (u'witness', 1.0249646422762066),\n", + " (u'james', 1.0153080476245506),\n", + " (u'stuart', 1.0149104889383316),\n", + " (u'dungee', 1.0102738780733427),\n", + " (u'john', 1.0074378828094916),\n", + " (u'surveyor', 1.0071083505332288),\n", + " (u'cross', 1.0008479040802145),\n", + " (u'dollars', 1.0002448365299736),\n", + " (u'president', 0.99828026284480487),\n", + " (u'_amount_', 0.99450922395310026),\n", + " (u'_black', 0.99450922395310026),\n", + " (u'_commercial', 0.99450922395310026),\n", + " (u'_magazine', 0.99450922395310026),\n", + " (u'_nicolay', 0.99450922395310026),\n", + " (u'_north', 0.99450922395310026),\n", + " (u'_sun_', 0.99450922395310026),\n", + " (u'accompanies', 0.99450922395310026),\n", + " (u'accordance', 0.99450922395310026),\n", + " 
(u'adjourning', 0.99450922395310026),\n", + " (u'advertiser', 0.99450922395310026),\n", + " (u'advertiser_', 0.99450922395310026),\n", + " (u'agnosticism', 0.99450922395310026),\n", + " (u'almanacs', 0.99450922395310026),\n", + " (u'animals', 0.99450922395310026),\n", + " (u'apparel', 0.99450922395310026),\n", + " (u'appoints', 0.99450922395310026),\n", + " (u'arbitrations', 0.99450922395310026),\n", + " (u'ascii', 0.99450922395310026),\n", + " (u'asks', 0.99450922395310026),\n", + " (u'aspirants', 0.99450922395310026),\n", + " (u'atrocious', 0.99450922395310026),\n", + " (u'attachment', 0.99450922395310026),\n", + " (u'authors', 0.99450922395310026),\n", + " (u'band', 0.99450922395310026),\n", + " (u'bargained', 0.99450922395310026),\n", + " (u'bets', 0.99450922395310026),\n", + " (u'bleeding', 0.99450922395310026),\n", + " (u'boats', 0.99450922395310026),\n", + " (u'book_', 0.99450922395310026),\n", + " (u'boss', 0.99450922395310026),\n", + " (u'bourgeois', 0.99450922395310026),\n", + " (u'bull', 0.99450922395310026),\n", + " (u'calf', 0.99450922395310026),\n", + " (u'chase', 0.99450922395310026),\n", + " (u'chicanery', 0.99450922395310026),\n", + " (u'coach', 0.99450922395310026),\n", + " (u'coins', 0.99450922395310026),\n", + " (u'comet', 0.99450922395310026),\n", + " (u'computer', 0.99450922395310026),\n", + " (u'computers', 0.99450922395310026),\n", + " (u'concentration', 0.99450922395310026),\n", + " (u'conquering', 0.99450922395310026),\n", + " (u'conservator', 0.99450922395310026),\n", + " (u'contentedly', 0.99450922395310026),\n", + " (u'copied', 0.99450922395310026),\n", + " (u'cord', 0.99450922395310026),\n", + " (u'cornell', 0.99450922395310026),\n", + " (u'countenance', 0.99450922395310026),\n", + " (u'counting', 0.99450922395310026),\n", + " (u'countryman', 0.99450922395310026),\n", + " (u'creeks', 0.99450922395310026),\n", + " (u'davy', 0.99450922395310026),\n", + " (u'deer', 0.99450922395310026),\n", + " (u'def', 0.99450922395310026),\n", + " 
(u'delegations', 0.99450922395310026),\n", + " (u'deliveries', 0.99450922395310026),\n", + " (u'demurrer', 0.99450922395310026),\n", + " (u'desires', 0.99450922395310026),\n", + " (u'detriment', 0.99450922395310026),\n", + " (u'directors', 0.99450922395310026),\n", + " (u'disallows', 0.99450922395310026),\n", + " (u'disgracing', 0.99450922395310026),\n", + " (u'doctoring', 0.99450922395310026),\n", + " (u'effectively', 0.99450922395310026),\n", + " (u'elections', 0.99450922395310026),\n", + " (u'electronically', 0.99450922395310026),\n", + " (u'enrolling', 0.99450922395310026),\n", + " (u'exempt', 0.99450922395310026),\n", + " (u'faded', 0.99450922395310026),\n", + " (u'fares', 0.99450922395310026),\n", + " (u'ff', 0.99450922395310026),\n", + " (u'fights', 0.99450922395310026),\n", + " (u'flatboat', 0.99450922395310026),\n", + " (u'founded', 0.99450922395310026),\n", + " (u'generals', 0.99450922395310026),\n", + " (u'goose', 0.99450922395310026),\n", + " (u'greed', 0.99450922395310026),\n", + " (u'groomsman', 0.99450922395310026),\n", + " (u'hagerty', 0.99450922395310026),\n", + " (u'hans', 0.99450922395310026),\n", + " (u'harvard', 0.99450922395310026),\n", + " (u'haute', 0.99450922395310026),\n", + " (u'heel', 0.99450922395310026),\n", + " (u'history_', 0.99450922395310026),\n", + " (u'homeliest', 0.99450922395310026),\n", + " (u'howard', 0.99450922395310026),\n", + " (u'hut', 0.99450922395310026),\n", + " (u'ice', 0.99450922395310026),\n", + " (u'ida', 0.99450922395310026),\n", + " (u'identical', 0.99450922395310026),\n", + " (u'imperialist', 0.99450922395310026),\n", + " (u'independent', 0.99450922395310026),\n", + " (u'invalid', 0.99450922395310026),\n", + " (u'irons', 0.99450922395310026),\n", + " (u'janet', 0.99450922395310026),\n", + " (u'justification', 0.99450922395310026),\n", + " (u'lamborn', 0.99450922395310026),\n", + " (u'lambs', 0.99450922395310026),\n", + " (u'larceny', 0.99450922395310026),\n", + " (u'latin', 0.99450922395310026),\n", + " 
(u'linen', 0.99450922395310026),\n", + " (u'locations', 0.99450922395310026),\n", + " (u'louder', 0.99450922395310026),\n", + " (u'mad', 0.99450922395310026),\n", + " (u'magruder', 0.99450922395310026),\n", + " (u'maid', 0.99450922395310026),\n", + " (u'metaphysical', 0.99450922395310026),\n", + " (u'mit', 0.99450922395310026),\n", + " (u'monthlies', 0.99450922395310026),\n", + " (u'nest', 0.99450922395310026),\n", + " (u'nigger', 0.99450922395310026),\n", + " (u'package', 0.99450922395310026),\n", + " (u'pan', 0.99450922395310026),\n", + " (u'parentage', 0.99450922395310026),\n", + " (u'partial', 0.99450922395310026),\n", + " (u'partly', 0.99450922395310026),\n", + " (u'passengers', 0.99450922395310026),\n", + " (u'pension', 0.99450922395310026),\n", + " (u'pl', 0.99450922395310026),\n", + " (u'playful', 0.99450922395310026),\n", + " (u'population', 0.99450922395310026),\n", + " (u'postponed', 0.99450922395310026),\n", + " (u'postponement', 0.99450922395310026),\n", + " (u'premise', 0.99450922395310026),\n", + " (u'pressure', 0.99450922395310026),\n", + " (u'presumption', 0.99450922395310026),\n", + " (u'preventing', 0.99450922395310026),\n", + " (u'quart', 0.99450922395310026),\n", + " (u'quincy', 0.99450922395310026),\n", + " (u'quorum', 0.99450922395310026),\n", + " (u'redistribution', 0.99450922395310026),\n", + " (u'rejoicing', 0.99450922395310026),\n", + " (u'remit', 0.99450922395310026),\n", + " (u'rifle', 0.99450922395310026),\n", + " (u'romance', 0.99450922395310026),\n", + " (u'rothschild_', 0.99450922395310026),\n", + " (u'row', 0.99450922395310026),\n", + " (u'rubbish', 0.99450922395310026),\n", + " (u'sacrifices', 0.99450922395310026),\n", + " (u'scroll', 0.99450922395310026),\n", + " (u'shade', 0.99450922395310026),\n", + " (u'shed', 0.99450922395310026),\n", + " (u'sigh', 0.99450922395310026),\n", + " (u'silk', 0.99450922395310026),\n", + " (u'sinewy', 0.99450922395310026),\n", + " (u'sock', 0.99450922395310026),\n", + " (u'solicit', 
0.99450922395310026),\n", + " (u'solvent', 0.99450922395310026),\n", + " (u'sonny', 0.99450922395310026),\n", + " (u'startling', 0.99450922395310026),\n", + " (u'steals', 0.99450922395310026),\n", + " (u'steamer', 0.99450922395310026),\n", + " (u'stevenson', 0.99450922395310026),\n", + " (u'subp\\u0153naed', 0.99450922395310026),\n", + " (u'tanned', 0.99450922395310026),\n", + " (u'tea', 0.99450922395310026),\n", + " (u'terre', 0.99450922395310026),\n", + " (u'theosophy', 0.99450922395310026),\n", + " (u'tight', 0.99450922395310026),\n", + " (u'tis', 0.99450922395310026),\n", + " (u'tour', 0.99450922395310026),\n", + " (u'vanilla', 0.99450922395310026),\n", + " (u'vol', 0.99450922395310026),\n", + " (u'warfare', 0.99450922395310026),\n", + " (u'warranty', 0.99450922395310026),\n", + " (u'wayne', 0.99450922395310026),\n", + " (u'whip', 0.99450922395310026),\n", + " (u'woodcut', 0.99450922395310026),\n", + " (u'wright', 0.99450922395310026),\n", + " (u'new', 0.99212250974463601)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mz_keywords(text,scores=True,weighted=False,threshold='auto')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The expected complexity of this algorithm is **O**(*Nw*), where *N* is the number of words in the text, and *w* is the number of unique words in the text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } - ] -} \ No newline at end of file + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index 8f990b68e6..e60cce8e0e 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -9,64 +9,111 @@ import numpy import scipy -def mz_keywords(text,blocksize=1024,scores=False,split=False,weighted=True,threshold=0.0): - """Extract keywords from text using the Montemurro and Zanette entropy algorithm. - https://arxiv.org/abs/0907.1558 - :param text: str (document to summarize) - :param blocksize: int (size of blocks to use in analysis) - :params scores: bool (return score with keywords) - :params split: bool (return results as list) - :params weighted: bool (weight scores by word frequency. 
- False is useful for shorter texts) - :params threshold: float or 'auto' (minimum score for returned keywords - 'auto' calculates the threshold as - nblocks/(nblocks+1.0) - Use 'auto' with weighted=False)""" - text=to_unicode(text) - words=[word for word in _tokenize_by_word(text)] - vocab=sorted(set(words)) - wordcounts=numpy.array([[words[i:i+blocksize].count(word) for word in vocab] - for i in range(0,len(words),blocksize)]).astype('d') - nblocks=wordcounts.shape[0] - totals=wordcounts.sum(axis=0) - nwords=totals.sum() - p=wordcounts/totals - logp=numpy.log2(p) - H=numpy.nan_to_num((p*logp),0.0).sum(axis=0) +def mz_keywords(text, + blocksize=1024, + scores=False, + split=False, + weighted=True, + threshold=0.0): + """Extract keywords from text using the Montemurro and Zanette entropy + algorithm. [1]_ + + Parameters + ---------- + text: str + document to summarize + blocksize: int, optional + size of blocks to use in analysis, default is 1024 + scores: bool, optional + Whether to return score with keywords, default is False + split: bool, optional + Whether to return results as list, default is False + weighted: bool, optional + Whether to weight scores by word frequency. Default is True. + False can useful for shorter texts, and allows automatic thresholding + threshold: float or 'auto', optional + minimum score for returned keywords, default 0.0 + 'auto' calculates the threshold as nblocks / (nblocks + 1.0) + 1.0e-8 + Use 'auto' with weighted=False) + + Returns + ------- + results: str + newline separated keywords if split is False OR + results: list(str) + list of keywords if scores is False OR + results: list(tuple(str, float)) + list of (keyword, score) tuples if scores is True + + Results are returned in descending order of score regardless of the format. 
- def log_combinations(n,m): - """Calculates the logarithm of n!/m!(n-m)!""" - return -(numpy.log(n+1)+scipy.special.betaln(n-m+1,m+1)) + Notes + ----- + This algorithm looks for keywords that contribute to the structure of the + text on scales of blocksize words of larger. It is suitable for extracting + keywords representing the major themes of long texts. + References + ---------- + [1] Marcello A Montemurro, Damian Zanette, + "Towards the quantification of the semantic information encoded in + written language" + Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153 + DOI: 10.1142/S0219525910002530 + https://arxiv.org/abs/0907.1558 + + """ + text = to_unicode(text) + words = [word for word in _tokenize_by_word(text)] + vocab = sorted(set(words)) + wordcounts = numpy.array([[words[i:i+blocksize].count(word) + for word in vocab] + for i in range(0, + len(words), + blocksize)]).astype('d') + nblocks = wordcounts.shape[0] + totals = wordcounts.sum(axis=0) + nwords = totals.sum() + p = wordcounts / totals + logp = numpy.log2(p) + H = numpy.nan_to_num((p * logp), 0.0).sum(axis=0) + analytic = __analytic_entropy(blocksize, nblocks, nwords) + H += analytic(totals).astype('d') + if weighted: + H *= totals / nwords + if threshold == 'auto': + threshold = nblocks / (nblocks + 1.0) + 1.0e-8 + weights = [(word, score) + for (word, score) in zip(vocab, H) + if score>threshold] + weights.sort(key = lambda x:-x[1]) + result = weights if scores else [word for (word, score) in weights] + if not (scores or split): + result = '\n'.join(result) + return result + + +def __log_combinations_inner(n, m): + """Calculates the logarithm of n!/m!(n-m)!""" + return -(numpy.log(n+1)+scipy.special.betaln(n-m+1,m+1)) + +__log_combinations=numpy.frompyfunc(__log_combinations_inner, 2, 1) + +def __marginal_prob(blocksize, nwords): def marginal_prob(n,m): """Marginal probability of a word that occurs n times in the document occurring m times in a given block""" - return 
numpy.exp(log_combinations(n,m) - +log_combinations(nwords-n,blocksize-m) - -log_combinations(nwords,blocksize)) - - marginal=numpy.frompyfunc(marginal_prob,2,1) - + return numpy.exp(__log_combinations(n, m) + + __log_combinations(nwords - n, blocksize - m) + - __log_combinations(nwords, blocksize)) + return numpy.frompyfunc(marginal_prob, 2, 1) + +def __analytic_entropy(blocksize, nblocks, nwords): + marginal=__marginal_prob(blocksize, nwords) def analytic_entropy(n): """Predicted entropy for a word that occurs n times in the document""" - m=numpy.arange(1,min(blocksize,n)+1).astype('d') - p=m/n - elements=p*numpy.nan_to_num(numpy.log2(p),0.0)*marginal(n,m) - return -nblocks*elements.sum() - - analytic=numpy.frompyfunc(analytic_entropy,1,1) - H+=analytic(totals).astype('d') - if weighted: - H*=totals/nwords - if threshold=='auto': - threshold=nblocks/(nblocks+1.0) - weights=[(word,score) - for (word,score) in zip(vocab,H) - if score>threshold] - weights.sort(key=lambda x:-x[1]) - result= weights if scores else [word for (word,score) in weights] - if not (scores or split): - result='\n'.join(result) - return result - - \ No newline at end of file + m = numpy.arange(1, min(blocksize, n) + 1).astype('d') + p = m/n + elements = p * numpy.nan_to_num(numpy.log2(p), 0.0) * marginal(n,m) + return -nblocks * elements.sum() + return numpy.frompyfunc(analytic_entropy,1,1) \ No newline at end of file diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 7255252a57..1f49f39599 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -18,7 +18,7 @@ from gensim import utils from gensim.corpora import Dictionary -from gensim.summarization import summarize, summarize_corpus, keywords +from gensim.summarization import summarize, summarize_corpus, keywords, mz_keywords class TestSummarizationTest(unittest.TestCase): @@ -144,6 +144,22 @@ def test_keywords_runs(self): kwds_lst = keywords(text, split=True) 
self.assertTrue(len(kwds_lst)) + + def test_mz_keywords(self): + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f: + text = f.read() + + kwds = mz_keywords(text) + self.assertTrue(len(kwds.splitlines())) + + kwds_u = mz_keywords(utils.to_unicode(text)) + self.assertTrue(len(kwds_u.splitlines())) + + kwds_lst = mz_keywords(text, split=True) + self.assertTrue(len(kwds_lst)) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From e763f3cf5619f69ee74970ac4af4853de824619c Mon Sep 17 00:00:00 2001 From: Pete Date: Mon, 27 Nov 2017 12:17:33 +0000 Subject: [PATCH 05/26] I hate git --- docs/notebooks/summarization_tutorial.ipynb | 1967 ------------------- 1 file changed, 1967 deletions(-) delete mode 100644 docs/notebooks/summarization_tutorial.ipynb diff --git a/docs/notebooks/summarization_tutorial.ipynb b/docs/notebooks/summarization_tutorial.ipynb deleted file mode 100644 index 0d49eb95c2..0000000000 --- a/docs/notebooks/summarization_tutorial.ipynb +++ /dev/null @@ -1,1967 +0,0 @@ -{ - "metadata": { - "name": "", - "signature": "sha256:6b9b76544213a02f8bf906cdada222aa43d1d502664b11cd363728bc96c21b5f" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "

Tutorial: automatic summarization using Gensim

\n", - "\n", - "This module automatically summarizes the given text, by extracting one or more important sentences from the text. In a similar way, it can also extract keywords. This tutorial will teach you to use this summarization module via some examples. First, we will try a small example, then we will try two larger ones, and then we will review the performance of the summarizer in terms of speed.\n", - "\n", - "This summarizer is based on the \"TextRank\" algorithm, from an [article](http://web.eecs.umich.edu/%7Emihalcea/papers/mihalcea.emnlp04.pdf) by Mihalcea et al. This algorithm was later improved upon by Barrios et al. in another [article](https://raw.githubusercontent.com/summanlp/docs/master/articulo/articulo-en.pdf), by introducing something called a \"BM25 ranking function\". \n", - "\n", - "This tutorial assumes that you are familiar with Python and have [installed Gensim](http://radimrehurek.com/gensim/install.html).\n", - "\n", - "Note: Gensim's summarization only works for English for now, because the text is pre-processed so that stopwords are removed and the words are stemmed, and these processes are language-dependent.\n", - "\n", - "\n", - "

Small example

\n", - "\n", - "First of all, we import the function \"summarize\"." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,358 : INFO : Pattern library is not installed, lemmatization won't be available.\n", - "2016-09-19 12:45:22,361 : INFO : Could not import Theano, will use standard float for default ShardedCorpus dtype.\n", - "2016-09-19 12:45:22,372 : INFO : 'pattern' package not found; tag filters are not available for English\n" - ] - } - ], - "source": [ - "import logging\n", - "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", - "\n", - "from gensim.summarization import summarize" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input text:\n", - "Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. 
As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion. \n" - ] - } - ], - "source": [ - "text = \"Thomas A. Anderson is a man living two lives. By day he is an \" + \\\n", - " \"average computer programmer and by night a hacker known as \" + \\\n", - " \"Neo. Neo has always questioned his reality, but the truth is \" + \\\n", - " \"far beyond his imagination. Neo finds himself targeted by the \" + \\\n", - " \"police when he is contacted by Morpheus, a legendary computer \" + \\\n", - " \"hacker branded a terrorist by the government. Morpheus awakens \" + \\\n", - " \"Neo to the real world, a ravaged wasteland where most of \" + \\\n", - " \"humanity have been captured by a race of machines that live \" + \\\n", - " \"off of the humans' body heat and electrochemical energy and \" + \\\n", - " \"who imprison their minds within an artificial reality known as \" + \\\n", - " \"the Matrix. As a rebel against the machines, Neo must return to \" + \\\n", - " \"the Matrix and confront the agents: super-powerful computer \" + \\\n", - " \"programs devoted to snuffing out Neo and the entire human \" + \\\n", - " \"rebellion. \"\n", - "\n", - "print ('Input text:')\n", - "print (text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To summarize this text, we pass the raw string data as input to the function \"summarize\", and it will return a summary.\n", - "\n", - "Note: make sure that the string does not contain any newlines where the line breaks in a sentence. A sentence with a newline in it (i.e. a carriage return, \"\\n\") will be treated as two sentences." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,405 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,405 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,406 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,406 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" - ] - } - ], - "source": [ - "print ('Summary:')\n", - "print (summarize(text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the \"split\" option if you want a list of strings instead of a single string." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,428 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,428 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,429 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,430 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\"]\n" - ] - } - ], - "source": [ - "print (summarize(text, split=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can adjust how much text the summarizer outputs via the \"ratio\" parameter or the \"word_count\" parameter. Using the \"ratio\" parameter, you specify what fraction of sentences in the original text should be returned as output. Below we specify that we want 50% of the original text (the default is 20%)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,446 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,446 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,447 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,447 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "By day he is an average computer programmer and by night a hacker known as Neo. 
Neo has always questioned his reality, but the truth is far beyond his imagination.\n", - "Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government.\n", - "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" - ] - } - ], - "source": [ - "print ('Summary:')\n", - "print (summarize(text, ratio=0.5))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the \"word_count\" parameter, we specify the maximum amount of words we want in the summary. Below we have specified that we want no more than 50 words." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,463 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,464 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,464 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,465 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Keywords:\n", - "humanity\n", - "human\n", - "neo\n", - "humans body\n", - "super\n", - "reality\n", - "hacker\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "Anderson, a software engineer for a Metacortex, the other life as Neo, a computer hacker \"guilty of virtually every computer crime we have a law for.\" Agent Smith asks him to help them capture Morpheus, a dangerous terrorist, in exchange for amnesty.\n", - "Morpheus explains that he's been searching for Neo his entire life and asks if Neo feels like \"Alice in Wonderland, falling down the rabbit hole.\" He explains to Neo that they exist in the Matrix, a false reality that has been constructed for humans to hide the truth.\n", - "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse (Matt Doran).\n", - "Cypher cuts up a juicy steak and ruminates that he knows the steak is merely the simulation telling his brain that it is delicious and juicy, but after nine years he has discovered that \"ignorance is bliss.\" He strikes a deal for the machines to reinsert his body into a power plant, reinsert him into the Matrix, and he'll help the Agents.\n", - "\n", - "Keywords:\n", - "neo\n", - 
"morpheus\n", - "trinity\n", - "cypher\n", - "agents\n", - "agent\n", - "smith\n", - "tank\n", - "says\n", - "saying\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", - "\n", - "print 'Summary:'\n", - "print summarize(text, ratio=0.01)\n", - "\n", - "print '\\nKeywords:'\n", - "print keywords(text, ratio=0.01)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you know this movie, you see that this summary is actually quite good. We also see that some of the most important characters (Neo, Morpheus, Trinity) were extracted as keywords.\n", - "\n", - "

Another example

\n", - "\n", - "Let's try an example similar to the one above. This time, we will use the [IMDb synopsis](http://www.imdb.com/title/tt0118715/synopsis?ref_=tt_stry_pl) of \"The Big Lebowski\".\n", - "\n", - "Again, we download the text and produce a summary and some keywords." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,510 : INFO : Starting new HTTP connection (1): rare-technologies.com\n", - "2016-09-19 12:45:23,035 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:23,042 : INFO : built Dictionary(1093 unique tokens: ['realiti', 'keanu', 'miseri', 'vestig', 'massiv']...) from 416 documents (total 2985 corpus positions)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "Anderson, a software engineer for a Metacortex, the other life as Neo, a computer hacker \"guilty of virtually every computer crime we have a law for.\" Agent Smith asks him to help them capture Morpheus, a dangerous terrorist, in exchange for amnesty.\n", - "Morpheus explains that he's been searching for Neo his entire life and asks if Neo feels like \"Alice in Wonderland, falling down the rabbit hole.\" He explains to Neo that they exist in the Matrix, a false reality that has been constructed for humans to hide the truth.\n", - "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse (Matt Doran).\n", - "Cypher cuts up a juicy steak and ruminates that he knows the steak is merely the simulation telling his brain that it is delicious and juicy, but after nine years he has discovered that \"ignorance is bliss.\" He strikes a deal for the machines to reinsert his 
body into a power plant, reinsert him into the Matrix, and he'll help the Agents.\n", - "\n", - "Keywords:\n", - "neo\n", - "morpheus\n", - "trinity\n", - "cypher\n", - "agents\n", - "agent\n", - "smith\n", - "tank\n", - "says\n", - "saying\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", - "\n", - "print ('Summary:')\n", - "print (summarize(text, ratio=0.01))\n", - "\n", - "print ('\\nKeywords:')\n", - "print (keywords(text, ratio=0.01))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you know this movie, you see that this summary is actually quite good. We also see that some of the most important characters (Neo, Morpheus, Trinity) were extracted as keywords.\n", - "\n", - "

Another example

\n", - "\n", - "Let's try an example similar to the one above. This time, we will use the [IMDb synopsis](http://www.imdb.com/title/tt0118715/synopsis?ref_=tt_stry_pl) of \"The Big Lebowski\".\n", - "\n", - "Again, we download the text and produce a summary and some keywords." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:25,227 : INFO : Starting new HTTP connection (1): rare-technologies.com\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\n", - "\n", - "print ('Summary:')\n", - "print (summarize(text, ratio=0.01))\n", - "\n", - "print ('\\nKeywords:')\n", - "print (keywords(text, ratio=0.01))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time around, the summary is not of high quality, as it does not tell us much about the movie. In a way, this might not be the algorithms fault, rather this text simply doesn't contain one or two sentences that capture the essence of the text as in \"The Matrix\" synopsis.\n", - "\n", - "The keywords, however, managed to find some of the main characters.\n", - "\n", - "

Performance

\n", - "\n", - "We will test how the speed of the summarizer scales with the size of the dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 processor. Note that the summarizer does not support multithreading (parallel processing).\n", - "\n", - "The tests were run on the book \"Honest Abe\" by Alonzo Rothschild. Download the book in plain-text here. \n", - "\n", - "In the plot below, we see the running times together with the sizes of the datasets. To create datasets of different sizes, we have simply taken prefixes of text; in other words we take the first n characters of the book. The algorithm seems to be quadratic in time, so one needs to be careful before plugging a large dataset into the summarizer.\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - "

Text-content dependent running times

\n", - "\n", - "The running time is not only dependent on the size of the dataset. For example, summarizing \"The Matrix\" synopsis (about 36,000 characters) takes about 3.1 seconds, while summarizing 35,000 characters of this book takes about 8.5 seconds. So the former is more than twice as fast. \n", - "\n", - "One reason for this difference in running times is the data structure that is used. The algorithm represents the data using a graph, where vertices (nodes) are sentences, and then constructs weighted edges between the vertices that represent how the sentences relate to each other. This means that every piece of text will have a different graph, thus making the running times different. The size of this data structure is quadratic in the worst case (the worst case is when each vertex has an edge to every other vertex).\n", - "\n", - "Another possible reason for the difference in running times is that the problems converge at different rates, meaning that the error drops slower for some datasets than for others.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[(u'gutenberg', 3.7766363961259684),\n", - " (u'tm', 3.6403066998316511),\n", - " (u'project', 3.5428530523255342),\n", - " (u'co', 3.2983688146004528),\n", - " (u'donations', 2.8613536046553563),\n", - " (u'electronic', 2.8210861922674084),\n", - " (u'access', 2.7810662866642568),\n", - " (u'refund', 2.7810662866642568),\n", - " (u'foundation', 2.7234464816769872),\n", - " (u'foxboro', 2.5477601487545121),\n", - " (u'gloves', 2.5281337853661761),\n", - " (u'e', 2.4036269322210768),\n", - " (u'york', 2.3692008259770594),\n", - " (u'edited', 2.361641829495754),\n", - " (u'_works_', 2.3445174072327686),\n", - " (u'works', 2.3426500474551113),\n", - " (u'dogskin', 
2.3425994588269479),\n", - " (u'ragsdale', 2.2931552327841351),\n", - " (u'replacement', 2.2931552327841351),\n", - " (u'trunks', 2.2931552327841351),\n", - " (u'iv', 2.2510299269025058),\n", - " (u'iii', 2.2186807817292546),\n", - " (u'v', 2.2168420707754368),\n", - " (u'brokaw', 2.1699176369612583),\n", - " (u'coon', 2.1699176369612583),\n", - " (u'bonds', 2.1343080503770544),\n", - " (u'license', 2.1009287665795293),\n", - " (u'ii', 2.0892470886183649),\n", - " (u'agreement', 2.0779209847210556),\n", - " (u'almanac', 2.0060727272918055),\n", - " (u'_weekly_', 1.9794475925140163),\n", - " (u'bounded', 1.9794475925140163),\n", - " (u'format', 1.9794475925140163),\n", - " (u'millions', 1.9794475925140163),\n", - " (u'oxen', 1.9794475925140163),\n", - " (u'specie', 1.9794475925140163),\n", - " (u'archive', 1.9682995275030786),\n", - " (u'barrett', 1.9422319940872796),\n", - " (u'reminiscences', 1.9330537427622287),\n", - " (u'ebooks', 1.8984698469769548),\n", - " (u'forquer', 1.8843080503770544),\n", - " (u'parker', 1.8843080503770544),\n", - " (u'pglaf', 1.8843080503770544),\n", - " (u'ebook', 1.8838775575675983),\n", - " (u'trademark', 1.8838775575675983),\n", - " (u'paragraph', 1.8301079379685583),\n", - " (u'hardin', 1.7669683658081703),\n", - " (u'work', 1.7328354724344326),\n", - " (u'rothschild', 1.7275730939964973),\n", - " (u'org', 1.7211393195188851),\n", - " (u'attitude', 1.716230650790012),\n", - " (u'london', 1.6791112857988695),\n", - " (u'boston', 1.6754810009833907),\n", - " (u'xvi', 1.66018729770736),\n", - " (u'news', 1.6601872977073597),\n", - " (u'biographical', 1.6294643147000225),\n", - " (u'green', 1.6254512602292723),\n", - " (u'delegates', 1.6127555612626692),\n", - " (u'medium', 1.6127555612626692),\n", - " (u'scripps', 1.6127555612626692),\n", - " (u'volunteers', 1.6127555612626692),\n", - " (u'lamon', 1.6001560607245646),\n", - " (u'tarbell', 1.5897346234235084),\n", - " (u'volumes', 1.5819481863246514),\n", - " (u'bank', 
1.5744728128489647),\n", - " (u'copyright', 1.5731550611734115),\n", - " (u'_via_', 1.5722781569106761),\n", - " (u'admissibility', 1.5722781569106761),\n", - " (u'advertisers', 1.5722781569106761),\n", - " (u'applicable', 1.5722781569106761),\n", - " (u'attire', 1.5722781569106761),\n", - " (u'bags', 1.5722781569106761),\n", - " (u'berries', 1.5722781569106761),\n", - " (u'breeches', 1.5722781569106761),\n", - " (u'cline', 1.5722781569106761),\n", - " (u'continuance', 1.5722781569106761),\n", - " (u'currents', 1.5722781569106761),\n", - " (u'daguerreotype', 1.5722781569106761),\n", - " (u'disclaimer', 1.5722781569106761),\n", - " (u'email', 1.5722781569106761),\n", - " (u'enrolled', 1.5722781569106761),\n", - " (u'fool', 1.5722781569106761),\n", - " (u'guineas', 1.5722781569106761),\n", - " (u'hatchet', 1.5722781569106761),\n", - " (u'instruct', 1.5722781569106761),\n", - " (u'liability', 1.5722781569106761),\n", - " (u'lonny', 1.5722781569106761),\n", - " (u'paullin', 1.5722781569106761),\n", - " (u'performing', 1.5722781569106761),\n", - " (u'plow', 1.5722781569106761),\n", - " (u'polite', 1.5722781569106761),\n", - " (u'puffs', 1.5722781569106761),\n", - " (u'rulings', 1.5722781569106761),\n", - " (u'scammon', 1.5722781569106761),\n", - " (u'tilda', 1.5722781569106761),\n", - " (u'wake', 1.5722781569106761),\n", - " (u'warranties', 1.5722781569106761),\n", - " (u'america', 1.5712271378967728),\n", - " (u'clair', 1.5712271378967728),\n", - " (u'displaying', 1.5712271378967728),\n", - " (u'forgery', 1.5712271378967728),\n", - " (u'holder', 1.5712271378967728),\n", - " (u'posted', 1.5712271378967728),\n", - " (u'sketches', 1.5712271378967728),\n", - " (u'snow', 1.5712271378967728),\n", - " (u'wore', 1.5712271378967728),\n", - " (u'http', 1.5645865830262038),\n", - " (u'journalism', 1.5399471126066209),\n", - " (u'copy', 1.5258495075146912),\n", - " (u'_early', 1.5202411939312348),\n", - " (u'armstrong', 1.5106440743450187),\n", - " (u'railroad', 
1.4938165623572677),\n", - " (u'ross', 1.489097832809857),\n", - " (u'pair', 1.4791112857988695),\n", - " (u'banks', 1.4791112857988693),\n", - " (u'irelan', 1.4791112857988693),\n", - " (u'scott', 1.4791112857988693),\n", - " (u'browne', 1.4764336408243595),\n", - " (u'abraham', 1.4577679329151634),\n", - " (u'publication', 1.4490612388306794),\n", - " (u'provide', 1.4490612388306792),\n", - " (u'chiniquy', 1.4275140308616106),\n", - " (u'literary', 1.4150354420715021),\n", - " (u'rr', 1.4070491486733681),\n", - " (u'axe', 1.3967912341407889),\n", - " (u'fence', 1.3967912341407889),\n", - " (u'genuine', 1.3967912341407889),\n", - " (u'life_', 1.3941370904272503),\n", - " (u'she', 1.3923582867044937),\n", - " (u'copper', 1.3828069220574104),\n", - " (u'distributing', 1.3828069220574104),\n", - " (u'saddle', 1.3828069220574104),\n", - " (u'sons', 1.3828069220574104),\n", - " (u'_life_', 1.373910241709706),\n", - " (u'calhoun', 1.373910241709706),\n", - " (u'mother', 1.3728688332198922),\n", - " (u'college', 1.3697302821858961),\n", - " (u'nicolay', 1.3633245760231363),\n", - " (u'whitney', 1.3627575629840512),\n", - " (u'philadelphia', 1.3540886863558637),\n", - " (u'sarah', 1.3540886863558634),\n", - " (u'vi', 1.3540886863558634),\n", - " (u'harrison', 1.3476159735283106),\n", - " (u'terms', 1.3426509824683515),\n", - " (u'herndon', 1.3421892681433798),\n", - " (u'improvement', 1.329344333012155),\n", - " (u'buckskin', 1.3222046383294666),\n", - " (u'sham', 1.3222046383294666),\n", - " (u'fee', 1.3158554460066139),\n", - " (u'generosity', 1.3144503596878891),\n", - " (u'moore', 1.3144503596878887),\n", - " (u'copies', 1.3127747798184011),\n", - " (u'p', 1.309088202039181),\n", - " (u'compliance', 1.2961309813666892),\n", - " (u'constable', 1.2961309813666892),\n", - " (u'currency', 1.2961309813666892),\n", - " (u'distribution', 1.2961309813666892),\n", - " (u'harvey', 1.2961309813666892),\n", - " (u'individual', 1.2961309813666892),\n", - " (u'revolutionary', 
1.2961309813666892),\n", - " (u'brooks', 1.286562189794501),\n", - " (u'chicago', 1.2700186510810929),\n", - " (u'weems', 1.2659709073661847),\n", - " (u'february', 1.2574199029295277),\n", - " (u'information', 1.2487001310514776),\n", - " (u'bridge', 1.2326416539256813),\n", - " (u'resolution', 1.2268390166084573),\n", - " (u'stoddard', 1.2268390166084573),\n", - " (u'father', 1.2254034208363418),\n", - " (u'cartwright', 1.2157428532629155),\n", - " (u'houghton', 1.2157428532629155),\n", - " (u'publishing', 1.2157428532629155),\n", - " (u'describes', 1.2157428532629153),\n", - " (u'j', 1.2115310804189017),\n", - " (u'_stories_', 1.2049337080807629),\n", - " (u'september', 1.2030636155192291),\n", - " (u'boys', 1.1974364414369618),\n", - " (u'defendants', 1.1955861748361873),\n", - " (u'per', 1.1955861748361873),\n", - " (u'permission', 1.1955861748361873),\n", - " (u'uncle', 1.1955861748361873),\n", - " (u'thomas', 1.1924565577943991),\n", - " (u'trade', 1.1918333507609624),\n", - " (u'f', 1.1915163381561049),\n", - " (u'store', 1.189052998865439),\n", - " (u'notes', 1.1850922942502753),\n", - " (u'baker', 1.1828856976412236),\n", - " (u'baddeley', 1.1681694680548835),\n", - " (u'cogdal', 1.1681694680548835),\n", - " (u'copying', 1.1681694680548835),\n", - " (u'crafton', 1.1681694680548835),\n", - " (u'defect', 1.1681694680548835),\n", - " (u'donate', 1.1681694680548835),\n", - " (u'easier', 1.1681694680548835),\n", - " (u'editions', 1.1681694680548835),\n", - " (u'hawley', 1.1681694680548835),\n", - " (u'hitchcock', 1.1681694680548835),\n", - " (u'jake', 1.1681694680548835),\n", - " (u'jewelry', 1.1681694680548835),\n", - " (u'jurors', 1.1681694680548835),\n", - " (u'lightning', 1.1681694680548835),\n", - " (u'machine', 1.1681694680548835),\n", - " (u'paragraphs', 1.1681694680548835),\n", - " (u'pg', 1.1681694680548835),\n", - " (u'pork', 1.1681694680548835),\n", - " (u'retains', 1.1681694680548835),\n", - " (u'rod', 1.1681694680548835),\n", - " (u'securities', 
1.1681694680548835),\n", - " (u'status', 1.1681694680548835),\n", - " (u'trousers', 1.1681694680548835),\n", - " (u'unpublished', 1.1681694680548835),\n", - " (u'berry', 1.1644932670010606),\n", - " (u'pp', 1.1608077284905565),\n", - " (u'hanks', 1.1587285139891437),\n", - " (u'mcclure', 1.1537352404836496),\n", - " (u'her', 1.1531891574151381),\n", - " (u'hamlin', 1.1529222466025137),\n", - " (u'speeches', 1.1437050469373577),\n", - " (u'kentucky', 1.1401563236722736),\n", - " (u'johnston', 1.1368073989967304),\n", - " (u'offutt', 1.1345503657246403),\n", - " (u'dress', 1.1343080503770544),\n", - " (u'german', 1.1343080503770544),\n", - " (u'matheney', 1.1343080503770544),\n", - " (u'company', 1.1298148326748745),\n", - " (u'g', 1.128517881924167),\n", - " (u'votes', 1.1187730676938106),\n", - " (u'nine', 1.113374076177045),\n", - " (u'charles', 1.1065580194728426),\n", - " (u'note', 1.0974655406391749),\n", - " (u'deed', 1.0970926363431248),\n", - " (u'east', 1.0970926363431248),\n", - " (u'spurious', 1.0970926363431248),\n", - " (u'atkinson', 1.0970926363431244),\n", - " (u'comply', 1.0970926363431244),\n", - " (u'jewelers', 1.0970926363431244),\n", - " (u'leland', 1.0970926363431244),\n", - " (u'priest', 1.0970926363431244),\n", - " (u'soldier', 1.0970926363431244),\n", - " (u'd', 1.0936709970367389),\n", - " (u'tax', 1.0890978328098568),\n", - " (u'colonel', 1.0886122317272675),\n", - " (u'pitcher', 1.0886122317272675),\n", - " (u'spink', 1.0886122317272675),\n", - " (u'charter', 1.0886122317272673),\n", - " (u'clock', 1.0886122317272673),\n", - " (u'distribute', 1.0886122317272673),\n", - " (u'fisher', 1.0886122317272673),\n", - " (u'convention', 1.0842245322470756),\n", - " (u'plaintiff', 1.0813648643938589),\n", - " (u'island', 1.0791112857988696),\n", - " (u'voyage', 1.0772490318253176),\n", - " (u'you', 1.0716742799027257),\n", - " (u'road', 1.0587290524017576),\n", - " (u'holland', 1.05373524048365),\n", - " (u'trailor', 1.0479900750043671),\n", - " 
(u'limited', 1.0447190713617185),\n", - " (u'domain', 1.0399471126066209),\n", - " (u'grandfather', 1.0399471126066209),\n", - " (u'voted', 1.0399471126066209),\n", - " (u'agree', 1.0367857078081339),\n", - " (u'including', 1.0367857078081339),\n", - " (u'life', 1.0279778291629844),\n", - " (u'witness', 1.0249646422762066),\n", - " (u'james', 1.0153080476245506),\n", - " (u'stuart', 1.0149104889383316),\n", - " (u'dungee', 1.0102738780733427),\n", - " (u'john', 1.0074378828094916),\n", - " (u'surveyor', 1.0071083505332288),\n", - " (u'cross', 1.0008479040802145),\n", - " (u'dollars', 1.0002448365299736),\n", - " (u'president', 0.99828026284480487),\n", - " (u'_amount_', 0.99450922395310026),\n", - " (u'_black', 0.99450922395310026),\n", - " (u'_commercial', 0.99450922395310026),\n", - " (u'_magazine', 0.99450922395310026),\n", - " (u'_nicolay', 0.99450922395310026),\n", - " (u'_north', 0.99450922395310026),\n", - " (u'_sun_', 0.99450922395310026),\n", - " (u'accompanies', 0.99450922395310026),\n", - " (u'accordance', 0.99450922395310026),\n", - " (u'adjourning', 0.99450922395310026),\n", - " (u'advertiser', 0.99450922395310026),\n", - " (u'advertiser_', 0.99450922395310026),\n", - " (u'agnosticism', 0.99450922395310026),\n", - " (u'almanacs', 0.99450922395310026),\n", - " (u'animals', 0.99450922395310026),\n", - " (u'apparel', 0.99450922395310026),\n", - " (u'appoints', 0.99450922395310026),\n", - " (u'arbitrations', 0.99450922395310026),\n", - " (u'ascii', 0.99450922395310026),\n", - " (u'asks', 0.99450922395310026),\n", - " (u'aspirants', 0.99450922395310026),\n", - " (u'atrocious', 0.99450922395310026),\n", - " (u'attachment', 0.99450922395310026),\n", - " (u'authors', 0.99450922395310026),\n", - " (u'band', 0.99450922395310026),\n", - " (u'bargained', 0.99450922395310026),\n", - " (u'bets', 0.99450922395310026),\n", - " (u'bleeding', 0.99450922395310026),\n", - " (u'boats', 0.99450922395310026),\n", - " (u'book_', 0.99450922395310026),\n", - " (u'boss', 
0.99450922395310026),\n", - " (u'bourgeois', 0.99450922395310026),\n", - " (u'bull', 0.99450922395310026),\n", - " (u'calf', 0.99450922395310026),\n", - " (u'chase', 0.99450922395310026),\n", - " (u'chicanery', 0.99450922395310026),\n", - " (u'coach', 0.99450922395310026),\n", - " (u'coins', 0.99450922395310026),\n", - " (u'comet', 0.99450922395310026),\n", - " (u'computer', 0.99450922395310026),\n", - " (u'computers', 0.99450922395310026),\n", - " (u'concentration', 0.99450922395310026),\n", - " (u'conquering', 0.99450922395310026),\n", - " (u'conservator', 0.99450922395310026),\n", - " (u'contentedly', 0.99450922395310026),\n", - " (u'copied', 0.99450922395310026),\n", - " (u'cord', 0.99450922395310026),\n", - " (u'cornell', 0.99450922395310026),\n", - " (u'countenance', 0.99450922395310026),\n", - " (u'counting', 0.99450922395310026),\n", - " (u'countryman', 0.99450922395310026),\n", - " (u'creeks', 0.99450922395310026),\n", - " (u'davy', 0.99450922395310026),\n", - " (u'deer', 0.99450922395310026),\n", - " (u'def', 0.99450922395310026),\n", - " (u'delegations', 0.99450922395310026),\n", - " (u'deliveries', 0.99450922395310026),\n", - " (u'demurrer', 0.99450922395310026),\n", - " (u'desires', 0.99450922395310026),\n", - " (u'detriment', 0.99450922395310026),\n", - " (u'directors', 0.99450922395310026),\n", - " (u'disallows', 0.99450922395310026),\n", - " (u'disgracing', 0.99450922395310026),\n", - " (u'doctoring', 0.99450922395310026),\n", - " (u'effectively', 0.99450922395310026),\n", - " (u'elections', 0.99450922395310026),\n", - " (u'electronically', 0.99450922395310026),\n", - " (u'enrolling', 0.99450922395310026),\n", - " (u'exempt', 0.99450922395310026),\n", - " (u'faded', 0.99450922395310026),\n", - " (u'fares', 0.99450922395310026),\n", - " (u'ff', 0.99450922395310026),\n", - " (u'fights', 0.99450922395310026),\n", - " (u'flatboat', 0.99450922395310026),\n", - " (u'founded', 0.99450922395310026),\n", - " (u'generals', 0.99450922395310026),\n", - " 
(u'goose', 0.99450922395310026),\n", - " (u'greed', 0.99450922395310026),\n", - " (u'groomsman', 0.99450922395310026),\n", - " (u'hagerty', 0.99450922395310026),\n", - " (u'hans', 0.99450922395310026),\n", - " (u'harvard', 0.99450922395310026),\n", - " (u'haute', 0.99450922395310026),\n", - " (u'heel', 0.99450922395310026),\n", - " (u'history_', 0.99450922395310026),\n", - " (u'homeliest', 0.99450922395310026),\n", - " (u'howard', 0.99450922395310026),\n", - " (u'hut', 0.99450922395310026),\n", - " (u'ice', 0.99450922395310026),\n", - " (u'ida', 0.99450922395310026),\n", - " (u'identical', 0.99450922395310026),\n", - " (u'imperialist', 0.99450922395310026),\n", - " (u'independent', 0.99450922395310026),\n", - " (u'invalid', 0.99450922395310026),\n", - " (u'irons', 0.99450922395310026),\n", - " (u'janet', 0.99450922395310026),\n", - " (u'justification', 0.99450922395310026),\n", - " (u'lamborn', 0.99450922395310026),\n", - " (u'lambs', 0.99450922395310026),\n", - " (u'larceny', 0.99450922395310026),\n", - " (u'latin', 0.99450922395310026),\n", - " (u'linen', 0.99450922395310026),\n", - " (u'locations', 0.99450922395310026),\n", - " (u'louder', 0.99450922395310026),\n", - " (u'mad', 0.99450922395310026),\n", - " (u'magruder', 0.99450922395310026),\n", - " (u'maid', 0.99450922395310026),\n", - " (u'metaphysical', 0.99450922395310026),\n", - " (u'mit', 0.99450922395310026),\n", - " (u'monthlies', 0.99450922395310026),\n", - " (u'nest', 0.99450922395310026),\n", - " (u'nigger', 0.99450922395310026),\n", - " (u'package', 0.99450922395310026),\n", - " (u'pan', 0.99450922395310026),\n", - " (u'parentage', 0.99450922395310026),\n", - " (u'partial', 0.99450922395310026),\n", - " (u'partly', 0.99450922395310026),\n", - " (u'passengers', 0.99450922395310026),\n", - " (u'pension', 0.99450922395310026),\n", - " (u'pl', 0.99450922395310026),\n", - " (u'playful', 0.99450922395310026),\n", - " (u'population', 0.99450922395310026),\n", - " (u'postponed', 0.99450922395310026),\n", - 
" (u'postponement', 0.99450922395310026),\n", - " (u'premise', 0.99450922395310026),\n", - " (u'pressure', 0.99450922395310026),\n", - " (u'presumption', 0.99450922395310026),\n", - " (u'preventing', 0.99450922395310026),\n", - " (u'quart', 0.99450922395310026),\n", - " (u'quincy', 0.99450922395310026),\n", - " (u'quorum', 0.99450922395310026),\n", - " (u'redistribution', 0.99450922395310026),\n", - " (u'rejoicing', 0.99450922395310026),\n", - " (u'remit', 0.99450922395310026),\n", - " (u'rifle', 0.99450922395310026),\n", - " (u'romance', 0.99450922395310026),\n", - " (u'rothschild_', 0.99450922395310026),\n", - " (u'row', 0.99450922395310026),\n", - " (u'rubbish', 0.99450922395310026),\n", - " (u'sacrifices', 0.99450922395310026),\n", - " (u'scroll', 0.99450922395310026),\n", - " (u'shade', 0.99450922395310026),\n", - " (u'shed', 0.99450922395310026),\n", - " (u'sigh', 0.99450922395310026),\n", - " (u'silk', 0.99450922395310026),\n", - " (u'sinewy', 0.99450922395310026),\n", - " (u'sock', 0.99450922395310026),\n", - " (u'solicit', 0.99450922395310026),\n", - " (u'solvent', 0.99450922395310026),\n", - " (u'sonny', 0.99450922395310026),\n", - " (u'startling', 0.99450922395310026),\n", - " (u'steals', 0.99450922395310026),\n", - " (u'steamer', 0.99450922395310026),\n", - " (u'stevenson', 0.99450922395310026),\n", - " (u'subp\\u0153naed', 0.99450922395310026),\n", - " (u'tanned', 0.99450922395310026),\n", - " (u'tea', 0.99450922395310026),\n", - " (u'terre', 0.99450922395310026),\n", - " (u'theosophy', 0.99450922395310026),\n", - " (u'tight', 0.99450922395310026),\n", - " (u'tis', 0.99450922395310026),\n", - " (u'tour', 0.99450922395310026),\n", - " (u'vanilla', 0.99450922395310026),\n", - " (u'vol', 0.99450922395310026),\n", - " (u'warfare', 0.99450922395310026),\n", - " (u'warranty', 0.99450922395310026),\n", - " (u'wayne', 0.99450922395310026),\n", - " (u'whip', 0.99450922395310026),\n", - " (u'woodcut', 0.99450922395310026),\n", - " (u'wright', 
0.99450922395310026),\n", - " (u'new', 0.99212250974463601),\n", - " (u'swett', 0.98946460737046715),\n", - " (u'vote', 0.98946460737046715),\n", - " (u'argument', 0.98558697994489508),\n", - " (u'whig', 0.98356004695062138),\n", - " (u'bros', 0.97944759251401625),\n", - " (u'harper', 0.97944759251401625),\n", - " (u'jeremiah', 0.97944759251401625),\n", - " (u'metzker', 0.97944759251401625),\n", - " (u'clay', 0.96981219799970608),\n", - " (u'_the', 0.96570796937579129),\n", - " (u'h', 0.95765441503007809),\n", - " (u'benjamin', 0.94868978259839132),\n", - " (u'captain', 0.94868978259839132),\n", - " (u'crawford', 0.94558617483618734),\n", - " (u'web', 0.94558617483618734),\n", - " (u'cents', 0.93351879101119639),\n", - " (u'physical', 0.92813378536617597),\n", - " (u'alonzo', 0.92486723054286335),\n", - " (u'april', 0.92486723054286335),\n", - " (u'barrel', 0.92220463832946642),\n", - " (u'butler', 0.92220463832946642),\n", - " (u'cabinet', 0.92220463832946642),\n", - " (u'cincinnati', 0.92220463832946642),\n", - " (u'committees', 0.92220463832946642),\n", - " (u'containing', 0.92220463832946642),\n", - " (u'corner', 0.92220463832946642),\n", - " (u'daily', 0.92220463832946642),\n", - " (u'defective', 0.92220463832946642),\n", - " (u'greeley', 0.92220463832946642),\n", - " (u'inches', 0.92220463832946642),\n", - " (u'johnson', 0.92220463832946642),\n", - " (u'limitation', 0.92220463832946642),\n", - " (u'site', 0.92220463832946642),\n", - " (u'steamboat', 0.92220463832946642),\n", - " (u'suffice', 0.92220463832946642),\n", - " (u'trips', 0.92220463832946642),\n", - " (u'university', 0.92220463832946642),\n", - " (u'vulnerable', 0.92220463832946642),\n", - " (u'william', 0.92182669138259055),\n", - " (u'buys', 0.91921942236647736),\n", - " (u'mifflin', 0.91921942236647736),\n", - " (u'morris', 0.91921942236647736),\n", - " (u'refuses', 0.91921942236647736),\n", - " (u'sells', 0.91921942236647736),\n", - " (u'tom', 0.91921942236647736),\n", - " (u'widow', 
0.91921942236647736),\n", - " (u'sold', 0.91058236515865065),\n", - " (u'gentry', 0.90964424191141902),\n", - " (u'sound', 0.90964424191141902),\n", - " (u'march', 0.90184519263849161),\n", - " (u'anecdotes', 0.89647288909731682),\n", - " (u'trading', 0.89300683234257683),\n", - " (u'lack', 0.87911128579886944),\n", - " (u'orleans', 0.87911128579886944),\n", - " (u'sketch', 0.87911128579886944),\n", - " (u'laws', 0.87819506388264168),\n", - " (u'court', 0.87301265479021239),\n", - " (u'fees', 0.87255211331288773),\n", - " (u'introduction', 0.86973028218589565),\n", - " (u'boy', 0.85605385441424486),\n", - " (u'reports', 0.85227169920605483),\n", - " (u'witnesses', 0.85227169920605483),\n", - " (u'b', 0.8493617351165339),\n", - " (u'abe', 0.84568544478024776),\n", - " (u'magazine_', 0.84481219799970608),\n", - " (u'henry', 0.84251319517689804),\n", - " (u'speed', 0.84212114395694027),\n", - " (u'anecdote', 0.83889967717920211),\n", - " (u'society', 0.83631295591025312),\n", - " (u'judgment', 0.83278478699937608),\n", - " (u'creek', 0.83070878764523437),\n", - " (u'slander', 0.82845569625516102),\n", - " (u'r', 0.82845569625516058),\n", - " (u'earn', 0.82576785278214482),\n", - " (u'client', 0.82347269328712169),\n", - " (u'case', 0.81966768916575017),\n", - " (u'menard', 0.81137835062883923),\n", - " (u'ohio', 0.81137835062883923),\n", - " (u'w', 0.80428968429945247),\n", - " (u'married', 0.80362253558252261),\n", - " (u'write', 0.80362253558252261),\n", - " (u'study', 0.79632966848691966),\n", - " (u'_', 0.79325323203556808),\n", - " (u'party', 0.79159811445620765),\n", - " (u'contained', 0.78994711260662109),\n", - " (u'handling', 0.78994711260662109),\n", - " (u'judd', 0.78994711260662109),\n", - " (u'obscure', 0.78994711260662109),\n", - " (u'pratt', 0.78994711260662109),\n", - " (u'rolling', 0.78994711260662109),\n", - " (u'van', 0.78994711260662109),\n", - " (u'hay', 0.78877059308059705),\n", - " (u'care', 0.78490422698683959),\n", - " (u'supreme', 
0.78462528298567236),\n", - " (u'letter', 0.77973386004105638),\n", - " (u'me', 0.77433431986501056),\n", - " (u'money', 0.771981872795795),\n", - " (u'circular', 0.771227137896773),\n", - " (u'contributions', 0.771227137896773),\n", - " (u'elizabethtown', 0.771227137896773),\n", - " (u'fails', 0.771227137896773),\n", - " (u'knows', 0.771227137896773),\n", - " (u'located', 0.771227137896773),\n", - " (u'pennsylvania', 0.771227137896773),\n", - " (u'pledged', 0.771227137896773),\n", - " (u'providing', 0.771227137896773),\n", - " (u'sentence', 0.771227137896773),\n", - " (u'shirt', 0.771227137896773),\n", - " (u'size', 0.771227137896773),\n", - " (u'sleeve', 0.771227137896773),\n", - " (u'jury', 0.77018773870139601),\n", - " (u'justice', 0.76953789330855393),\n", - " (u'stories', 0.76708395277153052),\n", - " (u'political', 0.76430322684478202),\n", - " (u'mrs', 0.76262458115983867),\n", - " (u'gentlemen', 0.76084736419494403),\n", - " (u'plea', 0.76084736419494403),\n", - " (u'honest', 0.7601545702334116),\n", - " (u'archibald', 0.75527889839393425),\n", - " (u'argued', 0.75527889839393425),\n", - " (u'coin', 0.75527889839393425),\n", - " (u'creditor', 0.75527889839393425),\n", - " (u'endorses', 0.75527889839393425),\n", - " (u'floyd', 0.75527889839393425),\n", - " (u'fraud', 0.75527889839393425),\n", - " (u'frederick', 0.75527889839393425),\n", - " (u'hale', 0.75527889839393425),\n", - " (u'heels', 0.75527889839393425),\n", - " (u'legally', 0.75527889839393425),\n", - " (u'maker', 0.75527889839393425),\n", - " (u'mchenry', 0.75527889839393425),\n", - " (u'memorial', 0.75527889839393425),\n", - " (u'partisan', 0.75527889839393425),\n", - " (u'personally', 0.75527889839393425),\n", - " (u'race', 0.75527889839393425),\n", - " (u'stevens', 0.75527889839393425),\n", - " (u'submit', 0.75527889839393425),\n", - " (u'admitted', 0.75527889839393403),\n", - " (u'file', 0.75527889839393403),\n", - " (u'hardships', 0.75527889839393403),\n", - " (u'music', 
0.75527889839393403),\n", - " (u'refusal', 0.75527889839393403),\n", - " (u'sangamon', 0.75131617260654426),\n", - " (u'january', 0.75014968948543359),\n", - " (u'hill', 0.74969894675964133),\n", - " (u'truth', 0.74926914745020134),\n", - " (u'hon', 0.74391569849198458),\n", - " (u'douglas', 0.74391569849198413),\n", - " (u'earned', 0.74391569849198413),\n", - " (u'guilty', 0.74391569849198413),\n", - " (u'campaign', 0.73772110820519954),\n", - " (u'log', 0.7346246403825254),\n", - " (u'coffin', 0.73455036572464039),\n", - " (u'arnold', 0.72909340011582602),\n", - " (u'expenses', 0.72909340011582602),\n", - " (u'suit', 0.72889466428426264),\n", - " (u'sir', 0.72813378536617623),\n", - " (u'damages', 0.72813378536617579),\n", - " (u'george', 0.72713812318435789),\n", - " (u'owner', 0.72605378988157332),\n", - " (u'poverty', 0.71981219799970608),\n", - " (u'representatives', 0.71981219799970608),\n", - " (u'river', 0.71959767653412277),\n", - " (u'gridley', 0.71526365099729405),\n", - " (u'letters', 0.71513361836592892),\n", - " (u'rock', 0.71513361836592892),\n", - " (u'use', 0.71454647320175191),\n", - " (u'washington', 0.71171778456659318),\n", - " (u'central', 0.7099478124556069),\n", - " (u'election', 0.70657488299084736),\n", - " (u'tell', 0.70626746228172177),\n", - " (u'salem', 0.70613716823020756),\n", - " (u'section', 0.70306361551922913),\n", - " (u'radford', 0.69558617483618734),\n", - " (u'www', 0.69558617483618734),\n", - " (u'congress', 0.69556362076674905),\n", - " (u'candidate', 0.69184503181253953),\n", - " (u'hundred', 0.69164202681964948),\n", - " (u'counterfeit', 0.68742201968919669),\n", - " (u'ethical', 0.68742201968919669),\n", - " (u'hoblit', 0.68742201968919669),\n", - " (u'jones', 0.68742201968919669),\n", - " (u'silver', 0.68742201968919669),\n", - " (u'land', 0.68722988386024664),\n", - " (u'committee', 0.68422453224707569),\n", - " (u'logan', 0.68405655526716558),\n", - " (u'democrats', 0.67765948196948589),\n", - " (u'see', 
0.67486874148436105),\n", - " (u'integrity', 0.67102513071863612),\n", - " (u'debt', 0.66934835459409747),\n", - " (u'session', 0.66560926895684913),\n", - " (u'paper', 0.65789112937712302),\n", - " (u'american', 0.65765491292160583),\n", - " (u'_atlantic', 0.65398232285618652),\n", - " (u'adjournment', 0.65398232285618652),\n", - " (u'advertising', 0.65398232285618652),\n", - " (u'affidavit', 0.65398232285618652),\n", - " (u'afraid', 0.65398232285618652),\n", - " (u'albany', 0.65398232285618652),\n", - " (u'altered', 0.65398232285618652),\n", - " (u'anti', 0.65398232285618652),\n", - " (u'armed', 0.65398232285618652),\n", - " (u'aspiration', 0.65398232285618652),\n", - " (u'ate', 0.65398232285618652),\n", - " (u'athens', 0.65398232285618652),\n", - " (u'baron', 0.65398232285618652),\n", - " (u'borrows', 0.65398232285618652),\n", - " (u'breaking', 0.65398232285618652),\n", - " (u'car', 0.65398232285618652),\n", - " (u'cargo', 0.65398232285618652),\n", - " (u'cedar', 0.65398232285618652),\n", - " (u'cheap', 0.65398232285618652),\n", - " (u'chew', 0.65398232285618652),\n", - " (u'clarke', 0.65398232285618652),\n", - " (u'coles', 0.65398232285618652),\n", - " (u'combat', 0.65398232285618652),\n", - " (u'constitution', 0.65398232285618652),\n", - " (u'crippled', 0.65398232285618652),\n", - " (u'customer', 0.65398232285618652),\n", - " (u'dared', 0.65398232285618652),\n", - " (u'darkness', 0.65398232285618652),\n", - " (u'deftly', 0.65398232285618652),\n", - " (u'derivative', 0.65398232285618652),\n", - " (u'doubleday', 0.65398232285618652),\n", - " (u'dresser', 0.65398232285618652),\n", - " (u'eleanor', 0.65398232285618652),\n", - " (u'emulation', 0.65398232285618652),\n", - " (u'entity', 0.65398232285618652),\n", - " (u'equipment', 0.65398232285618652),\n", - " (u'estimation', 0.65398232285618652),\n", - " (u'exceptional', 0.65398232285618652),\n", - " (u'exert', 0.65398232285618652),\n", - " (u'expedition', 0.65398232285618652),\n", - " (u'fortified', 
0.65398232285618652),\n", - " (u'frail', 0.65398232285618652),\n", - " (u'francis', 0.65398232285618652),\n", - " (u'fun', 0.65398232285618652),\n", - " (u'generations', 0.65398232285618652),\n", - " (u'gov', 0.65398232285618652),\n", - " (u'grip', 0.65398232285618652),\n", - " (u'grudge', 0.65398232285618652),\n", - " (u'hartford', 0.65398232285618652),\n", - " (u'hawk_', 0.65398232285618652),\n", - " (u'heirs', 0.65398232285618652),\n", - " (u'hicks', 0.65398232285618652),\n", - " (u'hit', 0.65398232285618652),\n", - " (u'hog', 0.65398232285618652),\n", - " (u'ignorant', 0.65398232285618652),\n", - " (u'instinct', 0.65398232285618652),\n", - " (u'irwin', 0.65398232285618652),\n", - " (u'kankakee', 0.65398232285618652),\n", - " (u'keckley', 0.65398232285618652),\n", - " (u'ketcham', 0.65398232285618652),\n", - " (u'killed', 0.65398232285618652),\n", - " (u'kingsbury', 0.65398232285618652),\n", - " (u'lasted', 0.65398232285618652),\n", - " (u'licensed', 0.65398232285618652),\n", - " (u'links', 0.65398232285618652),\n", - " (u'lippincott', 0.65398232285618652),\n", - " (u'load', 0.65398232285618652),\n", - " (u'mania', 0.65398232285618652),\n", - " (u'marries', 0.65398232285618652),\n", - " (u'mcclurg', 0.65398232285618652),\n", - " (u'md', 0.65398232285618652),\n", - " (u'mile', 0.65398232285618652),\n", - " (u'miller', 0.65398232285618652),\n", - " (u'monthly_', 0.65398232285618652),\n", - " (u'morally', 0.65398232285618652),\n", - " (u'necessity', 0.65398232285618652),\n", - " (u'norris', 0.65398232285618652),\n", - " (u'norton', 0.65398232285618652),\n", - " (u'opening', 0.65398232285618652),\n", - " (u'orr', 0.65398232285618652),\n", - " (u'ossian', 0.65398232285618652),\n", - " (u'overheard', 0.65398232285618652),\n", - " (u'parent', 0.65398232285618652),\n", - " (u'peachy', 0.65398232285618652),\n", - " (u'petition', 0.65398232285618652),\n", - " (u'pinching', 0.65398232285618652),\n", - " (u'plunged', 0.65398232285618652),\n", - " (u'plutarch', 
0.65398232285618652),\n", - " (u'policies', 0.65398232285618652),\n", - " (u'pound', 0.65398232285618652),\n", - " (u'presenting', 0.65398232285618652),\n", - " (u'press_', 0.65398232285618652),\n", - " (u'primm', 0.65398232285618652),\n", - " (u'procured', 0.65398232285618652),\n", - " (u'profaned', 0.65398232285618652),\n", - " (u'prominently', 0.65398232285618652),\n", - " (u'pushing', 0.65398232285618652),\n", - " (u'qualified', 0.65398232285618652),\n", - " (u'questionable', 0.65398232285618652),\n", - " (u'ramsay', 0.65398232285618652),\n", - " (u'rapids', 0.65398232285618652),\n", - " (u'recollection', 0.65398232285618652),\n", - " (u'religion', 0.65398232285618652),\n", - " (u'reluctantly', 0.65398232285618652),\n", - " (u'render', 0.65398232285618652),\n", - " (u'repeal', 0.65398232285618652),\n", - " (u'represent', 0.65398232285618652),\n", - " (u'resourcefulness', 0.65398232285618652),\n", - " (u'responsible', 0.65398232285618652),\n", - " (u'retaining', 0.65398232285618652),\n", - " (u'review_', 0.65398232285618652),\n", - " (u'revolution', 0.65398232285618652),\n", - " (u'ridiculous', 0.65398232285618652),\n", - " (u'rightful', 0.65398232285618652),\n", - " (u'ruled', 0.65398232285618652),\n", - " (u'rural', 0.65398232285618652),\n", - " (u'satisfactory', 0.65398232285618652),\n", - " (u'sayings', 0.65398232285618652),\n", - " (u'scarcely', 0.65398232285618652),\n", - " (u'score', 0.65398232285618652),\n", - " (u'sheep', 0.65398232285618652),\n", - " (u'shuffling', 0.65398232285618652),\n", - " (u'sleeves', 0.65398232285618652),\n", - " (u'sources', 0.65398232285618652),\n", - " (u'street', 0.65398232285618652),\n", - " (u'sues', 0.65398232285618652),\n", - " (u'suing', 0.65398232285618652),\n", - " (u'sumner', 0.65398232285618652),\n", - " (u'sundry', 0.65398232285618652),\n", - " (u'suspicious', 0.65398232285618652),\n", - " (u'taylor', 0.65398232285618652),\n", - " (u'torch', 0.65398232285618652),\n", - " (u'trent', 0.65398232285618652),\n", - " 
(u'umbrella', 0.65398232285618652),\n", - " (u'unassuming', 0.65398232285618652),\n", - " (u'ungainly', 0.65398232285618652),\n", - " (u'user', 0.65398232285618652),\n", - " (u'victim', 0.65398232285618652),\n", - " (u'vs', 0.65398232285618652),\n", - " (u'wildcat', 0.65398232285618652),\n", - " (u'xxxvii', 0.65398232285618652),\n", - " (u'opponent', 0.65154286586887178),\n", - " (u'trial', 0.64758856417750366),\n", - " (u'edition', 0.64663751443697892),\n", - " (u'onstot', 0.64663751443697892),\n", - " (u'robert', 0.64050211515912991),\n", - " (u'october', 0.6401563236722736),\n", - " (u'also', 0.6385436730575007),\n", - " (u'davis', 0.63634793477555185),\n", - " (u'additional', 0.63350513665219177),\n", - " (u'enemies', 0.63350513665219177),\n", - " (u'pleas', 0.63350513665219177),\n", - " (u'provided', 0.63350513665219177),\n", - " (u'rev', 0.63350513665219177),\n", - " (u'stranger', 0.63350513665219177),\n", - " (u'_versus_', 0.63350513665219133),\n", - " (u'allen', 0.63350513665219133),\n", - " (u'brockett', 0.63350513665219133),\n", - " (u'editorial', 0.63350513665219133),\n", - " (u'emerson', 0.63350513665219133),\n", - " (u'enlarged', 0.63350513665219133),\n", - " (u'manuscript', 0.63350513665219133),\n", - " (u'massachusetts', 0.63350513665219133),\n", - " (u'patterson', 0.63350513665219133),\n", - " (u'raymond', 0.63350513665219133),\n", - " (u'smoot', 0.63350513665219133),\n", - " (u'weekly_', 0.63350513665219133),\n", - " (u'states', 0.63133846043671316),\n", - " (u'l', 0.62992122927427241),\n", - " (u'advised', 0.62946431470002273),\n", - " (u'agent', 0.62946431470002273),\n", - " (u'bunn', 0.62946431470002273),\n", - " (u'controversy', 0.62946431470002273),\n", - " (u'josiah', 0.62946431470002273),\n", - " (u'legislative', 0.62946431470002273),\n", - " (u'online', 0.62946431470002273),\n", - " (u'pigeon', 0.62946431470002273),\n", - " (u'spencer', 0.62946431470002273),\n", - " (u'adjourned', 0.62946431470002251),\n", - " (u'inside', 
0.62946431470002251),\n", - " (u'jesse', 0.62946431470002251),\n", - " (u'jurymen', 0.62946431470002251),\n", - " (u'million', 0.62946431470002251),\n", - " (u'moon', 0.62946431470002251),\n", - " (u'preacher', 0.62946431470002251),\n", - " (u'using', 0.62946431470002251),\n", - " (u'or', 0.62843784044209627),\n", - " (u'litigation', 0.62545126022927233),\n", - " (u'newspaper', 0.62545126022927233),\n", - " (u'defendant', 0.62290734657589919),\n", - " (u'my', 0.62258846505370258),\n", - " (u'associated', 0.62098413888097026),\n", - " (u'counsel', 0.61859284602665277),\n", - " (u'toward', 0.61454730339646257),\n", - " (u'history', 0.61124124826162474),\n", - " (u'quoted', 0.6084865654230116),\n", - " (u'issue', 0.60729254662829124),\n", - " (u'dollar', 0.60729254662829035),\n", - " (u'major', 0.60362253558252288),\n", - " (u'prairie', 0.60362253558252288),\n", - " (u'experiences', 0.60362253558252243),\n", - " (u'legislation', 0.60362253558252243),\n", - " (u'_vs', 0.6035456305599225),\n", - " (u'historical', 0.6035456305599225),\n", - " (u'produced', 0.6035456305599225),\n", - " (u'bergen', 0.60354563055992205),\n", - " (u'sheriff', 0.60354563055992205),\n", - " (u'springfield', 0.60331495800961132),\n", - " (u'i', 0.60133778763850376),\n", - " (u'horse', 0.60121703239130708),\n", - " (u'century', 0.60105378988157332),\n", - " (u'clients', 0.59743816921816073),\n", - " (u'goes', 0.59710003352803476),\n", - " (u'charge', 0.59649450823617922),\n", - " (u'murder', 0.59481219799970608),\n", - " (u'plain', 0.58605571895428143),\n", - " (u'us', 0.58221475738787731),\n", - " (u'town', 0.58012788024267969),\n", - " (u'house', 0.57887700145220933),\n", - " (u'indiana', 0.57824649026730324),\n", - " (u'politicians', 0.57824649026730324),\n", - " (u'evidence', 0.5773563125339356),\n", - " (u'united', 0.5773563125339356),\n", - " (u'_lincoln_', 0.57724903182531806),\n", - " (u'dear', 0.57724903182531762),\n", - " (u'we', 0.57676182641453266),\n", - " (u'based', 
0.57524724626967227),\n", - " (u'farmer', 0.57524724626967227),\n", - " (u'statute', 0.57524724626967227),\n", - " (u'your', 0.57484862840063666),\n", - " (u'm', 0.56962689226253538),\n", - " (u'testimony', 0.56654837085837473),\n", - " (u't', 0.56572841285799047),\n", - " (u'why', 0.56025440339254207),\n", - " (u'judge', 0.55838252725925042),\n", - " (u'questions', 0.55586577584685459),\n", - " (u'politician', 0.55230686834859721),\n", - " (u'poor', 0.55230686834859721),\n", - " (u'mr', 0.55178896100960007),\n", - " (u'illustrated', 0.55089119891374239),\n", - " (u'south', 0.55089119891374239),\n", - " (u'bad', 0.55041258498620138),\n", - " (u'wrong', 0.54883327250182479),\n", - " (u'attorneys', 0.54775614656095595),\n", - " (u'sure', 0.54763172911448921),\n", - " (u'nomination', 0.5437356699215532),\n", - " (u'coat', 0.53994711260662109),\n", - " (u'corporation', 0.53994711260662109),\n", - " (u'hapgood', 0.53994711260662109),\n", - " (u'larger', 0.53994711260662109),\n", - " (u'match', 0.53994711260662109),\n", - " (u'matteson', 0.53994711260662109),\n", - " (u'requirements', 0.53994711260662109),\n", - " (u'team', 0.53994711260662109),\n", - " (u'understanding', 0.53994711260662109),\n", - " (u'speech', 0.53442414980620789),\n", - " (u'carpenter', 0.5334454691791124),\n", - " (u'dennis', 0.5334454691791124),\n", - " (u'funds', 0.5334454691791124),\n", - " (u'papers', 0.5334454691791124),\n", - " (u'since', 0.5334454691791124),\n", - " (u'stage', 0.5334454691791124),\n", - " (u'jackson', 0.53344546917911195),\n", - " (u'duff', 0.52813378536617561),\n", - " (u'hat', 0.52813378536617561),\n", - " (u'wants', 0.52813378536617561),\n", - " (u'services', 0.52723560020577054),\n", - " (u'_mcclure', 0.5256640649145532),\n", - " (u'borrowed', 0.5256640649145532),\n", - " (u'bush', 0.5256640649145532),\n", - " (u'collect', 0.5256640649145532),\n", - " (u'demand', 0.5256640649145532),\n", - " (u'examined', 0.5256640649145532),\n", - " (u'majority', 0.5256640649145532),\n", 
- " (u'marshall', 0.5256640649145532),\n", - " (u'mean', 0.5256640649145532),\n", - " (u'n', 0.5256640649145532),\n", - " (u'partners', 0.5256640649145532),\n", - " (u'thompson', 0.5256640649145532),\n", - " (u'voting', 0.5256640649145532),\n", - " (u'worn', 0.5256640649145532),\n", - " (u'night', 0.52052348222604294),\n", - " (u'prove', 0.5183559396399664),\n", - " (u'records', 0.5183559396399664),\n", - " (u'woman', 0.5183559396399664),\n", - " (u'honesty', 0.51731221687018003),\n", - " (u'cent', 0.51717897687248238),\n", - " (u'curtis', 0.51717897687248238),\n", - " (u'daniel', 0.51717897687248238),\n", - " (u'examination', 0.51717897687248238),\n", - " (u'harris', 0.51717897687248238),\n", - " (u'st', 0.51717897687248238),\n", - " (u'o', 0.51599086537547656),\n", - " (u'white', 0.51014351178890438),\n", - " (u'law', 0.5078144592914029),\n", - " (u'our', 0.5025203678086001),\n", - " (u'cast', 0.50045126022927233),\n", - " (u'thousand', 0.50045126022927233),\n", - " (u'assembly', 0.49933052273546652),\n", - " (u'dr', 0.49933052273546652),\n", - " (u'recollections', 0.49933052273546652),\n", - " (u'whigs', 0.49933052273546652),\n", - " (u'capital', 0.49847712589366155),\n", - " (u'black', 0.49381656235726812),\n", - " (u'attorney', 0.49089323167751253),\n", - " (u'verdict', 0.48882938154224975),\n", - " (u'home', 0.4875980689579178),\n", - " (u'cause', 0.48750152519474277),\n", - " (u'_boy', 0.47944759251401625),\n", - " (u'_ibid', 0.47944759251401625),\n", - " (u'_times_', 0.47944759251401625),\n", - " (u'_woman', 0.47944759251401625),\n", - " (u'angel', 0.47944759251401625),\n", - " (u'anticipated', 0.47944759251401625),\n", - " (u'anxiety', 0.47944759251401625),\n", - " (u'argue', 0.47944759251401625),\n", - " (u'arguments', 0.47944759251401625),\n", - " (u'atlantic', 0.47944759251401625),\n", - " (u'average', 0.47944759251401625),\n", - " (u'aversion', 0.47944759251401625),\n", - " (u'bankruptcy', 0.47944759251401625),\n", - " (u'beale', 
0.47944759251401625),\n", - " (u'blackwell', 0.47944759251401625),\n", - " (u'breach', 0.47944759251401625),\n", - " (u'brougham', 0.47944759251401625),\n", - " (u'browning', 0.47944759251401625),\n", - " (u'careful', 0.47944759251401625),\n", - " (u'celebrated', 0.47944759251401625),\n", - " (u'civic', 0.47944759251401625),\n", - " (u'cloak', 0.47944759251401625),\n", - " (u'club', 0.47944759251401625),\n", - " (u'companion_', 0.47944759251401625),\n", - " (u'conception', 0.47944759251401625),\n", - " (u'conor', 0.47944759251401625),\n", - " (u'constituents', 0.47944759251401625),\n", - " (u'convincing', 0.47944759251401625),\n", - " (u'differed', 0.47944759251401625),\n", - " (u'discredited', 0.47944759251401625),\n", - " (u'dispute', 0.47944759251401625),\n", - " (u'dissolution', 0.47944759251401625),\n", - " (u'double', 0.47944759251401625),\n", - " (u'draft', 0.47944759251401625),\n", - " (u'employer', 0.47944759251401625),\n", - " (u'essay', 0.47944759251401625),\n", - " (u'etc', 0.47944759251401625),\n", - " (u'exercised', 0.47944759251401625),\n", - " (u'existed', 0.47944759251401625),\n", - " (u'farming', 0.47944759251401625),\n", - " (u'fellows', 0.47944759251401625),\n", - " (u'forge', 0.47944759251401625),\n", - " (u'fuller', 0.47944759251401625),\n", - " (u'gen', 0.47944759251401625),\n", - " (u'generation', 0.47944759251401625),\n", - " (u'giant', 0.47944759251401625),\n", - " (u'girl', 0.47944759251401625),\n", - " (u'glance', 0.47944759251401625),\n", - " (u'graham', 0.47944759251401625),\n", - " (u'grand', 0.47944759251401625),\n", - " (u'hall', 0.47944759251401625),\n", - " (u'hammond', 0.47944759251401625),\n", - " (u'handkerchief', 0.47944759251401625),\n", - " (u'hannah', 0.47944759251401625),\n", - " (u'happens', 0.47944759251401625),\n", - " (u'harding', 0.47944759251401625),\n", - " ...]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - 
"mz_keywords(text,scores=True,weighted=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When this option is used, it's possible to calculate a threshold automatically from the number of blocks. This is likely to be most useful when the number of blocks is fairly small (<10)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[(u'gutenberg', 3.7766363961259684),\n", - " (u'tm', 3.6403066998316511),\n", - " (u'project', 3.5428530523255342),\n", - " (u'co', 3.2983688146004528),\n", - " (u'donations', 2.8613536046553563),\n", - " (u'electronic', 2.8210861922674084),\n", - " (u'access', 2.7810662866642568),\n", - " (u'refund', 2.7810662866642568),\n", - " (u'foundation', 2.7234464816769872),\n", - " (u'foxboro', 2.5477601487545121),\n", - " (u'gloves', 2.5281337853661761),\n", - " (u'e', 2.4036269322210768),\n", - " (u'york', 2.3692008259770594),\n", - " (u'edited', 2.361641829495754),\n", - " (u'_works_', 2.3445174072327686),\n", - " (u'works', 2.3426500474551113),\n", - " (u'dogskin', 2.3425994588269479),\n", - " (u'ragsdale', 2.2931552327841351),\n", - " (u'replacement', 2.2931552327841351),\n", - " (u'trunks', 2.2931552327841351),\n", - " (u'iv', 2.2510299269025058),\n", - " (u'iii', 2.2186807817292546),\n", - " (u'v', 2.2168420707754368),\n", - " (u'brokaw', 2.1699176369612583),\n", - " (u'coon', 2.1699176369612583),\n", - " (u'bonds', 2.1343080503770544),\n", - " (u'license', 2.1009287665795293),\n", - " (u'ii', 2.0892470886183649),\n", - " (u'agreement', 2.0779209847210556),\n", - " (u'almanac', 2.0060727272918055),\n", - " (u'_weekly_', 1.9794475925140163),\n", - " (u'bounded', 1.9794475925140163),\n", - " (u'format', 1.9794475925140163),\n", - " (u'millions', 1.9794475925140163),\n", - " (u'oxen', 1.9794475925140163),\n", - " (u'specie', 1.9794475925140163),\n", - " (u'archive', 1.9682995275030786),\n", - " (u'barrett', 
1.9422319940872796),\n", - " (u'reminiscences', 1.9330537427622287),\n", - " (u'ebooks', 1.8984698469769548),\n", - " (u'forquer', 1.8843080503770544),\n", - " (u'parker', 1.8843080503770544),\n", - " (u'pglaf', 1.8843080503770544),\n", - " (u'ebook', 1.8838775575675983),\n", - " (u'trademark', 1.8838775575675983),\n", - " (u'paragraph', 1.8301079379685583),\n", - " (u'hardin', 1.7669683658081703),\n", - " (u'work', 1.7328354724344326),\n", - " (u'rothschild', 1.7275730939964973),\n", - " (u'org', 1.7211393195188851),\n", - " (u'attitude', 1.716230650790012),\n", - " (u'london', 1.6791112857988695),\n", - " (u'boston', 1.6754810009833907),\n", - " (u'xvi', 1.66018729770736),\n", - " (u'news', 1.6601872977073597),\n", - " (u'biographical', 1.6294643147000225),\n", - " (u'green', 1.6254512602292723),\n", - " (u'delegates', 1.6127555612626692),\n", - " (u'medium', 1.6127555612626692),\n", - " (u'scripps', 1.6127555612626692),\n", - " (u'volunteers', 1.6127555612626692),\n", - " (u'lamon', 1.6001560607245646),\n", - " (u'tarbell', 1.5897346234235084),\n", - " (u'volumes', 1.5819481863246514),\n", - " (u'bank', 1.5744728128489647),\n", - " (u'copyright', 1.5731550611734115),\n", - " (u'_via_', 1.5722781569106761),\n", - " (u'admissibility', 1.5722781569106761),\n", - " (u'advertisers', 1.5722781569106761),\n", - " (u'applicable', 1.5722781569106761),\n", - " (u'attire', 1.5722781569106761),\n", - " (u'bags', 1.5722781569106761),\n", - " (u'berries', 1.5722781569106761),\n", - " (u'breeches', 1.5722781569106761),\n", - " (u'cline', 1.5722781569106761),\n", - " (u'continuance', 1.5722781569106761),\n", - " (u'currents', 1.5722781569106761),\n", - " (u'daguerreotype', 1.5722781569106761),\n", - " (u'disclaimer', 1.5722781569106761),\n", - " (u'email', 1.5722781569106761),\n", - " (u'enrolled', 1.5722781569106761),\n", - " (u'fool', 1.5722781569106761),\n", - " (u'guineas', 1.5722781569106761),\n", - " (u'hatchet', 1.5722781569106761),\n", - " (u'instruct', 
1.5722781569106761),\n", - " (u'liability', 1.5722781569106761),\n", - " (u'lonny', 1.5722781569106761),\n", - " (u'paullin', 1.5722781569106761),\n", - " (u'performing', 1.5722781569106761),\n", - " (u'plow', 1.5722781569106761),\n", - " (u'polite', 1.5722781569106761),\n", - " (u'puffs', 1.5722781569106761),\n", - " (u'rulings', 1.5722781569106761),\n", - " (u'scammon', 1.5722781569106761),\n", - " (u'tilda', 1.5722781569106761),\n", - " (u'wake', 1.5722781569106761),\n", - " (u'warranties', 1.5722781569106761),\n", - " (u'america', 1.5712271378967728),\n", - " (u'clair', 1.5712271378967728),\n", - " (u'displaying', 1.5712271378967728),\n", - " (u'forgery', 1.5712271378967728),\n", - " (u'holder', 1.5712271378967728),\n", - " (u'posted', 1.5712271378967728),\n", - " (u'sketches', 1.5712271378967728),\n", - " (u'snow', 1.5712271378967728),\n", - " (u'wore', 1.5712271378967728),\n", - " (u'http', 1.5645865830262038),\n", - " (u'journalism', 1.5399471126066209),\n", - " (u'copy', 1.5258495075146912),\n", - " (u'_early', 1.5202411939312348),\n", - " (u'armstrong', 1.5106440743450187),\n", - " (u'railroad', 1.4938165623572677),\n", - " (u'ross', 1.489097832809857),\n", - " (u'pair', 1.4791112857988695),\n", - " (u'banks', 1.4791112857988693),\n", - " (u'irelan', 1.4791112857988693),\n", - " (u'scott', 1.4791112857988693),\n", - " (u'browne', 1.4764336408243595),\n", - " (u'abraham', 1.4577679329151634),\n", - " (u'publication', 1.4490612388306794),\n", - " (u'provide', 1.4490612388306792),\n", - " (u'chiniquy', 1.4275140308616106),\n", - " (u'literary', 1.4150354420715021),\n", - " (u'rr', 1.4070491486733681),\n", - " (u'axe', 1.3967912341407889),\n", - " (u'fence', 1.3967912341407889),\n", - " (u'genuine', 1.3967912341407889),\n", - " (u'life_', 1.3941370904272503),\n", - " (u'she', 1.3923582867044937),\n", - " (u'copper', 1.3828069220574104),\n", - " (u'distributing', 1.3828069220574104),\n", - " (u'saddle', 1.3828069220574104),\n", - " (u'sons', 
1.3828069220574104),\n", - " (u'_life_', 1.373910241709706),\n", - " (u'calhoun', 1.373910241709706),\n", - " (u'mother', 1.3728688332198922),\n", - " (u'college', 1.3697302821858961),\n", - " (u'nicolay', 1.3633245760231363),\n", - " (u'whitney', 1.3627575629840512),\n", - " (u'philadelphia', 1.3540886863558637),\n", - " (u'sarah', 1.3540886863558634),\n", - " (u'vi', 1.3540886863558634),\n", - " (u'harrison', 1.3476159735283106),\n", - " (u'terms', 1.3426509824683515),\n", - " (u'herndon', 1.3421892681433798),\n", - " (u'improvement', 1.329344333012155),\n", - " (u'buckskin', 1.3222046383294666),\n", - " (u'sham', 1.3222046383294666),\n", - " (u'fee', 1.3158554460066139),\n", - " (u'generosity', 1.3144503596878891),\n", - " (u'moore', 1.3144503596878887),\n", - " (u'copies', 1.3127747798184011),\n", - " (u'p', 1.309088202039181),\n", - " (u'compliance', 1.2961309813666892),\n", - " (u'constable', 1.2961309813666892),\n", - " (u'currency', 1.2961309813666892),\n", - " (u'distribution', 1.2961309813666892),\n", - " (u'harvey', 1.2961309813666892),\n", - " (u'individual', 1.2961309813666892),\n", - " (u'revolutionary', 1.2961309813666892),\n", - " (u'brooks', 1.286562189794501),\n", - " (u'chicago', 1.2700186510810929),\n", - " (u'weems', 1.2659709073661847),\n", - " (u'february', 1.2574199029295277),\n", - " (u'information', 1.2487001310514776),\n", - " (u'bridge', 1.2326416539256813),\n", - " (u'resolution', 1.2268390166084573),\n", - " (u'stoddard', 1.2268390166084573),\n", - " (u'father', 1.2254034208363418),\n", - " (u'cartwright', 1.2157428532629155),\n", - " (u'houghton', 1.2157428532629155),\n", - " (u'publishing', 1.2157428532629155),\n", - " (u'describes', 1.2157428532629153),\n", - " (u'j', 1.2115310804189017),\n", - " (u'_stories_', 1.2049337080807629),\n", - " (u'september', 1.2030636155192291),\n", - " (u'boys', 1.1974364414369618),\n", - " (u'defendants', 1.1955861748361873),\n", - " (u'per', 1.1955861748361873),\n", - " (u'permission', 
1.1955861748361873),\n", - " (u'uncle', 1.1955861748361873),\n", - " (u'thomas', 1.1924565577943991),\n", - " (u'trade', 1.1918333507609624),\n", - " (u'f', 1.1915163381561049),\n", - " (u'store', 1.189052998865439),\n", - " (u'notes', 1.1850922942502753),\n", - " (u'baker', 1.1828856976412236),\n", - " (u'baddeley', 1.1681694680548835),\n", - " (u'cogdal', 1.1681694680548835),\n", - " (u'copying', 1.1681694680548835),\n", - " (u'crafton', 1.1681694680548835),\n", - " (u'defect', 1.1681694680548835),\n", - " (u'donate', 1.1681694680548835),\n", - " (u'easier', 1.1681694680548835),\n", - " (u'editions', 1.1681694680548835),\n", - " (u'hawley', 1.1681694680548835),\n", - " (u'hitchcock', 1.1681694680548835),\n", - " (u'jake', 1.1681694680548835),\n", - " (u'jewelry', 1.1681694680548835),\n", - " (u'jurors', 1.1681694680548835),\n", - " (u'lightning', 1.1681694680548835),\n", - " (u'machine', 1.1681694680548835),\n", - " (u'paragraphs', 1.1681694680548835),\n", - " (u'pg', 1.1681694680548835),\n", - " (u'pork', 1.1681694680548835),\n", - " (u'retains', 1.1681694680548835),\n", - " (u'rod', 1.1681694680548835),\n", - " (u'securities', 1.1681694680548835),\n", - " (u'status', 1.1681694680548835),\n", - " (u'trousers', 1.1681694680548835),\n", - " (u'unpublished', 1.1681694680548835),\n", - " (u'berry', 1.1644932670010606),\n", - " (u'pp', 1.1608077284905565),\n", - " (u'hanks', 1.1587285139891437),\n", - " (u'mcclure', 1.1537352404836496),\n", - " (u'her', 1.1531891574151381),\n", - " (u'hamlin', 1.1529222466025137),\n", - " (u'speeches', 1.1437050469373577),\n", - " (u'kentucky', 1.1401563236722736),\n", - " (u'johnston', 1.1368073989967304),\n", - " (u'offutt', 1.1345503657246403),\n", - " (u'dress', 1.1343080503770544),\n", - " (u'german', 1.1343080503770544),\n", - " (u'matheney', 1.1343080503770544),\n", - " (u'company', 1.1298148326748745),\n", - " (u'g', 1.128517881924167),\n", - " (u'votes', 1.1187730676938106),\n", - " (u'nine', 1.113374076177045),\n", - " 
(u'charles', 1.1065580194728426),\n", - " (u'note', 1.0974655406391749),\n", - " (u'deed', 1.0970926363431248),\n", - " (u'east', 1.0970926363431248),\n", - " (u'spurious', 1.0970926363431248),\n", - " (u'atkinson', 1.0970926363431244),\n", - " (u'comply', 1.0970926363431244),\n", - " (u'jewelers', 1.0970926363431244),\n", - " (u'leland', 1.0970926363431244),\n", - " (u'priest', 1.0970926363431244),\n", - " (u'soldier', 1.0970926363431244),\n", - " (u'd', 1.0936709970367389),\n", - " (u'tax', 1.0890978328098568),\n", - " (u'colonel', 1.0886122317272675),\n", - " (u'pitcher', 1.0886122317272675),\n", - " (u'spink', 1.0886122317272675),\n", - " (u'charter', 1.0886122317272673),\n", - " (u'clock', 1.0886122317272673),\n", - " (u'distribute', 1.0886122317272673),\n", - " (u'fisher', 1.0886122317272673),\n", - " (u'convention', 1.0842245322470756),\n", - " (u'plaintiff', 1.0813648643938589),\n", - " (u'island', 1.0791112857988696),\n", - " (u'voyage', 1.0772490318253176),\n", - " (u'you', 1.0716742799027257),\n", - " (u'road', 1.0587290524017576),\n", - " (u'holland', 1.05373524048365),\n", - " (u'trailor', 1.0479900750043671),\n", - " (u'limited', 1.0447190713617185),\n", - " (u'domain', 1.0399471126066209),\n", - " (u'grandfather', 1.0399471126066209),\n", - " (u'voted', 1.0399471126066209),\n", - " (u'agree', 1.0367857078081339),\n", - " (u'including', 1.0367857078081339),\n", - " (u'life', 1.0279778291629844),\n", - " (u'witness', 1.0249646422762066),\n", - " (u'james', 1.0153080476245506),\n", - " (u'stuart', 1.0149104889383316),\n", - " (u'dungee', 1.0102738780733427),\n", - " (u'john', 1.0074378828094916),\n", - " (u'surveyor', 1.0071083505332288),\n", - " (u'cross', 1.0008479040802145),\n", - " (u'dollars', 1.0002448365299736),\n", - " (u'president', 0.99828026284480487),\n", - " (u'_amount_', 0.99450922395310026),\n", - " (u'_black', 0.99450922395310026),\n", - " (u'_commercial', 0.99450922395310026),\n", - " (u'_magazine', 0.99450922395310026),\n", - " 
(u'_nicolay', 0.99450922395310026),\n", - " (u'_north', 0.99450922395310026),\n", - " (u'_sun_', 0.99450922395310026),\n", - " (u'accompanies', 0.99450922395310026),\n", - " (u'accordance', 0.99450922395310026),\n", - " (u'adjourning', 0.99450922395310026),\n", - " (u'advertiser', 0.99450922395310026),\n", - " (u'advertiser_', 0.99450922395310026),\n", - " (u'agnosticism', 0.99450922395310026),\n", - " (u'almanacs', 0.99450922395310026),\n", - " (u'animals', 0.99450922395310026),\n", - " (u'apparel', 0.99450922395310026),\n", - " (u'appoints', 0.99450922395310026),\n", - " (u'arbitrations', 0.99450922395310026),\n", - " (u'ascii', 0.99450922395310026),\n", - " (u'asks', 0.99450922395310026),\n", - " (u'aspirants', 0.99450922395310026),\n", - " (u'atrocious', 0.99450922395310026),\n", - " (u'attachment', 0.99450922395310026),\n", - " (u'authors', 0.99450922395310026),\n", - " (u'band', 0.99450922395310026),\n", - " (u'bargained', 0.99450922395310026),\n", - " (u'bets', 0.99450922395310026),\n", - " (u'bleeding', 0.99450922395310026),\n", - " (u'boats', 0.99450922395310026),\n", - " (u'book_', 0.99450922395310026),\n", - " (u'boss', 0.99450922395310026),\n", - " (u'bourgeois', 0.99450922395310026),\n", - " (u'bull', 0.99450922395310026),\n", - " (u'calf', 0.99450922395310026),\n", - " (u'chase', 0.99450922395310026),\n", - " (u'chicanery', 0.99450922395310026),\n", - " (u'coach', 0.99450922395310026),\n", - " (u'coins', 0.99450922395310026),\n", - " (u'comet', 0.99450922395310026),\n", - " (u'computer', 0.99450922395310026),\n", - " (u'computers', 0.99450922395310026),\n", - " (u'concentration', 0.99450922395310026),\n", - " (u'conquering', 0.99450922395310026),\n", - " (u'conservator', 0.99450922395310026),\n", - " (u'contentedly', 0.99450922395310026),\n", - " (u'copied', 0.99450922395310026),\n", - " (u'cord', 0.99450922395310026),\n", - " (u'cornell', 0.99450922395310026),\n", - " (u'countenance', 0.99450922395310026),\n", - " (u'counting', 
0.99450922395310026),\n", - " (u'countryman', 0.99450922395310026),\n", - " (u'creeks', 0.99450922395310026),\n", - " (u'davy', 0.99450922395310026),\n", - " (u'deer', 0.99450922395310026),\n", - " (u'def', 0.99450922395310026),\n", - " (u'delegations', 0.99450922395310026),\n", - " (u'deliveries', 0.99450922395310026),\n", - " (u'demurrer', 0.99450922395310026),\n", - " (u'desires', 0.99450922395310026),\n", - " (u'detriment', 0.99450922395310026),\n", - " (u'directors', 0.99450922395310026),\n", - " (u'disallows', 0.99450922395310026),\n", - " (u'disgracing', 0.99450922395310026),\n", - " (u'doctoring', 0.99450922395310026),\n", - " (u'effectively', 0.99450922395310026),\n", - " (u'elections', 0.99450922395310026),\n", - " (u'electronically', 0.99450922395310026),\n", - " (u'enrolling', 0.99450922395310026),\n", - " (u'exempt', 0.99450922395310026),\n", - " (u'faded', 0.99450922395310026),\n", - " (u'fares', 0.99450922395310026),\n", - " (u'ff', 0.99450922395310026),\n", - " (u'fights', 0.99450922395310026),\n", - " (u'flatboat', 0.99450922395310026),\n", - " (u'founded', 0.99450922395310026),\n", - " (u'generals', 0.99450922395310026),\n", - " (u'goose', 0.99450922395310026),\n", - " (u'greed', 0.99450922395310026),\n", - " (u'groomsman', 0.99450922395310026),\n", - " (u'hagerty', 0.99450922395310026),\n", - " (u'hans', 0.99450922395310026),\n", - " (u'harvard', 0.99450922395310026),\n", - " (u'haute', 0.99450922395310026),\n", - " (u'heel', 0.99450922395310026),\n", - " (u'history_', 0.99450922395310026),\n", - " (u'homeliest', 0.99450922395310026),\n", - " (u'howard', 0.99450922395310026),\n", - " (u'hut', 0.99450922395310026),\n", - " (u'ice', 0.99450922395310026),\n", - " (u'ida', 0.99450922395310026),\n", - " (u'identical', 0.99450922395310026),\n", - " (u'imperialist', 0.99450922395310026),\n", - " (u'independent', 0.99450922395310026),\n", - " (u'invalid', 0.99450922395310026),\n", - " (u'irons', 0.99450922395310026),\n", - " (u'janet', 
0.99450922395310026),\n", - " (u'justification', 0.99450922395310026),\n", - " (u'lamborn', 0.99450922395310026),\n", - " (u'lambs', 0.99450922395310026),\n", - " (u'larceny', 0.99450922395310026),\n", - " (u'latin', 0.99450922395310026),\n", - " (u'linen', 0.99450922395310026),\n", - " (u'locations', 0.99450922395310026),\n", - " (u'louder', 0.99450922395310026),\n", - " (u'mad', 0.99450922395310026),\n", - " (u'magruder', 0.99450922395310026),\n", - " (u'maid', 0.99450922395310026),\n", - " (u'metaphysical', 0.99450922395310026),\n", - " (u'mit', 0.99450922395310026),\n", - " (u'monthlies', 0.99450922395310026),\n", - " (u'nest', 0.99450922395310026),\n", - " (u'nigger', 0.99450922395310026),\n", - " (u'package', 0.99450922395310026),\n", - " (u'pan', 0.99450922395310026),\n", - " (u'parentage', 0.99450922395310026),\n", - " (u'partial', 0.99450922395310026),\n", - " (u'partly', 0.99450922395310026),\n", - " (u'passengers', 0.99450922395310026),\n", - " (u'pension', 0.99450922395310026),\n", - " (u'pl', 0.99450922395310026),\n", - " (u'playful', 0.99450922395310026),\n", - " (u'population', 0.99450922395310026),\n", - " (u'postponed', 0.99450922395310026),\n", - " (u'postponement', 0.99450922395310026),\n", - " (u'premise', 0.99450922395310026),\n", - " (u'pressure', 0.99450922395310026),\n", - " (u'presumption', 0.99450922395310026),\n", - " (u'preventing', 0.99450922395310026),\n", - " (u'quart', 0.99450922395310026),\n", - " (u'quincy', 0.99450922395310026),\n", - " (u'quorum', 0.99450922395310026),\n", - " (u'redistribution', 0.99450922395310026),\n", - " (u'rejoicing', 0.99450922395310026),\n", - " (u'remit', 0.99450922395310026),\n", - " (u'rifle', 0.99450922395310026),\n", - " (u'romance', 0.99450922395310026),\n", - " (u'rothschild_', 0.99450922395310026),\n", - " (u'row', 0.99450922395310026),\n", - " (u'rubbish', 0.99450922395310026),\n", - " (u'sacrifices', 0.99450922395310026),\n", - " (u'scroll', 0.99450922395310026),\n", - " (u'shade', 
0.99450922395310026),\n", - " (u'shed', 0.99450922395310026),\n", - " (u'sigh', 0.99450922395310026),\n", - " (u'silk', 0.99450922395310026),\n", - " (u'sinewy', 0.99450922395310026),\n", - " (u'sock', 0.99450922395310026),\n", - " (u'solicit', 0.99450922395310026),\n", - " (u'solvent', 0.99450922395310026),\n", - " (u'sonny', 0.99450922395310026),\n", - " (u'startling', 0.99450922395310026),\n", - " (u'steals', 0.99450922395310026),\n", - " (u'steamer', 0.99450922395310026),\n", - " (u'stevenson', 0.99450922395310026),\n", - " (u'subp\\u0153naed', 0.99450922395310026),\n", - " (u'tanned', 0.99450922395310026),\n", - " (u'tea', 0.99450922395310026),\n", - " (u'terre', 0.99450922395310026),\n", - " (u'theosophy', 0.99450922395310026),\n", - " (u'tight', 0.99450922395310026),\n", - " (u'tis', 0.99450922395310026),\n", - " (u'tour', 0.99450922395310026),\n", - " (u'vanilla', 0.99450922395310026),\n", - " (u'vol', 0.99450922395310026),\n", - " (u'warfare', 0.99450922395310026),\n", - " (u'warranty', 0.99450922395310026),\n", - " (u'wayne', 0.99450922395310026),\n", - " (u'whip', 0.99450922395310026),\n", - " (u'woodcut', 0.99450922395310026),\n", - " (u'wright', 0.99450922395310026),\n", - " (u'new', 0.99212250974463601)]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mz_keywords(text,scores=True,weighted=False,threshold='auto')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The expected complexity of this algorithm is **O**(*Nw*), where *N* is the number of words in the text, and *w* is the number of unique words in the text." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 195c3c016208c18a0d00cd109574070aeb44550f Mon Sep 17 00:00:00 2001 From: Pete Date: Mon, 27 Nov 2017 14:51:55 +0000 Subject: [PATCH 06/26] Summarization tutorial --- .../summarization_tutorial-checkpoint.ipynb | 209 +++++-- docs/notebooks/summarization_tutorial.ipynb | 512 ++++++++++++++++++ 2 files changed, 670 insertions(+), 51 deletions(-) create mode 100644 docs/notebooks/summarization_tutorial.ipynb diff --git a/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb index 3e4c3f1302..20fdd925b0 100644 --- a/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb +++ b/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb @@ -22,11 +22,21 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,358 : INFO : Pattern library is not installed, lemmatization won't be available.\n", + "2016-09-19 12:45:22,361 : INFO : Could not import Theano, will use standard float for default ShardedCorpus dtype.\n", + "2016-09-19 12:45:22,372 : INFO : 'pattern' package not found; tag filters are not available for English\n" + ] + } + ], "source": [ "import logging\n", "logging.basicConfig(format='%(asctime)s : %(levelname)s : 
%(message)s', level=logging.INFO)\n", @@ -43,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -73,8 +83,8 @@ " \"programs devoted to snuffing out Neo and the entire human \" + \\\n", " \"rebellion. \"\n", "\n", - "print 'Input text:'\n", - "print text" + "print ('Input text:')\n", + "print (text)" ] }, { @@ -88,23 +98,33 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,405 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,405 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,406 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,406 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Summary:\n", - "By day he is an average computer programmer and by night a hacker known as Neo. 
Neo has always questioned his reality, but the truth is far beyond his imagination.\n" + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" ] } ], "source": [ - "print 'Summary:'\n", - "print summarize(text)" + "print ('Summary:')\n", + "print (summarize(text))" ] }, { @@ -116,21 +136,31 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,428 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,428 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,429 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,430 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "['By day he is an average computer programmer and by night a hacker known as Neo. 
Neo has always questioned his reality, but the truth is far beyond his imagination.']\n" + "[\"Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\"]\n" ] } ], "source": [ - "print summarize(text, split=True)" + "print (summarize(text, split=True))" ] }, { @@ -142,11 +172,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,446 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,446 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,447 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,447 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -159,8 +199,8 @@ } ], "source": [ - "print 'Summary:'\n", - "print summarize(text, ratio=0.5)" + "print ('Summary:')\n", + "print (summarize(text, ratio=0.5))" ] }, { @@ -172,23 +212,33 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,463 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,464 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,464 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,465 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ "Summary:\n", - "By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.\n" + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" ] } ], "source": [ - "print 'Summary:'\n", - "print summarize(text, word_count=50)" + "print ('Summary:')\n", + "print (summarize(text, word_count=50))" ] }, { @@ -200,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -215,16 +265,16 @@ "neo\n", "humans body\n", "super\n", - "hacker\n", - "reality\n" + "reality\n", + "hacker\n" ] } ], "source": [ "from gensim.summarization import keywords\n", "\n", - "print 'Keywords:'\n", - "print keywords(text)" + "print ('Keywords:')\n", + "print (keywords(text))" ] }, { @@ -240,11 +290,20 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,510 : INFO : Starting new HTTP connection (1): rare-technologies.com\n", + "2016-09-19 12:45:23,035 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:23,042 : INFO : built Dictionary(1093 unique tokens: ['realiti', 'keanu', 'miseri', 'vestig', 'massiv']...) 
from 416 documents (total 2985 corpus positions)\n" + ] + }, { "name": "stdout", "output_type": "stream", @@ -274,11 +333,11 @@ "\n", "text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", "\n", - "print 'Summary:'\n", - "print summarize(text, ratio=0.01)\n", + "print ('Summary:')\n", + "print (summarize(text, ratio=0.01))\n", "\n", - "print '\\nKeywords:'\n", - "print keywords(text, ratio=0.01)" + "print ('\\nKeywords:')\n", + "print (keywords(text, ratio=0.01))" ] }, { @@ -296,28 +355,16 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "collapsed": false }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Summary:\n", - "The answering machine records a woman introducing herself as Maude Lebowski and saying that she is the one who took his rug and has sent a car to pick Dude up at his apartment.\n", - "As he climbs out of bed to make a White Russian, Maude asks about the apartment and Dude explains that Treehorn's thugs most likely vandalized it looking for Lebowski's money.\n", - "\n", - "Keywords:\n", - "dude\n", - "dudes\n", - "walter\n", - "lebowski\n", - "brandt\n", - "maude\n", - "donny\n", - "bunny\n" + "2016-09-19 12:45:25,227 : INFO : Starting new HTTP connection (1): rare-technologies.com\n" ] } ], @@ -326,11 +373,11 @@ "\n", "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\n", "\n", - "print 'Summary:'\n", - "print summarize(text, ratio=0.01)\n", + "print ('Summary:')\n", + "print (summarize(text, ratio=0.01))\n", "\n", - "print '\\nKeywords:'\n", - "print keywords(text, ratio=0.01)" + "print ('\\nKeywords:')\n", + "print (keywords(text, ratio=0.01))" ] }, { @@ -367,8 +414,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Montemurro and Zanette's Entropy-based keyword algorithm\n", - "[This algorithm](https://arxiv.org/abs/0907.1558) finds keywords based on their contribution to 
the structure of the document on large scales. " + "## Montemurro and Zanette's entropy based keyword extraction algorithm\n", + "\n", + "[This paper](https://arxiv.org/abs/0907.1558) describes a technique to identify words that play a significant role in the large-scale structure of a text. These typically correspond to the major themes of the text. The text is divided into blocks of ~1000 words, and the entropy of each word's distribution amongst the blocks is\n", + "caclulated and compared with the expected entropy if the word were distributed randomly." ] }, { @@ -378,7 +427,65 @@ "collapsed": true }, "outputs": [], - "source": [] + "source": [ + "import requests\n", + "from gensim.summarization import mz_keywords" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "text=requests.get(\"http://www.gutenberg.org/files/49679/49679-0.txt\").text\n", + "mz_keywords(text,scores=True,threshold=0.005)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the algorithm weights the entropy by the overall frequency of the word in the document. 
We can remove this weighting by setting weighted=False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mz_keywords(text,scores=True,weighted=False,threshold=1.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When this option is used, it is possible to calculate a threshold automatically from the number of blocks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mz_keywords(text,scores=True,weighted=False,threshold=\"auto\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The complexity of the algorithm is **O**(*Nw*), where *N* is the number of words in the document and *w* is the number of unique words." + ] } ], "metadata": { diff --git a/docs/notebooks/summarization_tutorial.ipynb b/docs/notebooks/summarization_tutorial.ipynb new file mode 100644 index 0000000000..20fdd925b0 --- /dev/null +++ b/docs/notebooks/summarization_tutorial.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Tutorial: automatic summarization using Gensim

\n", + "\n", + "This module automatically summarizes the given text, by extracting one or more important sentences from the text. In a similar way, it can also extract keywords. This tutorial will teach you to use this summarization module via some examples. First, we will try a small example, then we will try two larger ones, and then we will review the performance of the summarizer in terms of speed.\n", + "\n", + "This summarizer is based on the \"TextRank\" algorithm, from an [article](http://web.eecs.umich.edu/%7Emihalcea/papers/mihalcea.emnlp04.pdf) by Mihalcea et al. This algorithm was later improved upon by Barrios et al. in another [article](https://raw.githubusercontent.com/summanlp/docs/master/articulo/articulo-en.pdf), by introducing something called a \"BM25 ranking function\". \n", + "\n", + "This tutorial assumes that you are familiar with Python and have [installed Gensim](http://radimrehurek.com/gensim/install.html).\n", + "\n", + "Note: Gensim's summarization only works for English for now, because the text is pre-processed so that stopwords are removed and the words are stemmed, and these processes are language-dependent.\n", + "\n", + "\n", + "

Small example

\n", + "\n", + "First of all, we import the function \"summarize\"." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,358 : INFO : Pattern library is not installed, lemmatization won't be available.\n", + "2016-09-19 12:45:22,361 : INFO : Could not import Theano, will use standard float for default ShardedCorpus dtype.\n", + "2016-09-19 12:45:22,372 : INFO : 'pattern' package not found; tag filters are not available for English\n" + ] + } + ], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", + "\n", + "from gensim.summarization import summarize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input text:\n", + "Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. 
As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion. \n" + ] + } + ], + "source": [ + "text = \"Thomas A. Anderson is a man living two lives. By day he is an \" + \\\n", + " \"average computer programmer and by night a hacker known as \" + \\\n", + " \"Neo. Neo has always questioned his reality, but the truth is \" + \\\n", + " \"far beyond his imagination. Neo finds himself targeted by the \" + \\\n", + " \"police when he is contacted by Morpheus, a legendary computer \" + \\\n", + " \"hacker branded a terrorist by the government. Morpheus awakens \" + \\\n", + " \"Neo to the real world, a ravaged wasteland where most of \" + \\\n", + " \"humanity have been captured by a race of machines that live \" + \\\n", + " \"off of the humans' body heat and electrochemical energy and \" + \\\n", + " \"who imprison their minds within an artificial reality known as \" + \\\n", + " \"the Matrix. As a rebel against the machines, Neo must return to \" + \\\n", + " \"the Matrix and confront the agents: super-powerful computer \" + \\\n", + " \"programs devoted to snuffing out Neo and the entire human \" + \\\n", + " \"rebellion. \"\n", + "\n", + "print ('Input text:')\n", + "print (text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To summarize this text, we pass the raw string data as input to the function \"summarize\", and it will return a summary.\n", + "\n", + "Note: make sure that the string does not contain any newlines where the line breaks in a sentence. A sentence with a newline in it (i.e. a carriage return, \"\\n\") will be treated as two sentences." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,405 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,405 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,406 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,406 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" + ] + } + ], + "source": [ + "print ('Summary:')\n", + "print (summarize(text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the \"split\" option if you want a list of strings instead of a single string." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,428 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,428 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,429 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,430 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\"]\n" + ] + } + ], + "source": [ + "print (summarize(text, split=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can adjust how much text the summarizer outputs via the \"ratio\" parameter or the \"word_count\" parameter. Using the \"ratio\" parameter, you specify what fraction of sentences in the original text should be returned as output. Below we specify that we want 50% of the original text (the default is 20%)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,446 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,446 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,447 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,447 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "By day he is an average computer programmer and by night a hacker known as Neo. 
Neo has always questioned his reality, but the truth is far beyond his imagination.\n", + "Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government.\n", + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" + ] + } + ], + "source": [ + "print ('Summary:')\n", + "print (summarize(text, ratio=0.5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the \"word_count\" parameter, we specify the maximum amount of words we want in the summary. Below we have specified that we want no more than 50 words." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,463 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2016-09-19 12:45:22,464 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:22,464 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", + "2016-09-19 12:45:22,465 : WARNING : Input corpus is expected to have at least 10 documents.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" + ] + } + ], + "source": [ + "print ('Summary:')\n", + "print (summarize(text, word_count=50))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As mentioned earlier, this module also supports keyword extraction. Keyword extraction works in the same way as summary generation (i.e. sentence extraction), in that the algorithm tries to find words that are important or seem representative of the entire text. The keywords are not always single words; in the case of multi-word keywords, they are typically all nouns." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Keywords:\n", + "humanity\n", + "human\n", + "neo\n", + "humans body\n", + "super\n", + "reality\n", + "hacker\n" + ] + } + ], + "source": [ + "from gensim.summarization import keywords\n", + "\n", + "print ('Keywords:')\n", + "print (keywords(text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Larger example

\n", + "\n", + "Let us try an example with a larger piece of text. We will be using a synopsis of the movie \"The Matrix\", which we have taken from [this](http://www.imdb.com/title/tt0133093/synopsis?ref_=ttpl_pl_syn) IMDb page.\n", + "\n", + "In the code below, we read the text file directly from a web-page using \"requests\". Then we produce a summary and some keywords." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:22,510 : INFO : Starting new HTTP connection (1): rare-technologies.com\n", + "2016-09-19 12:45:23,035 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2016-09-19 12:45:23,042 : INFO : built Dictionary(1093 unique tokens: ['realiti', 'keanu', 'miseri', 'vestig', 'massiv']...) from 416 documents (total 2985 corpus positions)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "Anderson, a software engineer for a Metacortex, the other life as Neo, a computer hacker \"guilty of virtually every computer crime we have a law for.\" Agent Smith asks him to help them capture Morpheus, a dangerous terrorist, in exchange for amnesty.\n", + "Morpheus explains that he's been searching for Neo his entire life and asks if Neo feels like \"Alice in Wonderland, falling down the rabbit hole.\" He explains to Neo that they exist in the Matrix, a false reality that has been constructed for humans to hide the truth.\n", + "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse (Matt Doran).\n", + "Cypher cuts up a juicy steak and ruminates that he knows the steak is merely the simulation telling his brain that it is delicious and juicy, but after nine 
years he has discovered that \"ignorance is bliss.\" He strikes a deal for the machines to reinsert his body into a power plant, reinsert him into the Matrix, and he'll help the Agents.\n", + "\n", + "Keywords:\n", + "neo\n", + "morpheus\n", + "trinity\n", + "cypher\n", + "agents\n", + "agent\n", + "smith\n", + "tank\n", + "says\n", + "saying\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", + "\n", + "print ('Summary:')\n", + "print (summarize(text, ratio=0.01))\n", + "\n", + "print ('\\nKeywords:')\n", + "print (keywords(text, ratio=0.01))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you know this movie, you see that this summary is actually quite good. We also see that some of the most important characters (Neo, Morpheus, Trinity) were extracted as keywords.\n", + "\n", + "

Another example

\n", + "\n", + "Let's try an example similar to the one above. This time, we will use the [IMDb synopsis](http://www.imdb.com/title/tt0118715/synopsis?ref_=tt_stry_pl) of \"The Big Lebowski\".\n", + "\n", + "Again, we download the text and produce a summary and some keywords." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2016-09-19 12:45:25,227 : INFO : Starting new HTTP connection (1): rare-technologies.com\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\n", + "\n", + "print ('Summary:')\n", + "print (summarize(text, ratio=0.01))\n", + "\n", + "print ('\\nKeywords:')\n", + "print (keywords(text, ratio=0.01))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time around, the summary is not of high quality, as it does not tell us much about the movie. In a way, this might not be the algorithm's fault; rather, this text simply doesn't contain one or two sentences that capture the essence of the text as in \"The Matrix\" synopsis.\n", + "\n", + "The keywords, however, managed to find some of the main characters.\n", + "\n", + "

Performance

\n", + "\n", + "We will test how the speed of the summarizer scales with the size of the dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 processor. Note that the summarizer does not support multithreading (parallel processing).\n", + "\n", + "The tests were run on the book \"Honest Abe\" by Alonzo Rothschild. Download the book in plain-text here. \n", + "\n", + "In the plot below, we see the running times together with the sizes of the datasets. To create datasets of different sizes, we have simply taken prefixes of text; in other words we take the first n characters of the book. The algorithm seems to be quadratic in time, so one needs to be careful before plugging a large dataset into the summarizer.\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "\n", + "

Text-content dependent running times

\n", + "\n", + "The running time is not only dependent on the size of the dataset. For example, summarizing \"The Matrix\" synopsis (about 36,000 characters) takes about 3.1 seconds, while summarizing 35,000 characters of this book takes about 8.5 seconds. So the former is more than twice as fast. \n", + "\n", + "One reason for this difference in running times is the data structure that is used. The algorithm represents the data using a graph, where vertices (nodes) are sentences, and then constructs weighted edges between the vertices that represent how the sentences relate to each other. This means that every piece of text will have a different graph, thus making the running times different. The size of this data structure is quadratic in the worst case (the worst case is when each vertex has an edge to every other vertex).\n", + "\n", + "Another possible reason for the difference in running times is that the problems converge at different rates, meaning that the error drops slower for some datasets than for others.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Montemurro and Zanette's entropy-based keyword extraction algorithm\n", + "\n", + "[This paper](https://arxiv.org/abs/0907.1558) describes a technique to identify words that play a significant role in the large-scale structure of a text. These typically correspond to the major themes of the text. The text is divided into blocks of ~1000 words, and the entropy of each word's distribution amongst the blocks is\n", + "calculated and compared with the expected entropy if the word were distributed randomly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import requests\n", + "from gensim.summarization import mz_keywords" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "text=requests.get(\"http://www.gutenberg.org/files/49679/49679-0.txt\").text\n", + "mz_keywords(text,scores=True,threshold=0.005)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the algorithm weights the entropy by the overall frequency of the word in the document. We can remove this weighting by setting weighted=False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mz_keywords(text,scores=True,weighted=False,threshold=1.0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When this option is used, it is possible to calculate a threshold automatically from the number of blocks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mz_keywords(text,scores=True,weighted=False,threshold=\"auto\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The complexity of the algorithm is **O**(*Nw*), where *N* is the number of words in the document and *w* is the number of unique words." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 5b9a3ad8a4ee10b4bb60e8ef096bec6fceb0ec68 Mon Sep 17 00:00:00 2001 From: Pete Date: Mon, 27 Nov 2017 15:56:44 +0000 Subject: [PATCH 07/26] Fixed some failing tests --- gensim/summarization/mz_entropy.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index e60cce8e0e..6aa2ba04e6 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -35,7 +35,7 @@ def mz_keywords(text, minimum score for returned keywords, default 0.0 'auto' calculates the threshold as nblocks / (nblocks + 1.0) + 1.0e-8 Use 'auto' with weighted=False) - + Returns ------- results: str @@ -46,13 +46,13 @@ def mz_keywords(text, list of (keyword, score) tuples if scores is True Results are returned in descending order of score regardless of the format. - + Notes ----- This algorithm looks for keywords that contribute to the structure of the text on scales of blocksize words of larger. It is suitable for extracting keywords representing the major themes of long texts. - + References ---------- [1] Marcello A Montemurro, Damian Zanette, @@ -61,7 +61,7 @@ def mz_keywords(text, Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 
135-153 DOI: 10.1142/S0219525910002530 https://arxiv.org/abs/0907.1558 - + """ text = to_unicode(text) words = [word for word in _tokenize_by_word(text)] @@ -76,7 +76,7 @@ def mz_keywords(text, nwords = totals.sum() p = wordcounts / totals logp = numpy.log2(p) - H = numpy.nan_to_num((p * logp), 0.0).sum(axis=0) + H = numpy.nan_to_num(p * logp).sum(axis=0) analytic = __analytic_entropy(blocksize, nblocks, nwords) H += analytic(totals).astype('d') if weighted: @@ -92,7 +92,6 @@ def mz_keywords(text, result = '\n'.join(result) return result - def __log_combinations_inner(n, m): """Calculates the logarithm of n!/m!(n-m)!""" return -(numpy.log(n+1)+scipy.special.betaln(n-m+1,m+1)) From 4c2d8de7f4eee16af8137c361d7971a4d3200315 Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 28 Nov 2017 09:54:56 +0000 Subject: [PATCH 08/26] Tests, demo, nan_to_num and a few last flake8 issues --- docs/notebooks/summarization_tutorial.ipynb | 885 +++++++++++++++++++- gensim/summarization/mz_entropy.py | 37 +- gensim/test/test_summarization.py | 6 +- 3 files changed, 869 insertions(+), 59 deletions(-) diff --git a/docs/notebooks/summarization_tutorial.ipynb b/docs/notebooks/summarization_tutorial.ipynb index 20fdd925b0..62e098cbfa 100644 --- a/docs/notebooks/summarization_tutorial.ipynb +++ b/docs/notebooks/summarization_tutorial.ipynb @@ -31,9 +31,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,358 : INFO : Pattern library is not installed, lemmatization won't be available.\n", - "2016-09-19 12:45:22,361 : INFO : Could not import Theano, will use standard float for default ShardedCorpus dtype.\n", - "2016-09-19 12:45:22,372 : INFO : 'pattern' package not found; tag filters are not available for English\n" + "2017-11-28 08:32:40,713 : INFO : 'pattern' package not found; tag filters are not available for English\n" ] } ], @@ -107,10 +105,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,405 : WARNING : Input text is expected to 
have at least 10 sentences.\n", - "2016-09-19 12:45:22,405 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,406 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,406 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-11-28 08:32:40,736 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-28 08:32:40,737 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-28 08:32:40,738 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-11-28 08:32:40,738 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -145,10 +143,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,428 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,428 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,429 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,430 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-11-28 08:32:40,748 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-28 08:32:40,749 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-28 08:32:40,750 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) 
from 6 documents (total 68 corpus positions)\n", + "2017-11-28 08:32:40,751 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -181,10 +179,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,446 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,446 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,447 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,447 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-11-28 08:32:40,761 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-28 08:32:40,761 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-28 08:32:40,762 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-11-28 08:32:40,763 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -221,10 +219,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,463 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,464 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,464 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,465 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-11-28 08:32:40,774 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-11-28 08:32:40,775 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-28 08:32:40,776 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-11-28 08:32:40,777 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -265,8 +263,8 @@ "neo\n", "humans body\n", "super\n", - "reality\n", - "hacker\n" + "hacker\n", + "reality\n" ] } ], @@ -290,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -299,9 +297,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,510 : INFO : Starting new HTTP connection (1): rare-technologies.com\n", - "2016-09-19 12:45:23,035 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:23,042 : INFO : built Dictionary(1093 unique tokens: ['realiti', 'keanu', 'miseri', 'vestig', 'massiv']...) from 416 documents (total 2985 corpus positions)\n" + "2017-11-28 08:32:41,320 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-28 08:32:41,326 : INFO : built Dictionary(1093 unique tokens: [u'code', u'squiddi', u'relai', u'dinosaur', u'electron']...) 
from 416 documents (total 2985 corpus positions)\n" ] }, { @@ -355,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -364,7 +361,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:25,227 : INFO : Starting new HTTP connection (1): rare-technologies.com\n" + "2017-11-28 08:32:43,682 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-11-28 08:32:43,687 : INFO : built Dictionary(1054 unique tokens: [u'fawn', u'windi', u'concept', u'doctor', u'gant']...) from 227 documents (total 2434 corpus positions)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "The answering machine records a woman introducing herself as Maude Lebowski and saying that she is the one who took his rug and has sent a car to pick Dude up at his apartment.\n", + "As he climbs out of bed to make a White Russian, Maude asks about the apartment and Dude explains that Treehorn's thugs most likely vandalized it looking for Lebowski's money.\n", + "\n", + "Keywords:\n", + "dude\n", + "dudes\n", + "walter\n", + "lebowski\n", + "brandt\n", + "maude\n", + "donny\n", + "bunny\n" ] } ], @@ -422,9 +439,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -434,14 +451,70 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/pete/gensim/gensim/summarization/mz_entropy.py:74: RuntimeWarning: divide by zero encountered in log2\n", + " logp = numpy.log2(p)\n", + "/home/pete/gensim/gensim/summarization/mz_entropy.py:75: RuntimeWarning: invalid value encountered in multiply\n", + " H = numpy.nan_to_num(p * logp).sum(axis=0)\n" + ] + }, + { + 
"data": { + "text/plain": [ + "[(u'lincoln', 0.0056009079527401728),\n", + " (u'i', 0.0048480807199453163),\n", + " (u'gutenberg', 0.0033118705607652456),\n", + " (u'you', 0.0033044241876850882),\n", + " (u'the', 0.003184223100952537),\n", + " (u'project', 0.0030400432599562814),\n", + " (u'v', 0.0029892072316233462),\n", + " (u's', 0.0027479946846166391),\n", + " (u'he', 0.0026405628272363011),\n", + " (u'iv', 0.0025895621076850355),\n", + " (u'ii', 0.0025019507619403148),\n", + " (u'by', 0.0022277723676676691),\n", + " (u'abraham', 0.0021168707666022494),\n", + " (u'or', 0.0020858843371172162),\n", + " (u'iii', 0.002071167621155823),\n", + " (u'tm', 0.0019565820396828327),\n", + " (u'was', 0.0018954215033062955),\n", + " (u'his', 0.0018126024538229718),\n", + " (u'work', 0.0017646814365061972),\n", + " (u'co', 0.0017416964820475558),\n", + " (u'case', 0.001661734006946057),\n", + " (u'new', 0.0016558607106467698),\n", + " (u'york', 0.0015861543846297651),\n", + " (u'court', 0.0014488333654852606),\n", + " (u'a', 0.0013369063978456374),\n", + " (u'it', 0.0013221654971075282),\n", + " (u'had', 0.0012652752682645698),\n", + " (u'on', 0.0012621040038518136),\n", + " (u'their', 0.0012449891448184512),\n", + " (u'herndon', 0.0012402952190743249),\n", + " (u'life', 0.00123104152062403),\n", + " (u'my', 0.0011741303053317792),\n", + " (u'_works_', 0.0010832651550141503),\n", + " (u'we', 0.0010768294653523067),\n", + " (u'money', 0.0010191083741917691),\n", + " (u'father', 0.0010168268194887184)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "text=requests.get(\"http://www.gutenberg.org/files/49679/49679-0.txt\").text\n", - "mz_keywords(text,scores=True,threshold=0.005)" + "mz_keywords(text,scores=True,threshold=0.001)" ] }, { @@ -453,11 +526,292 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + 
"outputs": [ + { + "data": { + "text/plain": [ + "[(u'gutenberg', 3.7766363961259684),\n", + " (u'tm', 3.6403066998316511),\n", + " (u'project', 3.5428530523255342),\n", + " (u'co', 3.2983688146004528),\n", + " (u'donations', 2.8613536046553563),\n", + " (u'electronic', 2.8210861922674084),\n", + " (u'access', 2.7810662866642568),\n", + " (u'refund', 2.7810662866642568),\n", + " (u'foundation', 2.7234464816769872),\n", + " (u'foxboro', 2.5477601487545121),\n", + " (u'gloves', 2.5281337853661761),\n", + " (u'e', 2.4036269322210768),\n", + " (u'york', 2.3692008259770594),\n", + " (u'edited', 2.361641829495754),\n", + " (u'_works_', 2.3445174072327686),\n", + " (u'works', 2.3426500474551113),\n", + " (u'dogskin', 2.3425994588269479),\n", + " (u'ragsdale', 2.2931552327841351),\n", + " (u'replacement', 2.2931552327841351),\n", + " (u'trunks', 2.2931552327841351),\n", + " (u'iv', 2.2510299269025058),\n", + " (u'iii', 2.2186807817292546),\n", + " (u'v', 2.2168420707754368),\n", + " (u'brokaw', 2.1699176369612583),\n", + " (u'coon', 2.1699176369612583),\n", + " (u'bonds', 2.1343080503770544),\n", + " (u'license', 2.1009287665795293),\n", + " (u'ii', 2.0892470886183649),\n", + " (u'agreement', 2.0779209847210556),\n", + " (u'almanac', 2.0060727272918055),\n", + " (u'_weekly_', 1.9794475925140163),\n", + " (u'bounded', 1.9794475925140163),\n", + " (u'format', 1.9794475925140163),\n", + " (u'millions', 1.9794475925140163),\n", + " (u'oxen', 1.9794475925140163),\n", + " (u'specie', 1.9794475925140163),\n", + " (u'archive', 1.9682995275030786),\n", + " (u'barrett', 1.9422319940872796),\n", + " (u'reminiscences', 1.9330537427622287),\n", + " (u'ebooks', 1.8984698469769548),\n", + " (u'forquer', 1.8843080503770544),\n", + " (u'parker', 1.8843080503770544),\n", + " (u'pglaf', 1.8843080503770544),\n", + " (u'ebook', 1.8838775575675983),\n", + " (u'trademark', 1.8838775575675983),\n", + " (u'paragraph', 1.8301079379685583),\n", + " (u'hardin', 1.7669683658081703),\n", + " (u'work', 
1.7328354724344326),\n", + " (u'rothschild', 1.7275730939964973),\n", + " (u'org', 1.7211393195188851),\n", + " (u'attitude', 1.716230650790012),\n", + " (u'london', 1.6791112857988695),\n", + " (u'boston', 1.6754810009833907),\n", + " (u'xvi', 1.66018729770736),\n", + " (u'news', 1.6601872977073597),\n", + " (u'biographical', 1.6294643147000225),\n", + " (u'green', 1.6254512602292723),\n", + " (u'delegates', 1.6127555612626692),\n", + " (u'medium', 1.6127555612626692),\n", + " (u'scripps', 1.6127555612626692),\n", + " (u'volunteers', 1.6127555612626692),\n", + " (u'lamon', 1.6001560607245646),\n", + " (u'tarbell', 1.5897346234235084),\n", + " (u'volumes', 1.5819481863246514),\n", + " (u'bank', 1.5744728128489647),\n", + " (u'copyright', 1.5731550611734115),\n", + " (u'_via_', 1.5722781569106761),\n", + " (u'admissibility', 1.5722781569106761),\n", + " (u'advertisers', 1.5722781569106761),\n", + " (u'applicable', 1.5722781569106761),\n", + " (u'attire', 1.5722781569106761),\n", + " (u'bags', 1.5722781569106761),\n", + " (u'berries', 1.5722781569106761),\n", + " (u'breeches', 1.5722781569106761),\n", + " (u'cline', 1.5722781569106761),\n", + " (u'continuance', 1.5722781569106761),\n", + " (u'currents', 1.5722781569106761),\n", + " (u'daguerreotype', 1.5722781569106761),\n", + " (u'disclaimer', 1.5722781569106761),\n", + " (u'email', 1.5722781569106761),\n", + " (u'enrolled', 1.5722781569106761),\n", + " (u'fool', 1.5722781569106761),\n", + " (u'guineas', 1.5722781569106761),\n", + " (u'hatchet', 1.5722781569106761),\n", + " (u'instruct', 1.5722781569106761),\n", + " (u'liability', 1.5722781569106761),\n", + " (u'lonny', 1.5722781569106761),\n", + " (u'paullin', 1.5722781569106761),\n", + " (u'performing', 1.5722781569106761),\n", + " (u'plow', 1.5722781569106761),\n", + " (u'polite', 1.5722781569106761),\n", + " (u'puffs', 1.5722781569106761),\n", + " (u'rulings', 1.5722781569106761),\n", + " (u'scammon', 1.5722781569106761),\n", + " (u'tilda', 
1.5722781569106761),\n", + " (u'wake', 1.5722781569106761),\n", + " (u'warranties', 1.5722781569106761),\n", + " (u'america', 1.5712271378967728),\n", + " (u'clair', 1.5712271378967728),\n", + " (u'displaying', 1.5712271378967728),\n", + " (u'forgery', 1.5712271378967728),\n", + " (u'holder', 1.5712271378967728),\n", + " (u'posted', 1.5712271378967728),\n", + " (u'sketches', 1.5712271378967728),\n", + " (u'snow', 1.5712271378967728),\n", + " (u'wore', 1.5712271378967728),\n", + " (u'http', 1.5645865830262038),\n", + " (u'journalism', 1.5399471126066209),\n", + " (u'copy', 1.5258495075146912),\n", + " (u'_early', 1.5202411939312348),\n", + " (u'armstrong', 1.5106440743450187),\n", + " (u'railroad', 1.4938165623572677),\n", + " (u'ross', 1.489097832809857),\n", + " (u'pair', 1.4791112857988695),\n", + " (u'banks', 1.4791112857988693),\n", + " (u'irelan', 1.4791112857988693),\n", + " (u'scott', 1.4791112857988693),\n", + " (u'browne', 1.4764336408243595),\n", + " (u'abraham', 1.4577679329151634),\n", + " (u'publication', 1.4490612388306794),\n", + " (u'provide', 1.4490612388306792),\n", + " (u'chiniquy', 1.4275140308616106),\n", + " (u'literary', 1.4150354420715021),\n", + " (u'rr', 1.4070491486733681),\n", + " (u'axe', 1.3967912341407889),\n", + " (u'fence', 1.3967912341407889),\n", + " (u'genuine', 1.3967912341407889),\n", + " (u'life_', 1.3941370904272503),\n", + " (u'she', 1.3923582867044937),\n", + " (u'copper', 1.3828069220574104),\n", + " (u'distributing', 1.3828069220574104),\n", + " (u'saddle', 1.3828069220574104),\n", + " (u'sons', 1.3828069220574104),\n", + " (u'_life_', 1.373910241709706),\n", + " (u'calhoun', 1.373910241709706),\n", + " (u'mother', 1.3728688332198922),\n", + " (u'college', 1.3697302821858961),\n", + " (u'nicolay', 1.3633245760231363),\n", + " (u'whitney', 1.3627575629840512),\n", + " (u'philadelphia', 1.3540886863558637),\n", + " (u'sarah', 1.3540886863558634),\n", + " (u'vi', 1.3540886863558634),\n", + " (u'harrison', 
1.3476159735283106),\n", + " (u'terms', 1.3426509824683515),\n", + " (u'herndon', 1.3421892681433798),\n", + " (u'improvement', 1.329344333012155),\n", + " (u'buckskin', 1.3222046383294666),\n", + " (u'sham', 1.3222046383294666),\n", + " (u'fee', 1.3158554460066139),\n", + " (u'generosity', 1.3144503596878891),\n", + " (u'moore', 1.3144503596878887),\n", + " (u'copies', 1.3127747798184011),\n", + " (u'p', 1.309088202039181),\n", + " (u'compliance', 1.2961309813666892),\n", + " (u'constable', 1.2961309813666892),\n", + " (u'currency', 1.2961309813666892),\n", + " (u'distribution', 1.2961309813666892),\n", + " (u'harvey', 1.2961309813666892),\n", + " (u'individual', 1.2961309813666892),\n", + " (u'revolutionary', 1.2961309813666892),\n", + " (u'brooks', 1.286562189794501),\n", + " (u'chicago', 1.2700186510810929),\n", + " (u'weems', 1.2659709073661847),\n", + " (u'february', 1.2574199029295277),\n", + " (u'information', 1.2487001310514776),\n", + " (u'bridge', 1.2326416539256813),\n", + " (u'resolution', 1.2268390166084573),\n", + " (u'stoddard', 1.2268390166084573),\n", + " (u'father', 1.2254034208363418),\n", + " (u'cartwright', 1.2157428532629155),\n", + " (u'houghton', 1.2157428532629155),\n", + " (u'publishing', 1.2157428532629155),\n", + " (u'describes', 1.2157428532629153),\n", + " (u'j', 1.2115310804189017),\n", + " (u'_stories_', 1.2049337080807629),\n", + " (u'september', 1.2030636155192291),\n", + " (u'boys', 1.1974364414369618),\n", + " (u'defendants', 1.1955861748361873),\n", + " (u'per', 1.1955861748361873),\n", + " (u'permission', 1.1955861748361873),\n", + " (u'uncle', 1.1955861748361873),\n", + " (u'thomas', 1.1924565577943991),\n", + " (u'trade', 1.1918333507609624),\n", + " (u'f', 1.1915163381561049),\n", + " (u'store', 1.189052998865439),\n", + " (u'notes', 1.1850922942502753),\n", + " (u'baker', 1.1828856976412236),\n", + " (u'baddeley', 1.1681694680548835),\n", + " (u'cogdal', 1.1681694680548835),\n", + " (u'copying', 1.1681694680548835),\n", + 
" (u'crafton', 1.1681694680548835),\n", + " (u'defect', 1.1681694680548835),\n", + " (u'donate', 1.1681694680548835),\n", + " (u'easier', 1.1681694680548835),\n", + " (u'editions', 1.1681694680548835),\n", + " (u'hawley', 1.1681694680548835),\n", + " (u'hitchcock', 1.1681694680548835),\n", + " (u'jake', 1.1681694680548835),\n", + " (u'jewelry', 1.1681694680548835),\n", + " (u'jurors', 1.1681694680548835),\n", + " (u'lightning', 1.1681694680548835),\n", + " (u'machine', 1.1681694680548835),\n", + " (u'paragraphs', 1.1681694680548835),\n", + " (u'pg', 1.1681694680548835),\n", + " (u'pork', 1.1681694680548835),\n", + " (u'retains', 1.1681694680548835),\n", + " (u'rod', 1.1681694680548835),\n", + " (u'securities', 1.1681694680548835),\n", + " (u'status', 1.1681694680548835),\n", + " (u'trousers', 1.1681694680548835),\n", + " (u'unpublished', 1.1681694680548835),\n", + " (u'berry', 1.1644932670010606),\n", + " (u'pp', 1.1608077284905565),\n", + " (u'hanks', 1.1587285139891437),\n", + " (u'mcclure', 1.1537352404836496),\n", + " (u'her', 1.1531891574151381),\n", + " (u'hamlin', 1.1529222466025137),\n", + " (u'speeches', 1.1437050469373577),\n", + " (u'kentucky', 1.1401563236722736),\n", + " (u'johnston', 1.1368073989967304),\n", + " (u'offutt', 1.1345503657246403),\n", + " (u'dress', 1.1343080503770544),\n", + " (u'german', 1.1343080503770544),\n", + " (u'matheney', 1.1343080503770544),\n", + " (u'company', 1.1298148326748745),\n", + " (u'g', 1.128517881924167),\n", + " (u'votes', 1.1187730676938106),\n", + " (u'nine', 1.113374076177045),\n", + " (u'charles', 1.1065580194728426),\n", + " (u'note', 1.0974655406391749),\n", + " (u'deed', 1.0970926363431248),\n", + " (u'east', 1.0970926363431248),\n", + " (u'spurious', 1.0970926363431248),\n", + " (u'atkinson', 1.0970926363431244),\n", + " (u'comply', 1.0970926363431244),\n", + " (u'jewelers', 1.0970926363431244),\n", + " (u'leland', 1.0970926363431244),\n", + " (u'priest', 1.0970926363431244),\n", + " (u'soldier', 
1.0970926363431244),\n", + " (u'd', 1.0936709970367389),\n", + " (u'tax', 1.0890978328098568),\n", + " (u'colonel', 1.0886122317272675),\n", + " (u'pitcher', 1.0886122317272675),\n", + " (u'spink', 1.0886122317272675),\n", + " (u'charter', 1.0886122317272673),\n", + " (u'clock', 1.0886122317272673),\n", + " (u'distribute', 1.0886122317272673),\n", + " (u'fisher', 1.0886122317272673),\n", + " (u'convention', 1.0842245322470756),\n", + " (u'plaintiff', 1.0813648643938589),\n", + " (u'island', 1.0791112857988696),\n", + " (u'voyage', 1.0772490318253176),\n", + " (u'you', 1.0716742799027257),\n", + " (u'road', 1.0587290524017576),\n", + " (u'holland', 1.05373524048365),\n", + " (u'trailor', 1.0479900750043671),\n", + " (u'limited', 1.0447190713617185),\n", + " (u'domain', 1.0399471126066209),\n", + " (u'grandfather', 1.0399471126066209),\n", + " (u'voted', 1.0399471126066209),\n", + " (u'agree', 1.0367857078081339),\n", + " (u'including', 1.0367857078081339),\n", + " (u'life', 1.0279778291629844),\n", + " (u'witness', 1.0249646422762066),\n", + " (u'james', 1.0153080476245506),\n", + " (u'stuart', 1.0149104889383316),\n", + " (u'dungee', 1.0102738780733427),\n", + " (u'john', 1.0074378828094916),\n", + " (u'surveyor', 1.0071083505332288),\n", + " (u'cross', 1.0008479040802145),\n", + " (u'dollars', 1.0002448365299736)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "mz_keywords(text,scores=True,weighted=False,threshold=1.0)" ] @@ -471,11 +825,464 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { - "collapsed": true + "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[(u'gutenberg', 3.7766363961259684),\n", + " (u'tm', 3.6403066998316511),\n", + " (u'project', 3.5428530523255342),\n", + " (u'co', 3.2983688146004528),\n", + " (u'donations', 2.8613536046553563),\n", + " (u'electronic', 2.8210861922674084),\n", + " (u'access', 
2.7810662866642568),\n", + " (u'refund', 2.7810662866642568),\n", + " (u'foundation', 2.7234464816769872),\n", + " (u'foxboro', 2.5477601487545121),\n", + " (u'gloves', 2.5281337853661761),\n", + " (u'e', 2.4036269322210768),\n", + " (u'york', 2.3692008259770594),\n", + " (u'edited', 2.361641829495754),\n", + " (u'_works_', 2.3445174072327686),\n", + " (u'works', 2.3426500474551113),\n", + " (u'dogskin', 2.3425994588269479),\n", + " (u'ragsdale', 2.2931552327841351),\n", + " (u'replacement', 2.2931552327841351),\n", + " (u'trunks', 2.2931552327841351),\n", + " (u'iv', 2.2510299269025058),\n", + " (u'iii', 2.2186807817292546),\n", + " (u'v', 2.2168420707754368),\n", + " (u'brokaw', 2.1699176369612583),\n", + " (u'coon', 2.1699176369612583),\n", + " (u'bonds', 2.1343080503770544),\n", + " (u'license', 2.1009287665795293),\n", + " (u'ii', 2.0892470886183649),\n", + " (u'agreement', 2.0779209847210556),\n", + " (u'almanac', 2.0060727272918055),\n", + " (u'_weekly_', 1.9794475925140163),\n", + " (u'bounded', 1.9794475925140163),\n", + " (u'format', 1.9794475925140163),\n", + " (u'millions', 1.9794475925140163),\n", + " (u'oxen', 1.9794475925140163),\n", + " (u'specie', 1.9794475925140163),\n", + " (u'archive', 1.9682995275030786),\n", + " (u'barrett', 1.9422319940872796),\n", + " (u'reminiscences', 1.9330537427622287),\n", + " (u'ebooks', 1.8984698469769548),\n", + " (u'forquer', 1.8843080503770544),\n", + " (u'parker', 1.8843080503770544),\n", + " (u'pglaf', 1.8843080503770544),\n", + " (u'ebook', 1.8838775575675983),\n", + " (u'trademark', 1.8838775575675983),\n", + " (u'paragraph', 1.8301079379685583),\n", + " (u'hardin', 1.7669683658081703),\n", + " (u'work', 1.7328354724344326),\n", + " (u'rothschild', 1.7275730939964973),\n", + " (u'org', 1.7211393195188851),\n", + " (u'attitude', 1.716230650790012),\n", + " (u'london', 1.6791112857988695),\n", + " (u'boston', 1.6754810009833907),\n", + " (u'xvi', 1.66018729770736),\n", + " (u'news', 1.6601872977073597),\n", + " 
(u'biographical', 1.6294643147000225),\n", + " (u'green', 1.6254512602292723),\n", + " (u'delegates', 1.6127555612626692),\n", + " (u'medium', 1.6127555612626692),\n", + " (u'scripps', 1.6127555612626692),\n", + " (u'volunteers', 1.6127555612626692),\n", + " (u'lamon', 1.6001560607245646),\n", + " (u'tarbell', 1.5897346234235084),\n", + " (u'volumes', 1.5819481863246514),\n", + " (u'bank', 1.5744728128489647),\n", + " (u'copyright', 1.5731550611734115),\n", + " (u'_via_', 1.5722781569106761),\n", + " (u'admissibility', 1.5722781569106761),\n", + " (u'advertisers', 1.5722781569106761),\n", + " (u'applicable', 1.5722781569106761),\n", + " (u'attire', 1.5722781569106761),\n", + " (u'bags', 1.5722781569106761),\n", + " (u'berries', 1.5722781569106761),\n", + " (u'breeches', 1.5722781569106761),\n", + " (u'cline', 1.5722781569106761),\n", + " (u'continuance', 1.5722781569106761),\n", + " (u'currents', 1.5722781569106761),\n", + " (u'daguerreotype', 1.5722781569106761),\n", + " (u'disclaimer', 1.5722781569106761),\n", + " (u'email', 1.5722781569106761),\n", + " (u'enrolled', 1.5722781569106761),\n", + " (u'fool', 1.5722781569106761),\n", + " (u'guineas', 1.5722781569106761),\n", + " (u'hatchet', 1.5722781569106761),\n", + " (u'instruct', 1.5722781569106761),\n", + " (u'liability', 1.5722781569106761),\n", + " (u'lonny', 1.5722781569106761),\n", + " (u'paullin', 1.5722781569106761),\n", + " (u'performing', 1.5722781569106761),\n", + " (u'plow', 1.5722781569106761),\n", + " (u'polite', 1.5722781569106761),\n", + " (u'puffs', 1.5722781569106761),\n", + " (u'rulings', 1.5722781569106761),\n", + " (u'scammon', 1.5722781569106761),\n", + " (u'tilda', 1.5722781569106761),\n", + " (u'wake', 1.5722781569106761),\n", + " (u'warranties', 1.5722781569106761),\n", + " (u'america', 1.5712271378967728),\n", + " (u'clair', 1.5712271378967728),\n", + " (u'displaying', 1.5712271378967728),\n", + " (u'forgery', 1.5712271378967728),\n", + " (u'holder', 1.5712271378967728),\n", + " 
(u'posted', 1.5712271378967728),\n", + " (u'sketches', 1.5712271378967728),\n", + " (u'snow', 1.5712271378967728),\n", + " (u'wore', 1.5712271378967728),\n", + " (u'http', 1.5645865830262038),\n", + " (u'journalism', 1.5399471126066209),\n", + " (u'copy', 1.5258495075146912),\n", + " (u'_early', 1.5202411939312348),\n", + " (u'armstrong', 1.5106440743450187),\n", + " (u'railroad', 1.4938165623572677),\n", + " (u'ross', 1.489097832809857),\n", + " (u'pair', 1.4791112857988695),\n", + " (u'banks', 1.4791112857988693),\n", + " (u'irelan', 1.4791112857988693),\n", + " (u'scott', 1.4791112857988693),\n", + " (u'browne', 1.4764336408243595),\n", + " (u'abraham', 1.4577679329151634),\n", + " (u'publication', 1.4490612388306794),\n", + " (u'provide', 1.4490612388306792),\n", + " (u'chiniquy', 1.4275140308616106),\n", + " (u'literary', 1.4150354420715021),\n", + " (u'rr', 1.4070491486733681),\n", + " (u'axe', 1.3967912341407889),\n", + " (u'fence', 1.3967912341407889),\n", + " (u'genuine', 1.3967912341407889),\n", + " (u'life_', 1.3941370904272503),\n", + " (u'she', 1.3923582867044937),\n", + " (u'copper', 1.3828069220574104),\n", + " (u'distributing', 1.3828069220574104),\n", + " (u'saddle', 1.3828069220574104),\n", + " (u'sons', 1.3828069220574104),\n", + " (u'_life_', 1.373910241709706),\n", + " (u'calhoun', 1.373910241709706),\n", + " (u'mother', 1.3728688332198922),\n", + " (u'college', 1.3697302821858961),\n", + " (u'nicolay', 1.3633245760231363),\n", + " (u'whitney', 1.3627575629840512),\n", + " (u'philadelphia', 1.3540886863558637),\n", + " (u'sarah', 1.3540886863558634),\n", + " (u'vi', 1.3540886863558634),\n", + " (u'harrison', 1.3476159735283106),\n", + " (u'terms', 1.3426509824683515),\n", + " (u'herndon', 1.3421892681433798),\n", + " (u'improvement', 1.329344333012155),\n", + " (u'buckskin', 1.3222046383294666),\n", + " (u'sham', 1.3222046383294666),\n", + " (u'fee', 1.3158554460066139),\n", + " (u'generosity', 1.3144503596878891),\n", + " (u'moore', 
1.3144503596878887),\n", + " (u'copies', 1.3127747798184011),\n", + " (u'p', 1.309088202039181),\n", + " (u'compliance', 1.2961309813666892),\n", + " (u'constable', 1.2961309813666892),\n", + " (u'currency', 1.2961309813666892),\n", + " (u'distribution', 1.2961309813666892),\n", + " (u'harvey', 1.2961309813666892),\n", + " (u'individual', 1.2961309813666892),\n", + " (u'revolutionary', 1.2961309813666892),\n", + " (u'brooks', 1.286562189794501),\n", + " (u'chicago', 1.2700186510810929),\n", + " (u'weems', 1.2659709073661847),\n", + " (u'february', 1.2574199029295277),\n", + " (u'information', 1.2487001310514776),\n", + " (u'bridge', 1.2326416539256813),\n", + " (u'resolution', 1.2268390166084573),\n", + " (u'stoddard', 1.2268390166084573),\n", + " (u'father', 1.2254034208363418),\n", + " (u'cartwright', 1.2157428532629155),\n", + " (u'houghton', 1.2157428532629155),\n", + " (u'publishing', 1.2157428532629155),\n", + " (u'describes', 1.2157428532629153),\n", + " (u'j', 1.2115310804189017),\n", + " (u'_stories_', 1.2049337080807629),\n", + " (u'september', 1.2030636155192291),\n", + " (u'boys', 1.1974364414369618),\n", + " (u'defendants', 1.1955861748361873),\n", + " (u'per', 1.1955861748361873),\n", + " (u'permission', 1.1955861748361873),\n", + " (u'uncle', 1.1955861748361873),\n", + " (u'thomas', 1.1924565577943991),\n", + " (u'trade', 1.1918333507609624),\n", + " (u'f', 1.1915163381561049),\n", + " (u'store', 1.189052998865439),\n", + " (u'notes', 1.1850922942502753),\n", + " (u'baker', 1.1828856976412236),\n", + " (u'baddeley', 1.1681694680548835),\n", + " (u'cogdal', 1.1681694680548835),\n", + " (u'copying', 1.1681694680548835),\n", + " (u'crafton', 1.1681694680548835),\n", + " (u'defect', 1.1681694680548835),\n", + " (u'donate', 1.1681694680548835),\n", + " (u'easier', 1.1681694680548835),\n", + " (u'editions', 1.1681694680548835),\n", + " (u'hawley', 1.1681694680548835),\n", + " (u'hitchcock', 1.1681694680548835),\n", + " (u'jake', 1.1681694680548835),\n", + 
" (u'jewelry', 1.1681694680548835),\n", + " (u'jurors', 1.1681694680548835),\n", + " (u'lightning', 1.1681694680548835),\n", + " (u'machine', 1.1681694680548835),\n", + " (u'paragraphs', 1.1681694680548835),\n", + " (u'pg', 1.1681694680548835),\n", + " (u'pork', 1.1681694680548835),\n", + " (u'retains', 1.1681694680548835),\n", + " (u'rod', 1.1681694680548835),\n", + " (u'securities', 1.1681694680548835),\n", + " (u'status', 1.1681694680548835),\n", + " (u'trousers', 1.1681694680548835),\n", + " (u'unpublished', 1.1681694680548835),\n", + " (u'berry', 1.1644932670010606),\n", + " (u'pp', 1.1608077284905565),\n", + " (u'hanks', 1.1587285139891437),\n", + " (u'mcclure', 1.1537352404836496),\n", + " (u'her', 1.1531891574151381),\n", + " (u'hamlin', 1.1529222466025137),\n", + " (u'speeches', 1.1437050469373577),\n", + " (u'kentucky', 1.1401563236722736),\n", + " (u'johnston', 1.1368073989967304),\n", + " (u'offutt', 1.1345503657246403),\n", + " (u'dress', 1.1343080503770544),\n", + " (u'german', 1.1343080503770544),\n", + " (u'matheney', 1.1343080503770544),\n", + " (u'company', 1.1298148326748745),\n", + " (u'g', 1.128517881924167),\n", + " (u'votes', 1.1187730676938106),\n", + " (u'nine', 1.113374076177045),\n", + " (u'charles', 1.1065580194728426),\n", + " (u'note', 1.0974655406391749),\n", + " (u'deed', 1.0970926363431248),\n", + " (u'east', 1.0970926363431248),\n", + " (u'spurious', 1.0970926363431248),\n", + " (u'atkinson', 1.0970926363431244),\n", + " (u'comply', 1.0970926363431244),\n", + " (u'jewelers', 1.0970926363431244),\n", + " (u'leland', 1.0970926363431244),\n", + " (u'priest', 1.0970926363431244),\n", + " (u'soldier', 1.0970926363431244),\n", + " (u'd', 1.0936709970367389),\n", + " (u'tax', 1.0890978328098568),\n", + " (u'colonel', 1.0886122317272675),\n", + " (u'pitcher', 1.0886122317272675),\n", + " (u'spink', 1.0886122317272675),\n", + " (u'charter', 1.0886122317272673),\n", + " (u'clock', 1.0886122317272673),\n", + " (u'distribute', 
1.0886122317272673),\n", + " (u'fisher', 1.0886122317272673),\n", + " (u'convention', 1.0842245322470756),\n", + " (u'plaintiff', 1.0813648643938589),\n", + " (u'island', 1.0791112857988696),\n", + " (u'voyage', 1.0772490318253176),\n", + " (u'you', 1.0716742799027257),\n", + " (u'road', 1.0587290524017576),\n", + " (u'holland', 1.05373524048365),\n", + " (u'trailor', 1.0479900750043671),\n", + " (u'limited', 1.0447190713617185),\n", + " (u'domain', 1.0399471126066209),\n", + " (u'grandfather', 1.0399471126066209),\n", + " (u'voted', 1.0399471126066209),\n", + " (u'agree', 1.0367857078081339),\n", + " (u'including', 1.0367857078081339),\n", + " (u'life', 1.0279778291629844),\n", + " (u'witness', 1.0249646422762066),\n", + " (u'james', 1.0153080476245506),\n", + " (u'stuart', 1.0149104889383316),\n", + " (u'dungee', 1.0102738780733427),\n", + " (u'john', 1.0074378828094916),\n", + " (u'surveyor', 1.0071083505332288),\n", + " (u'cross', 1.0008479040802145),\n", + " (u'dollars', 1.0002448365299736),\n", + " (u'president', 0.99828026284480487),\n", + " (u'_amount_', 0.99450922395310026),\n", + " (u'_black', 0.99450922395310026),\n", + " (u'_commercial', 0.99450922395310026),\n", + " (u'_magazine', 0.99450922395310026),\n", + " (u'_nicolay', 0.99450922395310026),\n", + " (u'_north', 0.99450922395310026),\n", + " (u'_sun_', 0.99450922395310026),\n", + " (u'accompanies', 0.99450922395310026),\n", + " (u'accordance', 0.99450922395310026),\n", + " (u'adjourning', 0.99450922395310026),\n", + " (u'advertiser', 0.99450922395310026),\n", + " (u'advertiser_', 0.99450922395310026),\n", + " (u'agnosticism', 0.99450922395310026),\n", + " (u'almanacs', 0.99450922395310026),\n", + " (u'animals', 0.99450922395310026),\n", + " (u'apparel', 0.99450922395310026),\n", + " (u'appoints', 0.99450922395310026),\n", + " (u'arbitrations', 0.99450922395310026),\n", + " (u'ascii', 0.99450922395310026),\n", + " (u'asks', 0.99450922395310026),\n", + " (u'aspirants', 0.99450922395310026),\n", + " 
(u'atrocious', 0.99450922395310026),\n", + " (u'attachment', 0.99450922395310026),\n", + " (u'authors', 0.99450922395310026),\n", + " (u'band', 0.99450922395310026),\n", + " (u'bargained', 0.99450922395310026),\n", + " (u'bets', 0.99450922395310026),\n", + " (u'bleeding', 0.99450922395310026),\n", + " (u'boats', 0.99450922395310026),\n", + " (u'book_', 0.99450922395310026),\n", + " (u'boss', 0.99450922395310026),\n", + " (u'bourgeois', 0.99450922395310026),\n", + " (u'bull', 0.99450922395310026),\n", + " (u'calf', 0.99450922395310026),\n", + " (u'chase', 0.99450922395310026),\n", + " (u'chicanery', 0.99450922395310026),\n", + " (u'coach', 0.99450922395310026),\n", + " (u'coins', 0.99450922395310026),\n", + " (u'comet', 0.99450922395310026),\n", + " (u'computer', 0.99450922395310026),\n", + " (u'computers', 0.99450922395310026),\n", + " (u'concentration', 0.99450922395310026),\n", + " (u'conquering', 0.99450922395310026),\n", + " (u'conservator', 0.99450922395310026),\n", + " (u'contentedly', 0.99450922395310026),\n", + " (u'copied', 0.99450922395310026),\n", + " (u'cord', 0.99450922395310026),\n", + " (u'cornell', 0.99450922395310026),\n", + " (u'countenance', 0.99450922395310026),\n", + " (u'counting', 0.99450922395310026),\n", + " (u'countryman', 0.99450922395310026),\n", + " (u'creeks', 0.99450922395310026),\n", + " (u'davy', 0.99450922395310026),\n", + " (u'deer', 0.99450922395310026),\n", + " (u'def', 0.99450922395310026),\n", + " (u'delegations', 0.99450922395310026),\n", + " (u'deliveries', 0.99450922395310026),\n", + " (u'demurrer', 0.99450922395310026),\n", + " (u'desires', 0.99450922395310026),\n", + " (u'detriment', 0.99450922395310026),\n", + " (u'directors', 0.99450922395310026),\n", + " (u'disallows', 0.99450922395310026),\n", + " (u'disgracing', 0.99450922395310026),\n", + " (u'doctoring', 0.99450922395310026),\n", + " (u'effectively', 0.99450922395310026),\n", + " (u'elections', 0.99450922395310026),\n", + " (u'electronically', 
0.99450922395310026),\n", + " (u'enrolling', 0.99450922395310026),\n", + " (u'exempt', 0.99450922395310026),\n", + " (u'faded', 0.99450922395310026),\n", + " (u'fares', 0.99450922395310026),\n", + " (u'ff', 0.99450922395310026),\n", + " (u'fights', 0.99450922395310026),\n", + " (u'flatboat', 0.99450922395310026),\n", + " (u'founded', 0.99450922395310026),\n", + " (u'generals', 0.99450922395310026),\n", + " (u'goose', 0.99450922395310026),\n", + " (u'greed', 0.99450922395310026),\n", + " (u'groomsman', 0.99450922395310026),\n", + " (u'hagerty', 0.99450922395310026),\n", + " (u'hans', 0.99450922395310026),\n", + " (u'harvard', 0.99450922395310026),\n", + " (u'haute', 0.99450922395310026),\n", + " (u'heel', 0.99450922395310026),\n", + " (u'history_', 0.99450922395310026),\n", + " (u'homeliest', 0.99450922395310026),\n", + " (u'howard', 0.99450922395310026),\n", + " (u'hut', 0.99450922395310026),\n", + " (u'ice', 0.99450922395310026),\n", + " (u'ida', 0.99450922395310026),\n", + " (u'identical', 0.99450922395310026),\n", + " (u'imperialist', 0.99450922395310026),\n", + " (u'independent', 0.99450922395310026),\n", + " (u'invalid', 0.99450922395310026),\n", + " (u'irons', 0.99450922395310026),\n", + " (u'janet', 0.99450922395310026),\n", + " (u'justification', 0.99450922395310026),\n", + " (u'lamborn', 0.99450922395310026),\n", + " (u'lambs', 0.99450922395310026),\n", + " (u'larceny', 0.99450922395310026),\n", + " (u'latin', 0.99450922395310026),\n", + " (u'linen', 0.99450922395310026),\n", + " (u'locations', 0.99450922395310026),\n", + " (u'louder', 0.99450922395310026),\n", + " (u'mad', 0.99450922395310026),\n", + " (u'magruder', 0.99450922395310026),\n", + " (u'maid', 0.99450922395310026),\n", + " (u'metaphysical', 0.99450922395310026),\n", + " (u'mit', 0.99450922395310026),\n", + " (u'monthlies', 0.99450922395310026),\n", + " (u'nest', 0.99450922395310026),\n", + " (u'nigger', 0.99450922395310026),\n", + " (u'package', 0.99450922395310026),\n", + " (u'pan', 
0.99450922395310026),\n", + " (u'parentage', 0.99450922395310026),\n", + " (u'partial', 0.99450922395310026),\n", + " (u'partly', 0.99450922395310026),\n", + " (u'passengers', 0.99450922395310026),\n", + " (u'pension', 0.99450922395310026),\n", + " (u'pl', 0.99450922395310026),\n", + " (u'playful', 0.99450922395310026),\n", + " (u'population', 0.99450922395310026),\n", + " (u'postponed', 0.99450922395310026),\n", + " (u'postponement', 0.99450922395310026),\n", + " (u'premise', 0.99450922395310026),\n", + " (u'pressure', 0.99450922395310026),\n", + " (u'presumption', 0.99450922395310026),\n", + " (u'preventing', 0.99450922395310026),\n", + " (u'quart', 0.99450922395310026),\n", + " (u'quincy', 0.99450922395310026),\n", + " (u'quorum', 0.99450922395310026),\n", + " (u'redistribution', 0.99450922395310026),\n", + " (u'rejoicing', 0.99450922395310026),\n", + " (u'remit', 0.99450922395310026),\n", + " (u'rifle', 0.99450922395310026),\n", + " (u'romance', 0.99450922395310026),\n", + " (u'rothschild_', 0.99450922395310026),\n", + " (u'row', 0.99450922395310026),\n", + " (u'rubbish', 0.99450922395310026),\n", + " (u'sacrifices', 0.99450922395310026),\n", + " (u'scroll', 0.99450922395310026),\n", + " (u'shade', 0.99450922395310026),\n", + " (u'shed', 0.99450922395310026),\n", + " (u'sigh', 0.99450922395310026),\n", + " (u'silk', 0.99450922395310026),\n", + " (u'sinewy', 0.99450922395310026),\n", + " (u'sock', 0.99450922395310026),\n", + " (u'solicit', 0.99450922395310026),\n", + " (u'solvent', 0.99450922395310026),\n", + " (u'sonny', 0.99450922395310026),\n", + " (u'startling', 0.99450922395310026),\n", + " (u'steals', 0.99450922395310026),\n", + " (u'steamer', 0.99450922395310026),\n", + " (u'stevenson', 0.99450922395310026),\n", + " (u'subp\\u0153naed', 0.99450922395310026),\n", + " (u'tanned', 0.99450922395310026),\n", + " (u'tea', 0.99450922395310026),\n", + " (u'terre', 0.99450922395310026),\n", + " (u'theosophy', 0.99450922395310026),\n", + " (u'tight', 
0.99450922395310026),\n", + " (u'tis', 0.99450922395310026),\n", + " (u'tour', 0.99450922395310026),\n", + " (u'vanilla', 0.99450922395310026),\n", + " (u'vol', 0.99450922395310026),\n", + " (u'warfare', 0.99450922395310026),\n", + " (u'warranty', 0.99450922395310026),\n", + " (u'wayne', 0.99450922395310026),\n", + " (u'whip', 0.99450922395310026),\n", + " (u'woodcut', 0.99450922395310026),\n", + " (u'wright', 0.99450922395310026),\n", + " (u'new', 0.99212250974463601)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "mz_keywords(text,scores=True,weighted=False,threshold=\"auto\")" ] diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index 6aa2ba04e6..d9607bd3c7 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -9,26 +9,22 @@ import numpy import scipy -def mz_keywords(text, - blocksize=1024, - scores=False, - split=False, - weighted=True, +def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0): """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_ Parameters ---------- - text: str + text: str document to summarize - blocksize: int, optional + blocksize: int, optional size of blocks to use in analysis, default is 1024 - scores: bool, optional + scores: bool, optional Whether to return score with keywords, default is False - split: bool, optional + split: bool, optional Whether to return results as list, default is False - weighted: bool, optional + weighted: bool, optional Whether to weight scores by word frequency. Default is True. 
False can useful for shorter texts, and allows automatic thresholding threshold: float or 'auto', optional @@ -44,7 +40,7 @@ def mz_keywords(text, list of keywords if scores is False OR results: list(tuple(str, float)) list of (keyword, score) tuples if scores is True - + Results are returned in descending order of score regardless of the format. Notes @@ -85,21 +81,23 @@ def mz_keywords(text, threshold = nblocks / (nblocks + 1.0) + 1.0e-8 weights = [(word, score) for (word, score) in zip(vocab, H) - if score>threshold] - weights.sort(key = lambda x:-x[1]) + if score > threshold] + weights.sort(key=lambda x: -x[1]) result = weights if scores else [word for (word, score) in weights] if not (scores or split): result = '\n'.join(result) return result + def __log_combinations_inner(n, m): """Calculates the logarithm of n!/m!(n-m)!""" - return -(numpy.log(n+1)+scipy.special.betaln(n-m+1,m+1)) + return -(numpy.log(n + 1)+scipy.special.betaln(n - m + 1, m + 1)) + __log_combinations=numpy.frompyfunc(__log_combinations_inner, 2, 1) def __marginal_prob(blocksize, nwords): - def marginal_prob(n,m): + def marginal_prob(n, m): """Marginal probability of a word that occurs n times in the document occurring m times in a given block""" return numpy.exp(__log_combinations(n, m) @@ -107,12 +105,13 @@ def marginal_prob(n,m): - __log_combinations(nwords, blocksize)) return numpy.frompyfunc(marginal_prob, 2, 1) + def __analytic_entropy(blocksize, nblocks, nwords): - marginal=__marginal_prob(blocksize, nwords) + marginal = __marginal_prob(blocksize, nwords) def analytic_entropy(n): """Predicted entropy for a word that occurs n times in the document""" m = numpy.arange(1, min(blocksize, n) + 1).astype('d') - p = m/n - elements = p * numpy.nan_to_num(numpy.log2(p), 0.0) * marginal(n,m) + p = m / n + elements = numpy.nan_to_num(p * numpy.log2(p)) * marginal(n, m) return -nblocks * elements.sum() - return numpy.frompyfunc(analytic_entropy,1,1) \ No newline at end of file + return 
numpy.frompyfunc(analytic_entropy, 1, 1) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 12bc3c8d56..24521efc3a 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -155,6 +155,8 @@ def test_mz_keywords(self): text = f.read() kwds = mz_keywords(text) + self.assertTrue(kwds.startswith('film')) + self.assertTrue(kwds.endswith('sought')) self.assertTrue(len(kwds.splitlines())) kwds_u = mz_keywords(utils.to_unicode(text)) @@ -162,8 +164,10 @@ def test_mz_keywords(self): kwds_lst = mz_keywords(text, split=True) self.assertTrue(len(kwds_lst)) + kwds_auto = mz_keywords(text, scores=True, weighted=False, + threshold='auto') + self.assertTrue(kwds_auto[-1][1] > 329.0 / 330.0) - def test_low_distinct_words_corpus_summarization_is_empty_list(self): text = self._get_text_from_test_data("testlowdistinctwords.txt") From d9c290a267fcc6d6d63f1460ea000e08de8d0fcc Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 28 Nov 2017 10:54:50 +0000 Subject: [PATCH 09/26] Further flake8 issues --- gensim/summarization/mz_entropy.py | 7 +++++-- gensim/test/test_summarization.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index d9607bd3c7..8455a672d9 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -9,6 +9,7 @@ import numpy import scipy + def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0): """Extract keywords from text using the Montemurro and Zanette entropy @@ -62,7 +63,7 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, text = to_unicode(text) words = [word for word in _tokenize_by_word(text)] vocab = sorted(set(words)) - wordcounts = numpy.array([[words[i:i+blocksize].count(word) + wordcounts = numpy.array([[words[i:i + blocksize].count(word) for word in vocab] for i in range(0, len(words), @@ -79,7 
+80,7 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, H *= totals / nwords if threshold == 'auto': threshold = nblocks / (nblocks + 1.0) + 1.0e-8 - weights = [(word, score) + weights = [(word, score) for (word, score) in zip(vocab, H) if score > threshold] weights.sort(key=lambda x: -x[1]) @@ -100,6 +101,7 @@ def __marginal_prob(blocksize, nwords): def marginal_prob(n, m): """Marginal probability of a word that occurs n times in the document occurring m times in a given block""" + return numpy.exp(__log_combinations(n, m) + __log_combinations(nwords - n, blocksize - m) - __log_combinations(nwords, blocksize)) @@ -108,6 +110,7 @@ def marginal_prob(n, m): def __analytic_entropy(blocksize, nblocks, nwords): marginal = __marginal_prob(blocksize, nwords) + def analytic_entropy(n): """Predicted entropy for a word that occurs n times in the document""" m = numpy.arange(1, min(blocksize, n) + 1).astype('d') diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 24521efc3a..60f97e4e22 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -147,7 +147,7 @@ def test_keywords_runs(self): kwds_lst = keywords(text, split=True) self.assertTrue(len(kwds_lst)) - + def test_mz_keywords(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') @@ -164,7 +164,7 @@ def test_mz_keywords(self): kwds_lst = mz_keywords(text, split=True) self.assertTrue(len(kwds_lst)) - kwds_auto = mz_keywords(text, scores=True, weighted=False, + kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto') self.assertTrue(kwds_auto[-1][1] > 329.0 / 330.0) From 8809e5ab5ed3e8ce39fe314e8968559d5f598edc Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 28 Nov 2017 11:38:41 +0000 Subject: [PATCH 10/26] Further flake8 issues --- gensim/summarization/mz_entropy.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/gensim/summarization/mz_entropy.py 
b/gensim/summarization/mz_entropy.py index 8455a672d9..05862683f7 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -46,14 +46,14 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, Notes ----- - This algorithm looks for keywords that contribute to the structure of the - text on scales of blocksize words of larger. It is suitable for extracting + This algorithm looks for keywords that contribute to the structure of the + text on scales of blocksize words of larger. It is suitable for extracting keywords representing the major themes of long texts. References ---------- [1] Marcello A Montemurro, Damian Zanette, - "Towards the quantification of the semantic information encoded in + "Towards the quantification of the semantic information encoded in written language" Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153 DOI: 10.1142/S0219525910002530 @@ -81,8 +81,8 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, if threshold == 'auto': threshold = nblocks / (nblocks + 1.0) + 1.0e-8 weights = [(word, score) - for (word, score) in zip(vocab, H) - if score > threshold] + for (word, score) in zip(vocab, H) + if score > threshold] weights.sort(key=lambda x: -x[1]) result = weights if scores else [word for (word, score) in weights] if not (scores or split): @@ -92,12 +92,14 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, def __log_combinations_inner(n, m): """Calculates the logarithm of n!/m!(n-m)!""" - return -(numpy.log(n + 1)+scipy.special.betaln(n - m + 1, m + 1)) + return -(numpy.log(n + 1) + scipy.special.betaln(n - m + 1, m + 1)) -__log_combinations=numpy.frompyfunc(__log_combinations_inner, 2, 1) +__log_combinations = numpy.frompyfunc(__log_combinations_inner, 2, 1) + def __marginal_prob(blocksize, nwords): + def marginal_prob(n, m): """Marginal probability of a word that occurs n times in the document occurring 
m times in a given block""" @@ -105,6 +107,7 @@ def marginal_prob(n, m): return numpy.exp(__log_combinations(n, m) + __log_combinations(nwords - n, blocksize - m) - __log_combinations(nwords, blocksize)) + return numpy.frompyfunc(marginal_prob, 2, 1) @@ -117,4 +120,5 @@ def analytic_entropy(n): p = m / n elements = numpy.nan_to_num(p * numpy.log2(p)) * marginal(n, m) return -nblocks * elements.sum() + return numpy.frompyfunc(analytic_entropy, 1, 1) From a97fd828fc5fd5a4040437d20cc429aba6390068 Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 28 Nov 2017 11:42:23 +0000 Subject: [PATCH 11/26] Removed Jupyter checkpoint --- .../summarization_tutorial-checkpoint.ipynb | 512 ------------------ 1 file changed, 512 deletions(-) delete mode 100644 docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb diff --git a/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb b/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb deleted file mode 100644 index 20fdd925b0..0000000000 --- a/docs/notebooks/.ipynb_checkpoints/summarization_tutorial-checkpoint.ipynb +++ /dev/null @@ -1,512 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "

Tutorial: automatic summarization using Gensim

\n", - "\n", - "This module automatically summarizes the given text, by extracting one or more important sentences from the text. In a similar way, it can also extract keywords. This tutorial will teach you to use this summarization module via some examples. First, we will try a small example, then we will try two larger ones, and then we will review the performance of the summarizer in terms of speed.\n", - "\n", - "This summarizer is based on the \"TextRank\" algorithm, from an [article](http://web.eecs.umich.edu/%7Emihalcea/papers/mihalcea.emnlp04.pdf) by Mihalcea et al. This algorithm was later improved upon by Barrios et al. in another [article](https://raw.githubusercontent.com/summanlp/docs/master/articulo/articulo-en.pdf), by introducing something called a \"BM25 ranking function\". \n", - "\n", - "This tutorial assumes that you are familiar with Python and have [installed Gensim](http://radimrehurek.com/gensim/install.html).\n", - "\n", - "Note: Gensim's summarization only works for English for now, because the text is pre-processed so that stopwords are removed and the words are stemmed, and these processes are language-dependent.\n", - "\n", - "\n", - "

Small example

\n", - "\n", - "First of all, we import the function \"summarize\"." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,358 : INFO : Pattern library is not installed, lemmatization won't be available.\n", - "2016-09-19 12:45:22,361 : INFO : Could not import Theano, will use standard float for default ShardedCorpus dtype.\n", - "2016-09-19 12:45:22,372 : INFO : 'pattern' package not found; tag filters are not available for English\n" - ] - } - ], - "source": [ - "import logging\n", - "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", - "\n", - "from gensim.summarization import summarize" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will try summarizing a small toy example; later we will use a larger piece of text. In reality, the text is too small, but it suffices as an illustrative example." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Input text:\n", - "Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. 
As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion. \n" - ] - } - ], - "source": [ - "text = \"Thomas A. Anderson is a man living two lives. By day he is an \" + \\\n", - " \"average computer programmer and by night a hacker known as \" + \\\n", - " \"Neo. Neo has always questioned his reality, but the truth is \" + \\\n", - " \"far beyond his imagination. Neo finds himself targeted by the \" + \\\n", - " \"police when he is contacted by Morpheus, a legendary computer \" + \\\n", - " \"hacker branded a terrorist by the government. Morpheus awakens \" + \\\n", - " \"Neo to the real world, a ravaged wasteland where most of \" + \\\n", - " \"humanity have been captured by a race of machines that live \" + \\\n", - " \"off of the humans' body heat and electrochemical energy and \" + \\\n", - " \"who imprison their minds within an artificial reality known as \" + \\\n", - " \"the Matrix. As a rebel against the machines, Neo must return to \" + \\\n", - " \"the Matrix and confront the agents: super-powerful computer \" + \\\n", - " \"programs devoted to snuffing out Neo and the entire human \" + \\\n", - " \"rebellion. \"\n", - "\n", - "print ('Input text:')\n", - "print (text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To summarize this text, we pass the raw string data as input to the function \"summarize\", and it will return a summary.\n", - "\n", - "Note: make sure that the string does not contain any newlines where the line breaks in a sentence. A sentence with a newline in it (i.e. a carriage return, \"\\n\") will be treated as two sentences." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,405 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,405 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,406 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,406 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" - ] - } - ], - "source": [ - "print ('Summary:')\n", - "print (summarize(text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the \"split\" option if you want a list of strings instead of a single string." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,428 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,428 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,429 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,430 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\"Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\"]\n" - ] - } - ], - "source": [ - "print (summarize(text, split=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can adjust how much text the summarizer outputs via the \"ratio\" parameter or the \"word_count\" parameter. Using the \"ratio\" parameter, you specify what fraction of sentences in the original text should be returned as output. Below we specify that we want 50% of the original text (the default is 20%)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,446 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,446 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,447 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,447 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "By day he is an average computer programmer and by night a hacker known as Neo. 
Neo has always questioned his reality, but the truth is far beyond his imagination.\n", - "Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government.\n", - "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" - ] - } - ], - "source": [ - "print ('Summary:')\n", - "print (summarize(text, ratio=0.5))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using the \"word_count\" parameter, we specify the maximum amount of words we want in the summary. Below we have specified that we want no more than 50 words." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,463 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,464 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,464 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) 
from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,465 : WARNING : Input corpus is expected to have at least 10 documents.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.\n" - ] - } - ], - "source": [ - "print ('Summary:')\n", - "print (summarize(text, word_count=50))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned earlier, this module also supports keyword extraction. Keyword extraction works in the same way as summary generation (i.e. sentence extraction), in that the algorithm tries to find words that are important or seem representative of the entire text. They keywords are not always single words; in the case of multi-word keywords, they are typically all nouns." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Keywords:\n", - "humanity\n", - "human\n", - "neo\n", - "humans body\n", - "super\n", - "reality\n", - "hacker\n" - ] - } - ], - "source": [ - "from gensim.summarization import keywords\n", - "\n", - "print ('Keywords:')\n", - "print (keywords(text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "

Larger example

\n", - "\n", - "Let us try an example with a larger piece of text. We will be using a synopsis of the movie \"The Matrix\", which we have taken from [this](http://www.imdb.com/title/tt0133093/synopsis?ref_=ttpl_pl_syn) IMDb page.\n", - "\n", - "In the code below, we read the text file directly from a web-page using \"requests\". Then we produce a summary and some keywords." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:22,510 : INFO : Starting new HTTP connection (1): rare-technologies.com\n", - "2016-09-19 12:45:23,035 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:23,042 : INFO : built Dictionary(1093 unique tokens: ['realiti', 'keanu', 'miseri', 'vestig', 'massiv']...) from 416 documents (total 2985 corpus positions)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Summary:\n", - "Anderson, a software engineer for a Metacortex, the other life as Neo, a computer hacker \"guilty of virtually every computer crime we have a law for.\" Agent Smith asks him to help them capture Morpheus, a dangerous terrorist, in exchange for amnesty.\n", - "Morpheus explains that he's been searching for Neo his entire life and asks if Neo feels like \"Alice in Wonderland, falling down the rabbit hole.\" He explains to Neo that they exist in the Matrix, a false reality that has been constructed for humans to hide the truth.\n", - "Neo is introduced to Morpheus's crew including Trinity; Apoc (Julian Arahanga), a man with long, flowing black hair; Switch; Cypher (bald with a goatee); two brawny brothers, Tank (Marcus Chong) and Dozer (Anthony Ray Parker); and a young, thin man named Mouse (Matt Doran).\n", - "Cypher cuts up a juicy steak and ruminates that he knows the steak is merely the simulation telling his brain that it is delicious and juicy, but after nine 
years he has discovered that \"ignorance is bliss.\" He strikes a deal for the machines to reinsert his body into a power plant, reinsert him into the Matrix, and he'll help the Agents.\n", - "\n", - "Keywords:\n", - "neo\n", - "morpheus\n", - "trinity\n", - "cypher\n", - "agents\n", - "agent\n", - "smith\n", - "tank\n", - "says\n", - "saying\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text\n", - "\n", - "print ('Summary:')\n", - "print (summarize(text, ratio=0.01))\n", - "\n", - "print ('\\nKeywords:')\n", - "print (keywords(text, ratio=0.01))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you know this movie, you see that this summary is actually quite good. We also see that some of the most important characters (Neo, Morpheus, Trinity) were extracted as keywords.\n", - "\n", - "

Another example

\n", - "\n", - "Let's try an example similar to the one above. This time, we will use the [IMDb synopsis](http://www.imdb.com/title/tt0118715/synopsis?ref_=tt_stry_pl) of \"The Big Lebowski\".\n", - "\n", - "Again, we download the text and produce a summary and some keywords." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2016-09-19 12:45:25,227 : INFO : Starting new HTTP connection (1): rare-technologies.com\n" - ] - } - ], - "source": [ - "import requests\n", - "\n", - "text = requests.get('http://rare-technologies.com/the_big_lebowski_synopsis.txt').text\n", - "\n", - "print ('Summary:')\n", - "print (summarize(text, ratio=0.01))\n", - "\n", - "print ('\\nKeywords:')\n", - "print (keywords(text, ratio=0.01))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This time around, the summary is not of high quality, as it does not tell us much about the movie. In a way, this might not be the algorithms fault, rather this text simply doesn't contain one or two sentences that capture the essence of the text as in \"The Matrix\" synopsis.\n", - "\n", - "The keywords, however, managed to find some of the main characters.\n", - "\n", - "

Performance

\n", - "\n", - "We will test how the speed of the summarizer scales with the size of the dataset. These tests were run on an Intel Core i5 4210U CPU @ 1.70 GHz x 4 processor. Note that the summarizer does not support multithreading (parallel processing).\n", - "\n", - "The tests were run on the book \"Honest Abe\" by Alonzo Rothschild. Download the book in plain-text here. \n", - "\n", - "In the plot below, we see the running times together with the sizes of the datasets. To create datasets of different sizes, we have simply taken prefixes of text; in other words we take the first n characters of the book. The algorithm seems to be quadratic in time, so one needs to be careful before plugging a large dataset into the summarizer.\n", - "\n", - "
\n", - "\n", - "
\n", - "
\n", - "\n", - "

Text-content dependent running times

\n", - "\n", - "The running time is not only dependent on the size of the dataset. For example, summarizing \"The Matrix\" synopsis (about 36,000 characters) takes about 3.1 seconds, while summarizing 35,000 characters of this book takes about 8.5 seconds. So the former is more than twice as fast. \n", - "\n", - "One reason for this difference in running times is the data structure that is used. The algorithm represents the data using a graph, where vertices (nodes) are sentences, and then constructs weighted edges between the vertices that represent how the sentences relate to each other. This means that every piece of text will have a different graph, thus making the running times different. The size of this data structure is quadratic in the worst case (the worst case is when each vertex has an edge to every other vertex).\n", - "\n", - "Another possible reason for the difference in running times is that the problems converge at different rates, meaning that the error drops slower for some datasets than for others.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Montemurro and Zanette's entropy based keyword extraction algorithm\n", - "\n", - "[This paper](https://arxiv.org/abs/0907.1558) describes a technique to identify words that play a significant role in the large-scale structure of a text. These typically correspond to the major themes of the text. The text is divided into blocks of ~1000 words, and the entropy of each word's distribution amongst the blocks is\n", - "caclulated and compared with the expected entropy if the word were distributed randomly." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import requests\n", - "from gensim.summarization import mz_keywords" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "text=requests.get(\"http://www.gutenberg.org/files/49679/49679-0.txt\").text\n", - "mz_keywords(text,scores=True,threshold=0.005)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, the algorithm weights the entropy by the overall frequency of the word in the document. We can remove this weighting by setting weighted=False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "mz_keywords(text,scores=True,weighted=False,threshold=1.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When this option is used, it is possible to calculate a threshold automatically from the number of blocks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "mz_keywords(text,scores=True,weighted=False,threshold=\"auto\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The complexity of the algorithm is **O**(*Nw*), where *N* is the number of words in the document and *w* is the number of unique words." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 0d4e31c10c605c6de9649798fe9df0901648ba19 Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 28 Nov 2017 13:32:18 +0000 Subject: [PATCH 12/26] Removed trailing whitespace --- gensim/summarization/mz_entropy.py | 2 +- gensim/test/test_summarization.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index 05862683f7..026a1ab21a 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -120,5 +120,5 @@ def analytic_entropy(n): p = m / n elements = numpy.nan_to_num(p * numpy.log2(p)) * marginal(n, m) return -nblocks * elements.sum() - + return numpy.frompyfunc(analytic_entropy, 1, 1) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 60f97e4e22..b1a4386091 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -167,7 +167,7 @@ def test_mz_keywords(self): kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto') self.assertTrue(kwds_auto[-1][1] > 329.0 / 330.0) - + def test_low_distinct_words_corpus_summarization_is_empty_list(self): text = self._get_text_from_test_data("testlowdistinctwords.txt") From 4d18223c3da6e078af3b19df0f604e5cbf3969e5 Mon Sep 17 00:00:00 2001 From: Pete Date: Tue, 28 Nov 2017 14:26:29 +0000 Subject: [PATCH 13/26] Trailing whitespace --- gensim/summarization/mz_entropy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py 
index 026a1ab21a..2d18ad82f9 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -12,7 +12,7 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0): - """Extract keywords from text using the Montemurro and Zanette entropy + """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_ Parameters From dc42cee1fb8cf991830123ff91a6cd97d076d35c Mon Sep 17 00:00:00 2001 From: Pete Date: Wed, 29 Nov 2017 09:10:35 +0000 Subject: [PATCH 14/26] Speed up test and add comment to explain threshold value --- gensim/test/test_summarization.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index b1a4386091..10a38c1ad0 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -152,21 +152,20 @@ def test_mz_keywords(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f: - text = f.read() - + text = utils.to_unicode(f.read()) + text=u' '.join(text.split()[:10240]) kwds = mz_keywords(text) - self.assertTrue(kwds.startswith('film')) - self.assertTrue(kwds.endswith('sought')) + self.assertTrue(kwds.startswith('autism')) + self.assertTrue(kwds.endswith('uk')) self.assertTrue(len(kwds.splitlines())) - kwds_u = mz_keywords(utils.to_unicode(text)) - self.assertTrue(len(kwds_u.splitlines())) - kwds_lst = mz_keywords(text, split=True) self.assertTrue(len(kwds_lst)) + # Automatic thresholding selects words with nblocks/nblocks+1 + # bits of entropy. 
For this text, nblocks=10 kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto') - self.assertTrue(kwds_auto[-1][1] > 329.0 / 330.0) + self.assertTrue(kwds_auto[-1][1] > 10.0 / 11.0) def test_low_distinct_words_corpus_summarization_is_empty_list(self): text = self._get_text_from_test_data("testlowdistinctwords.txt") From fdddf02ced5280ae73e97d5e0c7ec412be4a202d Mon Sep 17 00:00:00 2001 From: Pete Date: Wed, 29 Nov 2017 09:35:46 +0000 Subject: [PATCH 15/26] Flake8 again --- gensim/test/test_summarization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 10a38c1ad0..752f71d9fb 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -153,7 +153,7 @@ def test_mz_keywords(self): with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f: text = utils.to_unicode(f.read()) - text=u' '.join(text.split()[:10240]) + text = u' '.join(text.split()[:10240]) kwds = mz_keywords(text) self.assertTrue(kwds.startswith('autism')) self.assertTrue(kwds.endswith('uk')) From 86db65cd561fbd9c031f8b665bfd1a25c5fea527 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 30 Nov 2017 11:42:36 +0500 Subject: [PATCH 16/26] rename vars + style fixes --- gensim/summarization/mz_entropy.py | 62 +++++++++++++++--------------- gensim/test/test_summarization.py | 10 ++--- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/gensim/summarization/mz_entropy.py b/gensim/summarization/mz_entropy.py index 2d18ad82f9..b9c5c02f33 100644 --- a/gensim/summarization/mz_entropy.py +++ b/gensim/summarization/mz_entropy.py @@ -10,8 +10,7 @@ import scipy -def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, - threshold=0.0): +def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0): """Extract keywords from text using the Montemurro and Zanette entropy algorithm. 
[1]_ @@ -30,17 +29,17 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, False can useful for shorter texts, and allows automatic thresholding threshold: float or 'auto', optional minimum score for returned keywords, default 0.0 - 'auto' calculates the threshold as nblocks / (nblocks + 1.0) + 1.0e-8 + 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1.0e-8 Use 'auto' with weighted=False) Returns ------- results: str - newline separated keywords if split is False OR + newline separated keywords if `split` == False OR results: list(str) - list of keywords if scores is False OR + list of keywords if `scores` == False OR results: list(tuple(str, float)) - list of (keyword, score) tuples if scores is True + list of (keyword, score) tuples if `scores` == True Results are returned in descending order of score regardless of the format. @@ -63,26 +62,25 @@ def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, text = to_unicode(text) words = [word for word in _tokenize_by_word(text)] vocab = sorted(set(words)) - wordcounts = numpy.array([[words[i:i + blocksize].count(word) - for word in vocab] - for i in range(0, - len(words), - blocksize)]).astype('d') - nblocks = wordcounts.shape[0] - totals = wordcounts.sum(axis=0) - nwords = totals.sum() - p = wordcounts / totals - logp = numpy.log2(p) - H = numpy.nan_to_num(p * logp).sum(axis=0) - analytic = __analytic_entropy(blocksize, nblocks, nwords) - H += analytic(totals).astype('d') + word_counts = numpy.array( + [ + [words[i:i + blocksize].count(word) for word in vocab] + for i in range(0, len(words), blocksize) + ] + ).astype('d') + n_blocks = word_counts.shape[0] + totals = word_counts.sum(axis=0) + n_words = totals.sum() + p = word_counts / totals + log_p = numpy.log2(p) + h = numpy.nan_to_num(p * log_p).sum(axis=0) + analytic = __analytic_entropy(blocksize, n_blocks, n_words) + h += analytic(totals).astype('d') if weighted: - H *= totals / nwords + h 
*= totals / n_words if threshold == 'auto': - threshold = nblocks / (nblocks + 1.0) + 1.0e-8 - weights = [(word, score) - for (word, score) in zip(vocab, H) - if score > threshold] + threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8 + weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold] weights.sort(key=lambda x: -x[1]) result = weights if scores else [word for (word, score) in weights] if not (scores or split): @@ -98,27 +96,29 @@ def __log_combinations_inner(n, m): __log_combinations = numpy.frompyfunc(__log_combinations_inner, 2, 1) -def __marginal_prob(blocksize, nwords): +def __marginal_prob(blocksize, n_words): def marginal_prob(n, m): """Marginal probability of a word that occurs n times in the document occurring m times in a given block""" - return numpy.exp(__log_combinations(n, m) - + __log_combinations(nwords - n, blocksize - m) - - __log_combinations(nwords, blocksize)) + return numpy.exp( + __log_combinations(n, m) + + __log_combinations(n_words - n, blocksize - m) - + __log_combinations(n_words, blocksize) + ) return numpy.frompyfunc(marginal_prob, 2, 1) -def __analytic_entropy(blocksize, nblocks, nwords): - marginal = __marginal_prob(blocksize, nwords) +def __analytic_entropy(blocksize, n_blocks, n_words): + marginal = __marginal_prob(blocksize, n_words) def analytic_entropy(n): """Predicted entropy for a word that occurs n times in the document""" m = numpy.arange(1, min(blocksize, n) + 1).astype('d') p = m / n elements = numpy.nan_to_num(p * numpy.log2(p)) * marginal(n, m) - return -nblocks * elements.sum() + return -n_blocks * elements.sum() return numpy.frompyfunc(analytic_entropy, 1, 1) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 752f71d9fb..e72b99cb1c 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -161,11 +161,11 @@ def test_mz_keywords(self): kwds_lst = mz_keywords(text, split=True) self.assertTrue(len(kwds_lst)) - # Automatic 
thresholding selects words with nblocks/nblocks+1 - # bits of entropy. For this text, nblocks=10 - kwds_auto = mz_keywords(text, scores=True, weighted=False, - threshold='auto') - self.assertTrue(kwds_auto[-1][1] > 10.0 / 11.0) + # Automatic thresholding selects words with n_blocks / n_blocks+1 + # bits of entropy. For this text, n_blocks=10 + n_blocks = 10. + kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto') + self.assertTrue(kwds_auto[-1][1] > (n_blocks / n_blocks + 1.)) def test_low_distinct_words_corpus_summarization_is_empty_list(self): text = self._get_text_from_test_data("testlowdistinctwords.txt") From 28ae7cb2e9cc75167d1aadd24fa77b0d8e46cec6 Mon Sep 17 00:00:00 2001 From: Menshikh Ivan Date: Thu, 30 Nov 2017 12:34:08 +0500 Subject: [PATCH 17/26] fix operation order --- gensim/test/test_summarization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index e72b99cb1c..45c0bce8ee 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -165,7 +165,7 @@ def test_mz_keywords(self): # bits of entropy. For this text, n_blocks=10 n_blocks = 10. 
kwds_auto = mz_keywords(text, scores=True, weighted=False, threshold='auto') - self.assertTrue(kwds_auto[-1][1] > (n_blocks / n_blocks + 1.)) + self.assertTrue(kwds_auto[-1][1] > (n_blocks / (n_blocks + 1.))) def test_low_distinct_words_corpus_summarization_is_empty_list(self): text = self._get_text_from_test_data("testlowdistinctwords.txt") From 6add3bac515d9e27c0d0bb9e251b980c185c8951 Mon Sep 17 00:00:00 2001 From: Pete Date: Wed, 3 Jan 2018 21:47:04 +0000 Subject: [PATCH 18/26] Update docs with Montemurro and Zanette's algorithm --- docs/src/apiref.rst | 1 + docs/src/summarization/mz_entropy.rst | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 docs/src/summarization/mz_entropy.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index b781ef16d6..d88db20bdf 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -87,6 +87,7 @@ Modules: summarization/commons summarization/graph summarization/keywords + summarization/mz_entropy summarization/pagerank_weighted summarization/summariser summarization/syntactic_unit diff --git a/docs/src/summarization/mz_entropy.rst b/docs/src/summarization/mz_entropy.rst new file mode 100644 index 0000000000..831ea11088 --- /dev/null +++ b/docs/src/summarization/mz_entropy.rst @@ -0,0 +1,9 @@ +:mod:`summarization.mz_entropy` -- Keywords from the Montemurro and Zanette algorithm +====================================================================================== + +.. automodule:: gensim.summarization.mz_entropy + :synopsis: Keywords from the Monemurro and Zanette algorithm + :members: + :inherited-members: + :undoc-members: + :show-inheritance: From 86590bb547021a8745fc6dd4df4c851de0186330 Mon Sep 17 00:00:00 2001 From: Pete Date: Thu, 4 Jan 2018 20:09:56 +0000 Subject: [PATCH 19/26] Revert "Update docs with Montemurro and Zanette's algorithm" This reverts commit 6add3bac515d9e27c0d0bb9e251b980c185c8951. 
--- docs/src/apiref.rst | 1 - docs/src/summarization/mz_entropy.rst | 9 --------- 2 files changed, 10 deletions(-) delete mode 100644 docs/src/summarization/mz_entropy.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index d88db20bdf..b781ef16d6 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -87,7 +87,6 @@ Modules: summarization/commons summarization/graph summarization/keywords - summarization/mz_entropy summarization/pagerank_weighted summarization/summariser summarization/syntactic_unit diff --git a/docs/src/summarization/mz_entropy.rst b/docs/src/summarization/mz_entropy.rst deleted file mode 100644 index 831ea11088..0000000000 --- a/docs/src/summarization/mz_entropy.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.mz_entropy` -- Keywords from the Montemurro and Zanette algorithm -====================================================================================== - -.. automodule:: gensim.summarization.mz_entropy - :synopsis: Keywords from the Monemurro and Zanette algorithm - :members: - :inherited-members: - :undoc-members: - :show-inheritance: From bdc1a6d3602f363903b111b7860b138d07afb587 Mon Sep 17 00:00:00 2001 From: Pete Date: Fri, 6 Apr 2018 11:48:04 +0100 Subject: [PATCH 20/26] Fixed bug in TfidfModel, as described in Issue #2020 --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a9e12c995a..b6e59c3d12 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -167,7 +167,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): """ if n_df == "n": - return utils.identity(docfreq) + return 1 elif n_df == "t": return np.log(1.0 * totaldocs / docfreq) / np.log(2) elif n_df == "p": From 590b52a3746e836ce9d8f9123d27d77c0b17d435 Mon Sep 17 00:00:00 2001 From: Menshikh Ivan Date: Fri, 6 Apr 2018 15:56:13 +0500 Subject: [PATCH 21/26] Fix return type --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index b6e59c3d12..2e49427ce6 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -167,7 +167,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): """ if n_df == "n": - return 1 + return 1. elif n_df == "t": return np.log(1.0 * totaldocs / docfreq) / np.log(2) elif n_df == "p": From 3b64a780094a0a0209eda48d0bd99f52873ec54d Mon Sep 17 00:00:00 2001 From: Pete Date: Fri, 6 Apr 2018 13:31:02 +0100 Subject: [PATCH 22/26] Updated unit tests for TfidfModel --- gensim/test/test_tfidfmodel.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index b15e892c9c..189f998a57 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -182,10 +182,7 @@ def test_consistency(self): # nnn model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [ - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 6), (9, 3), (10, 3)] - ] + expected_docs = docs[:] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -194,8 +191,8 @@ def test_consistency(self): model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], - [(5, 6.0), (9, 3.0), (10, 3.0)] + [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)], + [(5, 2.0), (9, 1.0), (10, 1.0)] ] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) @@ -205,8 +202,8 @@ def test_consistency(self): model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], 
- [(5, 3.0), (9, 2.25), (10, 2.25)] + [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)], + [(5, 1.0), (9, 0.75), (10, 0.75)] ] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) @@ -216,8 +213,8 @@ def test_consistency(self): model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 3), (9, 3), (10, 3)] + [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], + [(5, 1), (9, 1), (10, 1)] ] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) @@ -228,8 +225,8 @@ def test_consistency(self): transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ - (3, 1.4635792826230198), (4, 1.4635792826230198), (5, 2.19536892393453), (6, 1.4635792826230198), - (7, 2.19536892393453), (8, 1.4635792826230198) + (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), + (7, 1.0), (8, 1.0) ], [ (5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055) @@ -249,7 +246,7 @@ def test_consistency(self): (7, 1.5849625007211563), (8, 2.1699250014423126) ], [ - (5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563) + (5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526) ] ] @@ -278,8 +275,8 @@ def test_consistency(self): transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ - (3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), - (7, 0.51449575542752646), (8, 0.34299717028501764) + (3, 0.4082482905), (4, 0.4082482905), (5, 0.4082482905), (6, 0.4082482905), + (7, 0.4082482905), (8, 0.4082482905) ], [ (5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302) From 5f9aa9389e51cb9d0558e168ccce19160fdd7305 Mon Sep 17 00:00:00 2001 From: Pete Date: Fri, 6 Apr 2018 14:28:44 +0100 Subject: [PATCH 23/26] Updated unit tests for TfidfModel --- gensim/test/test_tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 189f998a57..9876c00a57 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -229,7 +229,7 @@ def test_consistency(self): (7, 1.0), (8, 1.0) ], [ - (5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055) + (5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526) ] ] From fdab1e81245de46b972fbd1afc5aeaff7ced05d9 Mon Sep 17 00:00:00 2001 From: Pete Date: Sat, 7 Apr 2018 22:43:24 +0100 Subject: [PATCH 24/26] Changed log(x)/log(2) to log2(x) since this is clearer. Fixed the placement of a parenthesis. Updated predicted values for unit tests --- gensim/corpora/dictionary.py | 1 - gensim/models/tfidfmodel.py | 12 +++++--- gensim/test/test_data/tfidf_model.tst | Bin 458 -> 1261 bytes gensim/test/test_tfidfmodel.py | 40 ++++++++++++++------------ 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index a736849b4e..9a9edf24ed 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -245,7 +245,6 @@ def doc2bow(self, document, allow_update=False, return_missing=False): # new id = number of ids made so far; # NOTE this assumes there are no gaps in the id sequence! token2id[w] = len(token2id) - result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id} if allow_update: diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 2e49427ce6..7e93e7c859 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -118,6 +118,7 @@ def precompute_idfs(wglobal, dfs, total_docs): # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. 
return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} + def updated_wlocal(tf, n_tf): @@ -139,13 +140,13 @@ def updated_wlocal(tf, n_tf): if n_tf == "n": return tf elif n_tf == "l": - return 1 + np.log(tf) / np.log(2) + return 1 + np.log2(tf) elif n_tf == "a": return 0.5 + (0.5 * tf / tf.max(axis=0)) elif n_tf == "b": return tf.astype('bool').astype('int') elif n_tf == "L": - return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) + return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0))) def updated_wglobal(docfreq, totaldocs, n_df): @@ -166,12 +167,13 @@ def updated_wglobal(docfreq, totaldocs, n_df): Calculated wglobal. """ + if n_df == "n": return 1. elif n_df == "t": - return np.log(1.0 * totaldocs / docfreq) / np.log(2) + return np.log2(1.0 * totaldocs / docfreq) elif n_df == "p": - return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) + return max(0,np.log2((1.0 * totaldocs - docfreq) / docfreq)) def updated_normalize(x, n_n, return_norm=False): @@ -295,6 +297,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None + self.smartirs = smartirs self.slope = slope self.pivot = pivot @@ -371,6 +374,7 @@ def initialize(self, corpus): numnnz += len(bow) for termid, _ in bow: dfs[termid] = dfs.get(termid, 0) + 1 + # keep some stats about the training corpus self.num_docs = docno + 1 diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst index e9e5f3f3cff5372e7e5ce18a89efe58321c84e4f..73f6cf2be5ade9e0542b41b34898abf42fcece50 100644 GIT binary patch literal 1261 zcma)+*;3R%6ozMCGPr<(8}8enG74^}Ah@EmjM}0`WsI35Gl^!B`O_IFsG2IhAU=ZS zl@~sR7vA8lzJ*u5gmyQSWtEjGlInEl^q2E@PMve!ri`ZLF{f2&`KINvO4xABhFmLi zVmT|!3KB!_knAE=R?~?@!fG?5rzh~?8`VE zA!jr?o)P-K%apbrgpSTV=HrDsZBq{|Q~T>dL16_&3Z-lphPo)pLwK5p9yQ683~XZ6 
z5S#02p1(Qf=%OE(*iuz9^0mRRwTs>|Nf`2s6~hm#N~`1Yg9L%Tq%cEw^*}-2+P-3j zOKnTRHZ{e+Z;W9(Ns#1w^ivF!Nm@#dV@I`Go%zi#`5=i9JL`sK@NgMSu{^O*uuCm* z7xwDc)laLRlk>?b>{fH!;_9K}jbKlNL5jT*_EGF7NxRP;u>b$$fX%ss6o(=lrZ_@U zG2BtX9iuoNp-eGE(lOi#!JVWy6=9g7LNYPjX~CVL7>O`Sah7D)xt@Q2>D!k{oD<-A ziVG1gQjC#YjKfQUyG(HE zM~06qU*-6m=W~J2MLsKhlt`+hW_e88&VnD}Syk2(&s5Fy4Tk5WDARhe>H2lu#S41_ zm9q{jm|sDS;$_*M=XImWg77r6fmbq%L+GM#>UF8lT)%C>n4bLkU0gq8rY^cZuMU_S zQkAR%YXuF8X4$s5sJ$*4HsDB653hyjo4VT1e_GQ5%jg8mS+F#wFIcXxn+$FZEme`* wPLsC*i`%>!d>I`aZ$wrayOkv;=Eg~@R;mOWZ^6KG@L+6Q zBft}~j_&89W6AeChs{cMR<&hogw(bimP#zYnLIeWb2fqC`US#0L{pY`rf+Mlb`s)O zmT}&V?bFu6q+&7VEhKwLr}SWc8$1T@Zy$;V9z&0j$Jpa!e)IYBEb(sY-ORhWcZtX1 zFEFwU|Hg%BEMzoisk?RE4sv?ME;;XIGxk=kCAaljYEugfq+*0~wZbJOUW Date: Sun, 8 Apr 2018 20:40:30 +0100 Subject: [PATCH 25/26] Fixed persistence tests --- gensim/models/tfidfmodel.py | 11 ++--------- gensim/test/test_data/tfidf_model.tst | Bin 1261 -> 1261 bytes gensim/test/test_data/tfidf_model.tst.bz2 | Bin 338 -> 822 bytes gensim/test/test_tfidfmodel.py | 17 ++++++++++++----- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 7e93e7c859..d6c7a2efaf 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -96,7 +96,6 @@ def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): """ return add + np.log(float(totaldocs) / docfreq) / np.log(log_base) - def precompute_idfs(wglobal, dfs, total_docs): """Pre-compute the inverse document frequency mapping for all terms. @@ -118,8 +117,6 @@ def precompute_idfs(wglobal, dfs, total_docs): # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} - - def updated_wlocal(tf, n_tf): """A scheme to transform `tf` or term frequency based on the value of `n_tf`. 
@@ -173,7 +170,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): elif n_df == "t": return np.log2(1.0 * totaldocs / docfreq) elif n_df == "p": - return max(0,np.log2((1.0 * totaldocs - docfreq) / docfreq)) + return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) def updated_normalize(x, n_n, return_norm=False): @@ -296,8 +293,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize - self.num_docs, self.num_nnz, self.idfs = None, None, None - + self.num_docs, self.num_nnz, self.idfs = None, None, None self.smartirs = smartirs self.slope = slope self.pivot = pivot @@ -306,7 +302,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - self.wlocal = partial(updated_wlocal, n_tf=n_tf) self.wglobal = partial(updated_wglobal, n_df=n_df) # also return norm factor if pivot is not none @@ -374,8 +369,6 @@ def initialize(self, corpus): numnnz += len(bow) for termid, _ in bow: dfs[termid] = dfs.get(termid, 0) + 1 - - # keep some stats about the training corpus self.num_docs = docno + 1 self.num_nnz = numnnz diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst index 73f6cf2be5ade9e0542b41b34898abf42fcece50..8d3c60c73ed35f9d341a1a9e65fd490301539739 100644 GIT binary patch literal 1261 zcma)++fvg&7{`00ZHfm}JRlxXDJoU)gbIR3l$EM0YE)`WleCGXN&ndt24RWi_OX%*>ah!27!)7+S+5PQz-n>m24a;Lrv)uGe%Vp)T?wEBsSLVcY zRxB$>4822g6{(V%OC%CryQz8J3eu!sP4jQF&X8F}wnP#nMUuRWC(V0?_bi`v@iE88 zJRb{uRCq6vR9nsR<1J^=57AXo75<@VPQ&v9%Vfw^kyi^spC2@J*IBVpFzRj32z}pW zN=pwyN9V>V@jkR$rXE_R_E&;};u^Xswv_DR&?>e{`#sIWHZ{pL8Q9J$A$n@kPsbcx z@&gk)DiX6=3_GVM@ZsaT;a@#!ue3I>jCR1BMN4D)qUHKJH+)wWy=q==^H8;x8Fp8( zN3Mv_nZCiWmlULqrH1R*bQk;VO}Np474)rPKSh7Z?&D4zP%|P(npwv|=|zY`!i&Q- 
zxxur@aAXxnd3p?KPubuBE;rjQk6#ez%L+4eR}T~nZ0sv$xZJW798*(VaD5ENd190A zae`v7MAA~nI7*dDW#%_;@z~#?C+Cw>IHh*+ z6;}@(Zv>|!3{#XNoS`^NlJ8`VE zA!jr?o)P-K%apbrgpSTV=HrDsZBq{|Q~T>dL16_&3Z-lphPo)pLwK5p9yQ683~XZ6 z5S#02p1(Qf=%OE(*iuz9^0mRRwTs>|Nf`2s6~hm#N~`1Yg9L%Tq%cEw^*}-2+P-3j zOKnTRHZ{e+Z;W9(Ns#1w^ivF!Nm@#dV@I`Go%zi#`5=i9JL`sK@NgMSu{^O*uuCm* z7xwDc)laLRlk>?b>{fH!;_9K}jbKlNL5jT*_EGF7NxRP;u>b$$fX%ss6o(=lrZ_@U zG2BtX9iuoNp-eGE(lOi#!JVWy6=9g7LNYPjX~CVL7>O`Sah7D)xt@Q2>D!k{oD<-A ziVG1gQjC#YjKfQUyG(HE zM~06qU*-6m=W~J2MLsKhlt`+hW_e88&VnD}Syk2(&s5Fy4Tk5WDARhe>H2lu#S41_ zm9q{jm|sDS;$_*M=XImWg77r6fmbq%L+GM#>UF8lT)%C>n4bLkU0gq8rY^cZuMU_S zQkAR%YXuF8X4$s5sJ$*4HsDB653hyjo4VT1e_GQ5%jg8mS+F#wFIcXxn+$FZEme`* wPLsC*i`%>!d>I0iguRHi~%x zA?h?~p^>#T$)GeeXaE2MLn8zLWMmp-4Ff_df0U{Enw!-F)Mx+z0004?0000001W^% zGy$LiibXU44FDPd27mwn00000001-q02vN9)ipFWKm-7iISr==DeZtoswe^=J%_3~ z0RS^3l!ka3U4JX`ly^4FHH~1{4^;n1ulaz#yUm2to+vQ`CO>VQBO|C{`Rn zq9aWt13)4lAQ*^k7SSvuAQ2(WHCu9SoxE++w~_6_5)ed%=@kG3Twol|0PI~Tj-*5$ zlMmPoWXJ&nODMo6Fv5Zi(NhK|2g$uk8Y|o;qx2&$j#+4HolcJ6e|@L> zEci%H0Fs%3jsPh@EGcs@R|P;?sw=X(QioL0LLqT)Zt(DlvtElFONUOqFB#Q<%9R{; zc~@lSWmc(IF<(m(H7mx2T$n@_LGDDT+fw~sVY+CNk)Z_`4blQb&p_qoS&#=16rmCi zF-pIG%^>Zq#F%MFkdunWtVw-f@;v07l7RP>5jon!nCQG=of}JuSFP+^e>m2|?WGZYN zcQ9L36s5KTHcYg29Jm>r(Ba-|HnR|(nMqn>o>M$}tU5*-Bg5ix#);VdCLVr%lTXo7 zsio?>SF-nC3*~((H#1*T*T4*^jL0KkKS=gsPrT_rX|M>s^`UC)B5CFi71_r;U|FU2}004*p00ICY zFaa9dHX0(C2dU_R=pX<9XwVrRpa`CzC#q@c05oW502%-Q07_|7Aw4Ecn?f0)W{HWT zL4W|#L;Vr~C^N8ePzY=&PU1tjo%n8 Date: Sun, 8 Apr 2018 21:16:20 +0100 Subject: [PATCH 26/26] Flake 8 --- gensim/models/tfidfmodel.py | 4 +++- gensim/test/test_tfidfmodel.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index d6c7a2efaf..68e83d8e6f 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -96,6 +96,7 @@ def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): """ return add + np.log(float(totaldocs) / docfreq) / 
np.log(log_base) + def precompute_idfs(wglobal, dfs, total_docs): """Pre-compute the inverse document frequency mapping for all terms. @@ -118,6 +119,7 @@ def precompute_idfs(wglobal, dfs, total_docs): # this method is here just to speed things up a little. return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} + def updated_wlocal(tf, n_tf): """A scheme to transform `tf` or term frequency based on the value of `n_tf`. @@ -293,7 +295,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize - self.num_docs, self.num_nnz, self.idfs = None, None, None + self.num_docs, self.num_nnz, self.idfs = None, None, None self.smartirs = smartirs self.slope = slope self.pivot = pivot diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 8338fdbd5d..79e3742d48 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -92,8 +92,8 @@ def test_persistence(self): # Test persistence between Gensim v3.2.0 and current model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) - idfs3=[model3.idfs[key] for key in sorted(model3.idfs.keys())] - idfs4=[model4.idfs[key] for key in sorted(model4.idfs.keys())] + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) @@ -113,8 +113,8 @@ def test_persistence(self): # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model. 
model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1) model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) - idfs3=[model3.idfs[key] for key in sorted(model3.idfs.keys())] - idfs4=[model4.idfs[key] for key in sorted(model4.idfs.keys())] + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) @@ -146,8 +146,8 @@ def test_persistence_compressed(self): # Test persistence between Gensim v3.2.0 and current compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) - idfs3=[model3.idfs[key] for key in sorted(model3.idfs.keys())] - idfs4=[model4.idfs[key] for key in sorted(model4.idfs.keys())] + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) @@ -167,8 +167,8 @@ def test_persistence_compressed(self): # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model. 
model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1) model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) - idfs3=[model3.idfs[key] for key in sorted(model3.idfs.keys())] - idfs4=[model4.idfs[key] for key in sorted(model4.idfs.keys())] + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) @@ -241,7 +241,7 @@ def test_consistency(self): (5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526) ] ] - + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))