diff --git a/docs/notebooks/Tensorboard_visualizations.ipynb b/docs/notebooks/Tensorboard_visualizations.ipynb index 915878097e..89ff67151c 100644 --- a/docs/notebooks/Tensorboard_visualizations.ipynb +++ b/docs/notebooks/Tensorboard_visualizations.ipynb @@ -896,7 +896,7 @@ "source": [ "import pandas as pd\n", "import re\n", - "from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation\n", + "from gensim.utils.text_utils import remove_stopwords, strip_punctuation\n", "from gensim.models import ldamodel\n", "from gensim.corpora.dictionary import Dictionary\n", "\n", diff --git a/docs/notebooks/Topic_dendrogram.ipynb b/docs/notebooks/Topic_dendrogram.ipynb index e1632d9d1a..2d7182eec2 100644 --- a/docs/notebooks/Topic_dendrogram.ipynb +++ b/docs/notebooks/Topic_dendrogram.ipynb @@ -161,7 +161,7 @@ "source": [ "from gensim.models.ldamodel import LdaModel\n", "from gensim.corpora import Dictionary\n", - "from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation\n", + "from gensim.utils.text_utils import remove_stopwords, strip_punctuation\n", "\n", "import numpy as np\n", "import pandas as pd\n", diff --git a/docs/notebooks/Training_visualizations.ipynb b/docs/notebooks/Training_visualizations.ipynb index dfd0d8bb81..e78b752da2 100644 --- a/docs/notebooks/Training_visualizations.ipynb +++ b/docs/notebooks/Training_visualizations.ipynb @@ -48,7 +48,7 @@ "from gensim.corpora import Dictionary\n", "import pandas as pd\n", "import re\n", - "from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation\n", + "from gensim.utils.text_utils import remove_stopwords, strip_punctuation\n", "\n", "import numpy as np\n", "\n", diff --git a/docs/notebooks/Wordrank_comparisons.ipynb b/docs/notebooks/Wordrank_comparisons.ipynb index 7bb7fd22c6..c3871405e7 100644 --- a/docs/notebooks/Wordrank_comparisons.ipynb +++ b/docs/notebooks/Wordrank_comparisons.ipynb @@ -38,7 +38,7 @@ ], "source": [ "import nltk\n", - "from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces\n", + "from gensim.utils.text_utils import strip_punctuation, strip_multiple_whitespaces\n", "\n", "# Only the brown corpus is needed in case you don't have it.\n", "nltk.download('brown') \n", diff --git a/docs/notebooks/summarization_tutorial.ipynb b/docs/notebooks/summarization_tutorial.ipynb index 2d45a20c74..cb3b1e0ebe 100644 --- a/docs/notebooks/summarization_tutorial.ipynb +++ b/docs/notebooks/summarization_tutorial.ipynb @@ -23,17 +23,13 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,358 : INFO : Pattern library is not installed, lemmatization won't be available.\n", - "2016-09-19 12:45:22,361 : INFO : Could not import Theano, will use standard float for default ShardedCorpus dtype.\n", - "2016-09-19 12:45:22,372 : INFO : 'pattern' package not found; tag filters are not available for English\n" + "2017-10-10 10:19:01,237 : INFO : 'pattern' package not found; tag filters are not available for English\n" ] } ], @@ -41,7 +37,7 @@ "import logging\n", "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", "\n", - "from gensim.summarization import summarize" + "from gensim.models import summarize" ] }, { @@ -54,9 +50,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -99,18 +93,16 @@ 
{ "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,405 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,405 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,406 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,406 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-10-10 10:19:01,259 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-10-10 10:19:01,260 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-10-10 10:19:01,261 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-10-10 10:19:01,262 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -137,18 +129,16 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,428 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,428 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,429 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,430 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-10-10 10:19:01,270 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-10-10 10:19:01,271 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-10-10 10:19:01,272 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-10-10 10:19:01,272 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -173,18 +163,16 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,446 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,446 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,447 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,447 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-10-10 10:19:01,280 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-10-10 10:19:01,281 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-10-10 10:19:01,282 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) 
from 6 documents (total 68 corpus positions)\n", + "2017-10-10 10:19:01,283 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -213,18 +201,16 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,463 : WARNING : Input text is expected to have at least 10 sentences.\n", - "2016-09-19 12:45:22,464 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:22,464 : INFO : built Dictionary(53 unique tokens: ['realiti', 'averag', 'polic', 'legendari', 'hacker']...) from 6 documents (total 68 corpus positions)\n", - "2016-09-19 12:45:22,465 : WARNING : Input corpus is expected to have at least 10 documents.\n" + "2017-10-10 10:19:01,290 : WARNING : Input text is expected to have at least 10 sentences.\n", + "2017-10-10 10:19:01,291 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-10-10 10:19:01,292 : INFO : built Dictionary(53 unique tokens: [u'electrochem', u'real', u'captur', u'mind', u'agent']...) from 6 documents (total 68 corpus positions)\n", + "2017-10-10 10:19:01,293 : WARNING : Input corpus is expected to have at least 10 documents.\n" ] }, { @@ -251,9 +237,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -265,13 +249,13 @@ "neo\n", "humans body\n", "super\n", - "reality\n", - "hacker\n" + "hacker\n", + "reality\n" ] } ], "source": [ - "from gensim.summarization import keywords\n", + "from gensim.models import keywords\n", "\n", "print ('Keywords:')\n", "print (keywords(text))" @@ -290,18 +274,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:22,510 : INFO : Starting new HTTP connection (1): rare-technologies.com\n", - "2016-09-19 12:45:23,035 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2016-09-19 12:45:23,042 : INFO : built Dictionary(1093 unique tokens: ['realiti', 'keanu', 'miseri', 'vestig', 'massiv']...) from 416 documents (total 2985 corpus positions)\n" + "2017-10-10 10:19:02,079 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-10-10 10:19:02,089 : INFO : built Dictionary(1093 unique tokens: [u'code', u'squiddi', u'relai', u'dinosaur', u'electron']...) from 416 documents (total 2985 corpus positions)\n" ] }, { @@ -355,16 +336,34 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2016-09-19 12:45:25,227 : INFO : Starting new HTTP connection (1): rare-technologies.com\n" + "2017-10-10 10:19:05,119 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", + "2017-10-10 10:19:05,127 : INFO : built Dictionary(1054 unique tokens: [u'fawn', u'windi', u'concept', u'doctor', u'gant']...) 
from 227 documents (total 2434 corpus positions)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary:\n", + "The answering machine records a woman introducing herself as Maude Lebowski and saying that she is the one who took his rug and has sent a car to pick Dude up at his apartment.\n", + "As he climbs out of bed to make a White Russian, Maude asks about the apartment and Dude explains that Treehorn's thugs most likely vandalized it looking for Lebowski's money.\n", + "\n", + "Keywords:\n", + "dude\n", + "dudes\n", + "walter\n", + "lebowski\n", + "brandt\n", + "maude\n", + "donny\n", + "bunny\n" ] } ], @@ -413,23 +412,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" + "pygments_lexer": "ipython2", + "version": "2.7.13" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/docs/notebooks/topic_network.ipynb b/docs/notebooks/topic_network.ipynb index 416cdbc5c5..8c8c6c6225 100644 --- a/docs/notebooks/topic_network.ipynb +++ b/docs/notebooks/topic_network.ipynb @@ -27,7 +27,7 @@ "from gensim.corpora import Dictionary\n", "import pandas as pd\n", "import re\n", - "from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation\n", + "from gensim.utils.text_utils import remove_stopwords, strip_punctuation\n", "\n", "import numpy as np" ] diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 80bfd8547a..eca441371e 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -9,7 +9,8 @@ Modules: :maxdepth: 0 interfaces - utils + utils/utils + utils/text_utils matutils corpora/bleicorpus corpora/csvcorpus @@ -45,6 +46,20 @@ Modules: models/fasttext models/phrases models/coherencemodel + models/_coherence/aggregation + models/_coherence/direct_confirmation_measure + models/_coherence/indirect_confirmation_measure + models/_coherence/probability_estimation + models/_coherence/segmentation + models/_coherence/text_analysis + models/summarization/bm25 + models/summarization/commons + models/summarization/graph + models/summarization/keywords + models/summarization/pagerank_weighted + models/summarization/summariser + models/summarization/syntactic_unit + models/summarization/textcleaner models/basemodel models/callbacks models/wrappers/ldamallet @@ -66,26 +81,6 @@ Modules: sklearn_api/text2bow sklearn_api/tfidf sklearn_api/w2vmodel - topic_coherence/aggregation - topic_coherence/direct_confirmation_measure - topic_coherence/indirect_confirmation_measure - topic_coherence/probability_estimation - topic_coherence/segmentation - topic_coherence/text_analysis scripts/glove2word2vec - scripts/make_wikicorpus - scripts/word2vec_standalone - scripts/make_wiki_online - scripts/make_wiki_online_lemma - scripts/make_wiki_online_nodebug + scripts/make_wiki scripts/word2vec2tensor - parsing/porter - parsing/preprocessing - summarization/bm25 - summarization/commons - summarization/graph - summarization/keywords - summarization/pagerank_weighted - summarization/summariser - summarization/syntactic_unit - summarization/textcleaner diff --git a/docs/src/models/_coherence/aggregation.rst b/docs/src/models/_coherence/aggregation.rst new file mode 100644 index 
0000000000..c32f993f5e --- /dev/null +++ b/docs/src/models/_coherence/aggregation.rst @@ -0,0 +1,9 @@ +:mod:`models._coherence.aggregation` -- Aggregation module +========================================================== + +.. automodule:: gensim.models._coherence.aggregation + :synopsis: Aggregation module + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/topic_coherence/direct_confirmation_measure.rst b/docs/src/models/_coherence/direct_confirmation_measure.rst similarity index 51% rename from docs/src/topic_coherence/direct_confirmation_measure.rst rename to docs/src/models/_coherence/direct_confirmation_measure.rst index ad866f1eb0..fce0f96c25 100644 --- a/docs/src/topic_coherence/direct_confirmation_measure.rst +++ b/docs/src/models/_coherence/direct_confirmation_measure.rst @@ -1,7 +1,7 @@ -:mod:`topic_coherence.direct_confirmation_measure` -- Direct confirmation measure module -======================================================================================== +:mod:`models._coherence.direct_confirmation_measure` -- Direct confirmation measure module +========================================================================================== -.. automodule:: gensim.topic_coherence.direct_confirmation_measure +.. automodule:: gensim.models._coherence.direct_confirmation_measure :synopsis: Direct confirmation measure module :members: :inherited-members: diff --git a/docs/src/topic_coherence/indirect_confirmation_measure.rst b/docs/src/models/_coherence/indirect_confirmation_measure.rst similarity index 50% rename from docs/src/topic_coherence/indirect_confirmation_measure.rst rename to docs/src/models/_coherence/indirect_confirmation_measure.rst index 8bac1585b1..dede2cbcec 100644 --- a/docs/src/topic_coherence/indirect_confirmation_measure.rst +++ b/docs/src/models/_coherence/indirect_confirmation_measure.rst @@ -1,7 +1,7 @@ -:mod:`topic_coherence.indirect_confirmation_measure` -- Indirect confirmation measure module -============================================================================================ +:mod:`models._coherence.indirect_confirmation_measure` -- Indirect confirmation measure module +============================================================================================== -.. automodule:: gensim.topic_coherence.indirect_confirmation_measure +.. automodule:: gensim.models._coherence.indirect_confirmation_measure :synopsis: Indirect confirmation measure module :members: :inherited-members: diff --git a/docs/src/topic_coherence/probability_estimation.rst b/docs/src/models/_coherence/probability_estimation.rst similarity index 54% rename from docs/src/topic_coherence/probability_estimation.rst rename to docs/src/models/_coherence/probability_estimation.rst index 2f81571a62..586c3ee1d2 100644 --- a/docs/src/topic_coherence/probability_estimation.rst +++ b/docs/src/models/_coherence/probability_estimation.rst @@ -1,7 +1,7 @@ -:mod:`topic_coherence.probability_estimation` -- Probability estimation module -============================================================================== +:mod:`models._coherence.probability_estimation` -- Probability estimation module +================================================================================ -.. automodule:: gensim.topic_coherence.probability_estimation +.. 
automodule:: gensim.models._coherence.probability_estimation :synopsis: Probability estimation module :members: :inherited-members: diff --git a/docs/src/models/_coherence/segmentation.rst b/docs/src/models/_coherence/segmentation.rst new file mode 100644 index 0000000000..14031bc203 --- /dev/null +++ b/docs/src/models/_coherence/segmentation.rst @@ -0,0 +1,9 @@ +:mod:`models._coherence.segmentation` -- Segmentation module +============================================================ + +.. automodule:: gensim.models._coherence.segmentation + :synopsis: Segmentation module + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/topic_coherence/text_analysis.rst b/docs/src/models/_coherence/text_analysis.rst similarity index 61% rename from docs/src/topic_coherence/text_analysis.rst rename to docs/src/models/_coherence/text_analysis.rst index f4e3f7254e..d4cd1f1ad3 100644 --- a/docs/src/topic_coherence/text_analysis.rst +++ b/docs/src/models/_coherence/text_analysis.rst @@ -1,7 +1,7 @@ -:mod:`topic_coherence.text_analysis` -- Analyzing the texts of a corpus to accumulate statistical information about word occurrences -==================================================================================================================================== +:mod:`models._coherence.text_analysis` -- Analyzing the texts of a corpus to accumulate statistical information about word occurrences +====================================================================================================================================== -.. automodule:: gensim.topic_coherence.text_analysis +.. automodule:: gensim.models._coherence.text_analysis :synopsis: Analyzing the texts of a corpus to accumulate statistical information about word occurrences :members: :inherited-members: diff --git a/docs/src/summarization/bm25.rst b/docs/src/models/summarization/bm25.rst similarity index 62% rename from docs/src/summarization/bm25.rst rename to docs/src/models/summarization/bm25.rst index 2889788ee4..0bbaa7efda 100644 --- a/docs/src/summarization/bm25.rst +++ b/docs/src/models/summarization/bm25.rst @@ -1,7 +1,7 @@ -:mod:`summarization.bm25` -- BM25 ranking function +:mod:`models.summarization.bm25` -- BM25 ranking function ========================================================= -.. automodule:: gensim.summarization.bm25 +.. automodule:: gensim.models.summarization.bm25 :synopsis: BM25 ranking function :members: :inherited-members: diff --git a/docs/src/models/summarization/commons.rst b/docs/src/models/summarization/commons.rst new file mode 100644 index 0000000000..586b7560a7 --- /dev/null +++ b/docs/src/models/summarization/commons.rst @@ -0,0 +1,9 @@ +:mod:`models.summarization.commons` -- Common graph functions +============================================================= + +.. automodule:: gensim.models.summarization.commons + :synopsis: Common graph functions + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/models/summarization/graph.rst b/docs/src/models/summarization/graph.rst new file mode 100644 index 0000000000..ab15073d06 --- /dev/null +++ b/docs/src/models/summarization/graph.rst @@ -0,0 +1,9 @@ +:mod:`models.summarization.graph` -- TextRank graph +=================================================== + +.. 
automodule:: gensim.models.summarization.graph + :synopsis: TextRank graph + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/summarization/keywords.rst b/docs/src/models/summarization/keywords.rst similarity index 56% rename from docs/src/summarization/keywords.rst rename to docs/src/models/summarization/keywords.rst index 041c5dd10b..cdfb3c1ece 100644 --- a/docs/src/summarization/keywords.rst +++ b/docs/src/models/summarization/keywords.rst @@ -1,7 +1,7 @@ -:mod:`summarization.keywords` -- Keywords for TextRank summarization algorithm -============================================================================== +:mod:`models.summarization.keywords` -- Keywords for TextRank summarization algorithm +===================================================================================== -.. automodule:: gensim.summarization.keywords +.. automodule:: gensim.models.summarization.keywords :synopsis: Keywords for TextRank summarization algorithm :members: :inherited-members: diff --git a/docs/src/summarization/pagerank_weighted.rst b/docs/src/models/summarization/pagerank_weighted.rst similarity index 55% rename from docs/src/summarization/pagerank_weighted.rst rename to docs/src/models/summarization/pagerank_weighted.rst index 0dd9638679..e5944fe8ec 100644 --- a/docs/src/summarization/pagerank_weighted.rst +++ b/docs/src/models/summarization/pagerank_weighted.rst @@ -1,7 +1,7 @@ -:mod:`summarization.pagerank_weighted` -- Weighted PageRank algorithm -===================================================================== +:mod:`models.summarization.pagerank_weighted` -- Weighted PageRank algorithm +============================================================================ -.. automodule:: gensim.summarization.pagerank_weighted +.. automodule:: gensim.models.summarization.pagerank_weighted :synopsis: Weighted PageRank algorithm :members: :inherited-members: diff --git a/docs/src/models/summarization/summariser.rst b/docs/src/models/summarization/summariser.rst new file mode 100644 index 0000000000..ef1cd1c86f --- /dev/null +++ b/docs/src/models/summarization/summariser.rst @@ -0,0 +1,9 @@ +:mod:`models.summarization.summarizer` -- TextRank Summariser +============================================================= + +.. automodule:: gensim.models.summarization.summarizer + :synopsis: TextRank Summariser + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/models/summarization/syntactic_unit.rst b/docs/src/models/summarization/syntactic_unit.rst new file mode 100644 index 0000000000..0b060fa0d0 --- /dev/null +++ b/docs/src/models/summarization/syntactic_unit.rst @@ -0,0 +1,9 @@ +:mod:`models.summarization.syntactic_unit` -- Syntactic Unit class +================================================================== + +.. 
automodule:: gensim.models.summarization.syntactic_unit + :synopsis: Syntactic Unit class + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/summarization/textcleaner.rst b/docs/src/models/summarization/textcleaner.rst similarity index 58% rename from docs/src/summarization/textcleaner.rst rename to docs/src/models/summarization/textcleaner.rst index 72eda3d779..57c0e5d303 100644 --- a/docs/src/summarization/textcleaner.rst +++ b/docs/src/models/summarization/textcleaner.rst @@ -1,7 +1,7 @@ -:mod:`summarization.textcleaner` -- Summarization pre-processing -================================================================ +:mod:`models.summarization.textcleaner` -- Summarization pre-processing +======================================================================= -.. automodule:: gensim.summarization.textcleaner +.. automodule:: gensim.models.summarization.textcleaner :synopsis: Summarization pre-processing :members: :inherited-members: diff --git a/docs/src/parsing/porter.rst b/docs/src/parsing/porter.rst deleted file mode 100644 index 4b8d68c5d8..0000000000 --- a/docs/src/parsing/porter.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`parsing.porter` -- Porter Stemming Algorithm -========================================================= - -.. automodule:: gensim.parsing.porter - :synopsis: Porter Stemming Algorithm - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/parsing/preprocessing.rst b/docs/src/parsing/preprocessing.rst deleted file mode 100644 index 36a2236d07..0000000000 --- a/docs/src/parsing/preprocessing.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`parsing.preprocessing` -- Functions to preprocess raw text -================================================================ - -.. automodule:: gensim.parsing.preprocessing - :synopsis: Functions to preprocess raw text - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/scripts/make_wikicorpus.rst b/docs/src/scripts/make_wiki.rst similarity index 63% rename from docs/src/scripts/make_wikicorpus.rst rename to docs/src/scripts/make_wiki.rst index 56607bd222..d303f858bd 100644 --- a/docs/src/scripts/make_wikicorpus.rst +++ b/docs/src/scripts/make_wiki.rst @@ -1,7 +1,7 @@ -:mod:`scripts.make_wikicorpus` -- Convert articles from a Wikipedia dump to vectors. +:mod:`scripts.make_wiki` -- Convert articles from a Wikipedia dump to vectors. ==================================================================================== -.. automodule:: gensim.scripts.make_wikicorpus +.. automodule:: gensim.scripts.make_wiki :synopsis: Convert articles from a Wikipedia dump to vectors. :members: :inherited-members: diff --git a/docs/src/scripts/make_wiki_online.rst b/docs/src/scripts/make_wiki_online.rst deleted file mode 100644 index fc4e99c839..0000000000 --- a/docs/src/scripts/make_wiki_online.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`scripts.make_wiki_online` -- Convert articles from a Wikipedia dump -========================================================================= - -.. 
automodule:: gensim.scripts.make_wiki_online - :synopsis: Convert articles from a Wikipedia dump - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/scripts/make_wiki_online_lemma.rst b/docs/src/scripts/make_wiki_online_lemma.rst deleted file mode 100644 index 34b821f775..0000000000 --- a/docs/src/scripts/make_wiki_online_lemma.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`scripts.make_wiki_online_lemma` -- Convert articles from a Wikipedia dump -=============================================================================== - -.. automodule:: gensim.scripts.make_wiki_online_lemma - :synopsis: Convert articles from a Wikipedia dump - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/scripts/make_wiki_online_nodebug.rst b/docs/src/scripts/make_wiki_online_nodebug.rst deleted file mode 100644 index 7558549188..0000000000 --- a/docs/src/scripts/make_wiki_online_nodebug.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`scripts.make_wiki_online_nodebug` -- Convert articles from a Wikipedia dump -================================================================================= - -.. automodule:: gensim.scripts.make_wiki_online_nodebug - :synopsis: Convert articles from a Wikipedia dump - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/scripts/word2vec_standalone.rst b/docs/src/scripts/word2vec_standalone.rst deleted file mode 100644 index 85e7505b47..0000000000 --- a/docs/src/scripts/word2vec_standalone.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`scripts.word2vec_standalone` -- Train word2vec on text file CORPUS -======================================================================== - -.. automodule:: gensim.scripts.word2vec_standalone - :synopsis: Train word2vec on text file CORPUS - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/commons.rst b/docs/src/summarization/commons.rst deleted file mode 100644 index 7e859c8937..0000000000 --- a/docs/src/summarization/commons.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.commons` -- Common graph functions -========================================================= - -.. automodule:: gensim.summarization.commons - :synopsis: Common graph functions - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/graph.rst b/docs/src/summarization/graph.rst deleted file mode 100644 index 909b15cf5e..0000000000 --- a/docs/src/summarization/graph.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.graph` -- TextRank graph -========================================================= - -.. automodule:: gensim.summarization.graph - :synopsis: TextRank graph - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/summariser.rst b/docs/src/summarization/summariser.rst deleted file mode 100644 index 42d2967453..0000000000 --- a/docs/src/summarization/summariser.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.summarizer` -- TextRank Summariser -========================================================= - -.. 
automodule:: gensim.summarization.summarizer - :synopsis: TextRank Summariser - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/summarization/syntactic_unit.rst b/docs/src/summarization/syntactic_unit.rst deleted file mode 100644 index 5e20ec5a3e..0000000000 --- a/docs/src/summarization/syntactic_unit.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`summarization.syntactic_unit` -- Syntactic Unit class -=========================================================== - -.. automodule:: gensim.summarization.syntactic_unit - :synopsis: Syntactic Unit class - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/topic_coherence/aggregation.rst b/docs/src/topic_coherence/aggregation.rst deleted file mode 100644 index e8330a838f..0000000000 --- a/docs/src/topic_coherence/aggregation.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`topic_coherence.aggregation` -- Aggregation module -======================================================== - -.. automodule:: gensim.topic_coherence.aggregation - :synopsis: Aggregation module - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/topic_coherence/segmentation.rst b/docs/src/topic_coherence/segmentation.rst deleted file mode 100644 index 883bf18c82..0000000000 --- a/docs/src/topic_coherence/segmentation.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`topic_coherence.segmentation` -- Segmentation module -========================================================== - -.. automodule:: gensim.topic_coherence.segmentation - :synopsis: Segmentation module - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/utils/text_utils.rst b/docs/src/utils/text_utils.rst new file mode 100644 index 0000000000..fced978bd3 --- /dev/null +++ b/docs/src/utils/text_utils.rst @@ -0,0 +1,9 @@ +:mod:`utils.text_utils` -- Functions to preprocess raw text +=========================================================== + +.. automodule:: gensim.utils.text_utils + :synopsis: Functions to preprocess raw text + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/docs/src/utils.rst b/docs/src/utils/utils.rst similarity index 85% rename from docs/src/utils.rst rename to docs/src/utils/utils.rst index 8e303efad2..c82df8e5f9 100644 --- a/docs/src/utils.rst +++ b/docs/src/utils/utils.rst @@ -1,7 +1,7 @@ :mod:`utils` -- Various utility functions ========================================== -.. automodule:: gensim.utils +.. automodule:: gensim.utils.utils :synopsis: Various utility functions :members: :inherited-members: diff --git a/gensim/__init__.py b/gensim/__init__.py index e21dfcb5af..4712bee2d8 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -3,7 +3,7 @@ similarities within a corpus of documents. 
""" -from gensim import parsing, matutils, interfaces, corpora, models, similarities, summarization, utils # noqa:F401 +from gensim import matutils, interfaces, corpora, models, similarities, utils # noqa:F401 import logging __version__ = '3.0.0' diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 23f5fa3bd1..d81231ee20 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -37,14 +37,14 @@ from gensim import interfaces, utils from gensim.corpora.dictionary import Dictionary -from gensim.parsing.preprocessing import STOPWORDS, RE_WHITESPACE +from gensim.utils.text_utils import STOPWORDS, RE_WHITESPACE from gensim.utils import deaccent, simple_tokenize logger = logging.getLogger(__name__) def remove_stopwords(tokens, stopwords=STOPWORDS): - """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`.""" + """Remove stopwords using list from `gensim.utils.text_utils.STOPWORDS`.""" return [token for token in tokens if token not in stopwords] @@ -109,7 +109,7 @@ class TextCorpus(interfaces.CorpusABC): 3. collapse multiple whitespaces into a single one 4. tokenize by splitting on whitespace 5. remove words less than 3 characters long - 6. remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords + 6. remove stopwords; see `gensim.utils.text_utils.STOPWORDS` for the list of stopwords """ def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, token_filters=None): @@ -133,7 +133,7 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter in order, and should return another iterable of tokens. These filters can add, remove, or replace tokens, or do nothing at all. The default token filters remove tokens less than 3 characters long and remove stopwords using the list - in `gensim.parsing.preprocessing.STOPWORDS`. + in `gensim.utils.text_utils.STOPWORDS`. """ self.input = input self.metadata = metadata diff --git a/gensim/examples/dmlcz/__init__.py b/gensim/examples/dmlcz/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py deleted file mode 100644 index 07fc247f8b..0000000000 --- a/gensim/examples/dmlcz/dmlcorpus.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Corpus for the DML-CZ project. -""" - - -import logging -import os.path - -from gensim import interfaces, matutils -import dictionary # for constructing word->id mappings - - -logger = logging.getLogger('gensim.corpora.dmlcorpus') - - -class DmlConfig(object): - """ - DmlConfig contains parameters necessary for the abstraction of a 'corpus of - articles' (see the `DmlCorpus` class). - - Articles may come from different sources (=different locations on disk/network, - different file formats etc.), so the main purpose of DmlConfig is to keep all - sources in one place. - - Apart from glueing sources together, DmlConfig also decides where to store - output files and which articles to accept for the corpus (= an additional filter - over the sources). 
- """ - - def __init__(self, configId, resultDir, acceptLangs=None): - self.resultDir = resultDir # output files will be stored in this directory - self.configId = configId # configId is a string that is used as filename prefix for all files, so keep it simple - self.sources = {} # all article sources; see sources.DmlSource class for an example of source - - if acceptLangs is None: # which languages to accept - acceptLangs = {'any'} # if not specified, accept all languages (including unknown/unspecified) - self.acceptLangs = set(acceptLangs) - logger.info('initialized %s', self) - - def resultFile(self, fname): - return os.path.join(self.resultDir, self.configId + '_' + fname) - - def acceptArticle(self, metadata): - lang = metadata.get('language', 'unk') # if there was no language field in the article metadata, set language to 'unk' = unknown - if 'any' not in self.acceptLangs and lang not in self.acceptLangs: - return False - return True - - def addSource(self, source): - sourceId = str(source) - assert sourceId not in self.sources, "source %s already present in the config!" % sourceId - self.sources[sourceId] = source - - def __str__(self): - return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" % - (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs))) -# endclass DmlConfig - - -class DmlCorpus(interfaces.CorpusABC): - """ - DmlCorpus implements a collection of articles. It is initialized via a DmlConfig - object, which holds information about where to look for the articles and how - to process them. - - Apart from being a regular corpus (bag-of-words iterable with a `len()` method), - DmlCorpus has methods for building a dictionary (mapping between words and - their ids). - """ - - def __init__(self): - self.documents = [] - self.config = None - self.dictionary = dictionary.Dictionary() - - def __len__(self): - return len(self.documents) - - def __iter__(self): - """ - The function that defines a corpus -- iterating over the corpus yields - bag-of-words vectors, one for each document. - - A bag-of-words vector is simply a list of ``(tokenId, tokenCount)`` 2-tuples. - """ - for docNo, (sourceId, docUri) in enumerate(self.documents): - source = self.config.sources[sourceId] - - contents = source.getContent(docUri) - words = [source.normalizeWord(word) for word in source.tokenize(contents)] - yield self.dictionary.doc2bow(words, allowUpdate=False) - - def buildDictionary(self): - """ - Populate dictionary mapping and statistics. - - This is done by sequentially retrieving the article fulltexts, splitting - them into tokens and converting tokens to their ids (creating new ids as - necessary). 
- """ - logger.info("creating dictionary from %i articles", len(self.documents)) - self.dictionary = dictionary.Dictionary() - numPositions = 0 - for docNo, (sourceId, docUri) in enumerate(self.documents): - if docNo % 1000 == 0: - logger.info("PROGRESS: at document #%i/%i (%s, %s)", docNo, len(self.documents), sourceId, docUri) - source = self.config.sources[sourceId] - contents = source.getContent(docUri) - words = [source.normalizeWord(word) for word in source.tokenize(contents)] - numPositions += len(words) - - # convert to bag-of-words, but ignore the result -- here we only care about updating token ids - _ = self.dictionary.doc2bow(words, allowUpdate=True) # noqa:F841 - logger.info("built %s from %i documents (total %i corpus positions)", self.dictionary, len(self.documents), numPositions) - - def processConfig(self, config, shuffle=False): - """ - Parse the directories specified in the config, looking for suitable articles. - - This updates the self.documents var, which keeps a list of (source id, - article uri) 2-tuples. Each tuple is a unique identifier of one article. - - Note that some articles are ignored based on config settings (for example - if the article's language doesn't match any language specified in the - config etc.). - """ - self.config = config - self.documents = [] - logger.info("processing config %s", config) - for sourceId, source in config.sources.iteritems(): - logger.info("processing source '%s'", sourceId) - accepted = [] - for articleUri in source.findArticles(): - meta = source.getMeta(articleUri) # retrieve metadata (= dictionary of key->value) - if config.acceptArticle(meta): # do additional filtering on articles, based on the article's metadata - accepted.append((sourceId, articleUri)) - logger.info("accepted %i articles for source '%s'", len(accepted), sourceId) - self.documents.extend(accepted) - - if not self.documents: - logger.warning('no articles at all found from the config; something went wrong!') - - if shuffle: - logger.info("shuffling %i documents for random order", len(self.documents)) - import random - random.shuffle(self.documents) - - logger.info("accepted total of %i articles for %s", len(self.documents), str(config)) - - def saveDictionary(self, fname): - logger.info("saving dictionary mapping to %s", fname) - fout = open(fname, 'w') - for tokenId, token in self.dictionary.id2token.iteritems(): - fout.write("%i\t%s\n" % (tokenId, token)) - fout.close() - - @staticmethod - def loadDictionary(fname): - result = {} - for lineNo, line in enumerate(open(fname)): - pair = line[:-1].split('\t') - if len(pair) != 2: - continue - wordId, word = pair - result[int(wordId)] = word - return result - - def saveDocuments(self, fname): - logger.info("saving documents mapping to %s", fname) - fout = open(fname, 'w') - for docNo, docId in enumerate(self.documents): - sourceId, docUri = docId - intId, pathId = docUri - fout.write("%i\t%s\n" % (docNo, repr(docId))) - fout.close() - - def saveAsText(self): - """ - Store the corpus to disk, in a human-readable text format. - - This actually saves multiple files: - - 1. Pure document-term co-occurence frequency counts, as a Matrix Market file. - 2. Token to integer mapping, as a text file. - 3. Document to document URI mapping, as a text file. - - The exact filesystem paths and filenames are determined from the config. 
- """ - self.saveDictionary(self.config.resultFile('wordids.txt')) - self.saveDocuments(self.config.resultFile('docids.txt')) - matutils.MmWriter.writeCorpus(self.config.resultFile('bow.mm'), self) - - def articleDir(self, docNo): - """ - Return absolute normalized path on filesystem to article no. `docNo`. - """ - sourceId, (_, outPath) = self.documents[docNo] - source = self.config.sources[sourceId] - return os.path.join(source.baseDir, outPath) - - def getMeta(self, docNo): - """ - Return metadata for article no. `docNo`. - """ - sourceId, uri = self.documents[docNo] - source = self.config.sources[sourceId] - return source.getMeta(uri) -# endclass DmlCorpus diff --git a/gensim/examples/dmlcz/gensim_build.py b/gensim/examples/dmlcz/gensim_build.py deleted file mode 100755 index bb62103109..0000000000 --- a/gensim/examples/dmlcz/gensim_build.py +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -USAGE: %(program)s LANGUAGE - Process the repository, accepting articles in LANGUAGE (or 'any'). - Store the word co-occurence matrix and id mappings, which are needed for subsequent processing. - -Example: ./gensim_build.py eng -""" - - -import logging -import sys -import os.path - -from gensim.corpora import sources, dmlcorpus - -PREFIX = 'dmlcz' - -AT_HOME = False - -if AT_HOME: - SOURCE_LIST = [ - sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'), - sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'), - sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'), - ] - - RESULT_DIR = '/Users/kofola/workspace/dml/data/results' - -else: - - SOURCE_LIST = [ - sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'), - sources.DmlSource('numdam', '/data/dmlcz/data/numdam'), - sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'), - ] - - RESULT_DIR = '/data/dmlcz/xrehurek/results' - - -def buildDmlCorpus(config): - dml = dmlcorpus.DmlCorpus() - dml.processConfig(config, shuffle=True) - dml.buildDictionary() - dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words - - dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) - dml.saveAsText() # save id mappings and documents as text data (matrix market format) - return dml - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level=logging.INFO) - logging.info("running %s", ' '.join(sys.argv)) - - program = os.path.basename(sys.argv[0]) - - # check and process input arguments - if len(sys.argv) < 2: - print(globals()['__doc__'] % locals()) - sys.exit(1) - language = sys.argv[1] - - # construct the config, which holds information about sources, data file filenames etc. 
- config = dmlcorpus.DmlConfig('%s_%s' % (PREFIX, language), resultDir=RESULT_DIR, acceptLangs=[language]) - for source in SOURCE_LIST: - config.addSource(source) - buildDmlCorpus(config) - - logging.info("finished running %s", program) diff --git a/gensim/examples/dmlcz/gensim_genmodel.py b/gensim/examples/dmlcz/gensim_genmodel.py deleted file mode 100755 index a2f2b792e7..0000000000 --- a/gensim/examples/dmlcz/gensim_genmodel.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -USAGE: %(program)s LANGUAGE METHOD - Generate topic models for the specified subcorpus. METHOD is currently one \ -of 'tfidf', 'lsi', 'lda', 'rp'. - -Example: ./gensim_genmodel.py any lsi -""" - - -import logging -import sys -import os.path - -from gensim.corpora import dmlcorpus, MmCorpus -from gensim.models import lsimodel, ldamodel, tfidfmodel, rpmodel - -import gensim_build - - -# internal method parameters -DIM_RP = 300 # dimensionality for random projections -DIM_LSI = 200 # for lantent semantic indexing -DIM_LDA = 100 # for latent dirichlet allocation - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level=logging.INFO) - logging.info("running %s", ' '.join(sys.argv)) - - program = os.path.basename(sys.argv[0]) - - # check and process input arguments - if len(sys.argv) < 3: - print(globals()['__doc__'] % locals()) - sys.exit(1) - language = sys.argv[1] - method = sys.argv[2].strip().lower() - - logging.info("loading corpus mappings") - config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), - resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) - - logging.info("loading word id mapping from %s", config.resultFile('wordids.txt')) - id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) - logging.info("loaded %i word ids", len(id2word)) - - corpus = MmCorpus(config.resultFile('bow.mm')) - - if method == 'tfidf': - model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) - model.save(config.resultFile('model_tfidf.pkl')) - elif method == 'lda': - model = ldamodel.LdaModel(corpus, id2word=id2word, num_topics=DIM_LDA) - model.save(config.resultFile('model_lda.pkl')) - elif method == 'lsi': - # first, transform word counts to tf-idf weights - tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) - # then find the transformation from tf-idf to latent space - model = lsimodel.LsiModel(tfidf[corpus], id2word=id2word, num_topics=DIM_LSI) - model.save(config.resultFile('model_lsi.pkl')) - elif method == 'rp': - # first, transform word counts to tf-idf weights - tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) - # then find the transformation from tf-idf to latent space - model = rpmodel.RpModel(tfidf[corpus], id2word=id2word, num_topics=DIM_RP) - model.save(config.resultFile('model_rp.pkl')) - else: - raise ValueError('unknown topic extraction method: %s' % repr(method)) - - MmCorpus.saveCorpus(config.resultFile('%s.mm' % method), model[corpus]) - - logging.info("finished running %s", program) diff --git a/gensim/examples/dmlcz/gensim_xml.py b/gensim/examples/dmlcz/gensim_xml.py deleted file mode 100755 index 0b8661ac77..0000000000 --- a/gensim/examples/dmlcz/gensim_xml.py +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - 
http://www.gnu.org/licenses/lgpl.html - -""" -USAGE: %(program)s LANGUAGE METHOD - Generate similar.xml files, using a previously built model for METHOD. - -Example: ./gensim_xml.py eng lsi -""" - - -import logging -import sys -import os.path - -from gensim.corpora import dmlcorpus, MmCorpus -from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity - -import gensim_build - - -# set to True to do everything EXCEPT actually writing out similar.xml files to disk. -# similar.xml files are NOT written if DRY_RUN is true. -DRY_RUN = False - -# how many 'most similar' documents to store in each similar.xml? -MIN_SCORE = 0.0 # prune based on similarity score (all below MIN_SCORE are ignored) -MAX_SIMILAR = 10 # prune based on rank (at most MAX_SIMILAR are stored). set to 0 to store all of them (no limit). - -# if there are no similar articles (after the pruning), do we still want to generate similar.xml? -SAVE_EMPTY = True - -# xml template for similar articles -ARTICLE = """ -
- <authors> - <author>%(author)s</author> - </authors> - <title>%(title)s</title> - <suffix>%(suffix)s</suffix> - <links> - <link source="%(source)s" id="%(intId)s" path="%(pathId)s" /> - </links> - </article>
""" - -# template for the whole similar.xml file (will be filled with multiple ARTICLE instances) -SIMILAR = """\ - -%s - -""" - - -def generateSimilar(corpus, index, method): - for docNo, topSims in enumerate(index): # for each document - # store similarities to the following file - outfile = os.path.join(corpus.articleDir(docNo), 'similar_%s.xml' % method) - - articles = [] # collect similars in this list - for docNo2, score in topSims: # for each most similar article - if score > MIN_SCORE and docNo != docNo2: # if similarity is above MIN_SCORE and not identity (=always maximum similarity, boring) - source, (intId, pathId) = corpus.documents[docNo2] - meta = corpus.getMeta(docNo2) - suffix, author, title = '', meta.get('author', ''), meta.get('title', '') - articles.append(ARTICLE % locals()) # add the similar article to output - if len(articles) >= MAX_SIMILAR: - break - - # now `articles` holds multiple strings in similar_*.xml format - if SAVE_EMPTY or articles: - output = ''.join(articles) # concat all similars to one string - if not DRY_RUN: # only open output files for writing if DRY_RUN is false - logging.info("generating %s (%i similars)", outfile, len(articles)) - outfile = open(outfile, 'w') - outfile.write(SIMILAR % output) # add xml headers and print to file - outfile.close() - else: - logging.info("would be generating %s (%i similars):%s\n", outfile, len(articles), output) - else: - logging.debug("skipping %s (no similar found)", outfile) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level=logging.INFO) - logging.info("running %s", ' '.join(sys.argv)) - - program = os.path.basename(sys.argv[0]) - - # check and process input arguments - if len(sys.argv) < 3: - print(globals()['__doc__'] % locals()) - sys.exit(1) - language = sys.argv[1] - method = sys.argv[2].strip().lower() - - logging.info("loading corpus mappings") - config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language), - resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) - - logging.info("loading word id mapping from %s", config.resultFile('wordids.txt')) - id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) - logging.info("loaded %i word ids", len(id2word)) - - corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) - input = MmCorpus(config.resultFile('_%s.mm' % method)) - assert len(input) == len(corpus), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (len(input), len(corpus)) - - # initialize structure for similarity queries - if method == 'lsi' or method == 'rp': # for these methods, use dense vectors - index = MatrixSimilarity(input, num_best=MAX_SIMILAR + 1, num_features=input.numTerms) - else: - index = SparseMatrixSimilarity(input, num_best=MAX_SIMILAR + 1) - - index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) - generateSimilar(corpus, index, method) # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format - - logging.info("finished running %s", program) diff --git a/gensim/examples/dmlcz/runall.sh b/gensim/examples/dmlcz/runall.sh deleted file mode 100644 index 236c1dce80..0000000000 --- a/gensim/examples/dmlcz/runall.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# full path to gensim executables -BIN_PATH=~/xrehurek/gensim/dmlcz - -# intermediate data will be stored to this dir -RESULT_PATH=~/xrehurek/results - 
-# set python path, so that python can find and import gensim modules -export PYTHONPATH=~/xrehurek:${PYTHONPATH} - -# Language is set to 'any', meaning all articles are processed for similarity in -# one go, regardless of their language. -# Set language to 'eng', 'fre', 'rus' etc. to only process a specific subset of -# articles (an article's language is determined from its metadata). -language=any - - -# ========== parse all article sources, build article co-occurence matrix ====== -${BIN_PATH}/gensim_build.py ${language} 2>&1 | tee ${RESULT_PATH}/gensim_build.log - - -# ========== build transformation models ======================================= -for method in tfidf rp; -do - ( ${BIN_PATH}/gensim_genmodel.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log ) & -done -wait - -method=lsi -${BIN_PATH}/gensim_genmodel.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_genmodel_${method}.log - - -# =========== generate output xml files ======================================== -# generate xml files for all methods at once, in parallel, to save time. -# NOTE if out of memory, move tfidf out of the loop (tfidf uses a lot of memory here) -for method in tfidf lsi rp; -do - ( ${BIN_PATH}/gensim_xml.py ${language} ${method} 2>&1 | tee ${RESULT_PATH}/gensim_xml_${method}.log ) & -done -wait diff --git a/gensim/examples/dmlcz/sources.py b/gensim/examples/dmlcz/sources.py deleted file mode 100644 index 4193da0820..0000000000 --- a/gensim/examples/dmlcz/sources.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -This module contains implementations (= different classes) which encapsulate the -idea of a Digital Library document source. - -A document source is basically a collection of articles sharing the same format, -same location (type of access), same way of parsing them etc. - -Different sources can be aggregated into a single corpus, which is what the -`DmlCorpus` class does (see the `dmlcorpus` module). -""" - -import logging -import os -import os.path -import re - -import xml.sax # for parsing arxmliv articles - -from gensim import utils - -import sys -if sys.version_info[0] >= 3: - unicode = str - -PAT_TAG = re.compile('<(.*?)>(.*)') -logger = logging.getLogger('gensim.corpora.sources') - - -class ArticleSource(object): - """ - Objects of this class describe a single source of articles. - - A source is an abstraction over where the documents reside (the findArticles() - method), how to retrieve their fulltexts, their metadata, how to tokenize the - articles and how to normalize the tokens. - - What is NOT abstracted away (ie. must hold for all sources) is the idea of - article identifiers (URIs), which uniquely identify each article within - one source. - - This class is just an ABC interface; see eg. DmlSource or ArxmlivSource classes - for concrete instances. 
- """ - - def __init__(self, sourceId): - self.sourceId = sourceId - - def __str__(self): - return self.sourceId - - def findArticles(self): - raise NotImplementedError('Abstract Base Class') - - def getContent(self, uri): - raise NotImplementedError('Abstract Base Class') - - def getMeta(self, uri): - raise NotImplementedError('Abstract Base Class') - - def tokenize(self, content): - raise NotImplementedError('Abstract Base Class') - - def normalizeWord(self, word): - raise NotImplementedError('Abstract Base Class') -# endclass ArticleSource - - -class DmlSource(ArticleSource): - """ - Article source for articles in DML format (DML-CZ, Numdam): - 1) articles = directories starting with '#' - 2) content is stored in fulltext.txt - 3) metadata are stored in meta.xml - - Article URI is currently (a part of) the article's path on filesystem. - - See the ArticleSource class for general info on sources. - """ - - def __init__(self, sourceId, baseDir): - self.sourceId = sourceId - self.baseDir = os.path.normpath(baseDir) - - def __str__(self): - return self.sourceId - - @classmethod - def parseDmlMeta(cls, xmlfile): - """ - Parse out all fields from meta.xml, return them as a dictionary. - """ - result = {} - xml = open(xmlfile) - for line in xml: - if line.find('
<article>') >= 0: # skip until the beginning of <article>
tag - break - for line in xml: - if line.find('
</article>') >= 0: # end of <article>
, we're done - break - p = re.search(PAT_TAG, line) # HAX assumes one element = one line; proper xml parsing probably better... but who cares - if p: - name, cont = p.groups() - name = name.split()[0] - name, cont = name.strip(), cont.strip() - if name == 'msc': - if len(cont) != 5: - logger.warning('invalid MSC=%s in %s', cont, xmlfile) - result.setdefault('msc', []).append(cont) - continue - if name == 'idMR': - cont = cont[2:] # omit MR from MR123456 - if name and cont: - result[name] = cont - xml.close() - return result - - def idFromDir(self, path): - assert len(path) > len(self.baseDir) - intId = path[1 + path.rfind('#'):] - pathId = path[1 + len(self.baseDir):] - return (intId, pathId) - - def isArticle(self, path): - # in order to be valid, the article directory must start with '#' - if not os.path.basename(path).startswith('#'): - return False - # and contain the fulltext.txt file - if not os.path.exists(os.path.join(path, 'fulltext.txt')): - logger.info('missing fulltext in %s', path) - return False - # and also the meta.xml file - if not os.path.exists(os.path.join(path, 'meta.xml')): - logger.info('missing meta.xml in %s', path) - return False - return True - - def findArticles(self): - dirTotal = artAccepted = 0 - logger.info("looking for '%s' articles inside %s", self.sourceId, self.baseDir) - for root, dirs, files in os.walk(self.baseDir): - dirTotal += 1 - root = os.path.normpath(root) - if self.isArticle(root): - artAccepted += 1 - yield self.idFromDir(root) - logger.info('%i directories processed, found %i articles', dirTotal, artAccepted) - - def getContent(self, uri): - """ - Return article content as a single large string. - """ - intId, pathId = uri - filename = os.path.join(self.baseDir, pathId, 'fulltext.txt') - return open(filename).read() - - def getMeta(self, uri): - """ - Return article metadata as a attribute->value dictionary. - """ - intId, pathId = uri - filename = os.path.join(self.baseDir, pathId, 'meta.xml') - return DmlSource.parseDmlMeta(filename) - - def tokenize(self, content): - return [token.encode('utf8') for token in utils.tokenize(content, errors='ignore') if not token.isdigit()] - - def normalizeWord(self, word): - wordU = unicode(word, 'utf8') - return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring -# endclass DmlSource - - -class DmlCzSource(DmlSource): - """ - Article source for articles in DML-CZ format: - 1) articles = directories starting with '#' - 2) content is stored in fulltext.txt or fulltext_dspace.txt - 3) there exists a dspace_id file, containing internal dmlcz id - 3) metadata are stored in meta.xml - - See the ArticleSource class for general info on sources. 
- """ - - def idFromDir(self, path): - assert len(path) > len(self.baseDir) - dmlczId = open(os.path.join(path, 'dspace_id')).read().strip() - pathId = path[1 + len(self.baseDir):] - return (dmlczId, pathId) - - def isArticle(self, path): - # in order to be valid, the article directory must start with '#' - if not os.path.basename(path).startswith('#'): - return False - # and contain a dspace_id file - if not (os.path.exists(os.path.join(path, 'dspace_id'))): - logger.info('missing dspace_id in %s', path) - return False - # and contain either fulltext.txt or fulltext_dspace.txt file - if not (os.path.exists(os.path.join(path, 'fulltext.txt')) or os.path.exists(os.path.join(path, 'fulltext-dspace.txt'))): - logger.info('missing fulltext in %s', path) - return False - # and contain the meta.xml file - if not os.path.exists(os.path.join(path, 'meta.xml')): - logger.info('missing meta.xml in %s', path) - return False - return True - - def getContent(self, uri): - """ - Return article content as a single large string. - """ - intId, pathId = uri - filename1 = os.path.join(self.baseDir, pathId, 'fulltext.txt') - filename2 = os.path.join(self.baseDir, pathId, 'fulltext-dspace.txt') - - if os.path.exists(filename1) and os.path.exists(filename2): - # if both fulltext and dspace files exist, pick the larger one - if os.path.getsize(filename1) < os.path.getsize(filename2): - filename = filename2 - else: - filename = filename1 - elif os.path.exists(filename1): - filename = filename1 - else: - assert os.path.exists(filename2) - filename = filename2 - return open(filename).read() -# endclass DmlCzSource - - -class ArxmlivSource(ArticleSource): - """ - Article source for articles in arxmliv format: - 1) articles = directories starting with '#' - 2) content is stored in tex.xml - 3) metadata in special tags within tex.xml - - Article URI is currently (a part of) the article's path on filesystem. - - See the ArticleSource class for general info on sources. - """ - class ArxmlivContentHandler(xml.sax.handler.ContentHandler): - def __init__(self): - self.path = [''] # help structure for sax event parsing - self.tokens = [] # will contain tokens once parsing is finished - - def startElement(self, name, attr): - # for math tokens, we only care about Math elements directly below
<p>
- if name == 'Math' and self.path[-1] == 'p' and attr.get('mode', '') == 'inline': - tex = attr.get('tex', '') - if tex and not tex.isdigit(): - self.tokens.append('$%s$' % tex.encode('utf8')) - self.path.append(name) - - def endElement(self, name): - self.path.pop() - - def characters(self, text): - # for text, we only care about tokens directly within the
<p>
tag - if self.path[-1] == 'p': - tokens = [token.encode('utf8') for token in utils.tokenize(text, errors='ignore') if not token.isdigit()] - self.tokens.extend(tokens) - # endclass ArxmlivHandler - - class ArxmlivErrorHandler(xml.sax.handler.ErrorHandler): - # Python2.5 implementation of xml.sax is broken -- character streams and - # byte encodings of InputSource are ignored, bad things sometimes happen - # in buffering of multi-byte files (such as utf8), characters get cut in - # the middle, resulting in invalid tokens... - # This is not really a problem with arxmliv xml files themselves, so ignore - # these errors silently. - def error(self, exception): - pass - - warning = fatalError = error - # endclass ArxmlivErrorHandler - - def __init__(self, sourceId, baseDir): - self.sourceId = sourceId - self.baseDir = os.path.normpath(baseDir) - - def __str__(self): - return self.sourceId - - def idFromDir(self, path): - assert len(path) > len(self.baseDir) - intId = path[1 + path.rfind('#'):] - pathId = path[1 + len(self.baseDir):] - return (intId, pathId) - - def isArticle(self, path): - # in order to be valid, the article directory must start with '#' - if not os.path.basename(path).startswith('#'): - return False - # and contain the tex.xml file - if not os.path.exists(os.path.join(path, 'tex.xml')): - logger.warning('missing tex.xml in %s', path) - return False - return True - - def findArticles(self): - dirTotal = artAccepted = 0 - logger.info("looking for '%s' articles inside %s", self.sourceId, self.baseDir) - for root, dirs, files in os.walk(self.baseDir): - dirTotal += 1 - root = os.path.normpath(root) - if self.isArticle(root): - artAccepted += 1 - yield self.idFromDir(root) - logger.info('%i directories processed, found %i articles', dirTotal, artAccepted) - - def getContent(self, uri): - """ - Return article content as a single large string. - """ - intId, pathId = uri - filename = os.path.join(self.baseDir, pathId, 'tex.xml') - return open(filename).read() - - def getMeta(self, uri): - """ - Return article metadata as an attribute->value dictionary. - """ -# intId, pathId = uri -# filename = os.path.join(self.baseDir, pathId, 'tex.xml') - return {'language': 'eng'} # TODO maybe parse out some meta; but currently not needed for anything... - - def tokenize(self, content): - """ - Parse tokens out of xml. There are two types of token: normal text and - mathematics. Both are returned interspersed in a single list, in the same - order as they appeared in the content. - - The math tokens will be returned in the form $tex_expression$, ie. with - a dollar sign prefix and suffix. - """ - handler = ArxmlivSource.ArxmlivContentHandler() - xml.sax.parseString(content, handler, ArxmlivSource.ArxmlivErrorHandler()) - return handler.tokens - - def normalizeWord(self, word): - if word[0] == '$': # ignore math tokens - return word - wordU = unicode(word, 'utf8') - return wordU.lower().encode('utf8') # lowercase and then convert back to bytestring -# endclass ArxmlivSource diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 530f7c4980..6948d8fdfd 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -19,6 +19,7 @@ from .normmodel import NormModel # noqa:F401 from .atmodel import AuthorTopicModel # noqa:F401 from .ldaseqmodel import LdaSeqModel # noqa:F401 +from .summarization import keywords, summarize, summarize_corpus # noqa:F401 from . 
import wrappers # noqa:F401 diff --git a/gensim/topic_coherence/__init__.py b/gensim/models/_coherence/__init__.py similarity index 100% rename from gensim/topic_coherence/__init__.py rename to gensim/models/_coherence/__init__.py diff --git a/gensim/topic_coherence/aggregation.py b/gensim/models/_coherence/aggregation.py similarity index 100% rename from gensim/topic_coherence/aggregation.py rename to gensim/models/_coherence/aggregation.py diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/models/_coherence/direct_confirmation_measure.py similarity index 100% rename from gensim/topic_coherence/direct_confirmation_measure.py rename to gensim/models/_coherence/direct_confirmation_measure.py diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/models/_coherence/indirect_confirmation_measure.py similarity index 98% rename from gensim/topic_coherence/indirect_confirmation_measure.py rename to gensim/models/_coherence/indirect_confirmation_measure.py index 33b42223bb..f6571b779c 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/models/_coherence/indirect_confirmation_measure.py @@ -32,9 +32,7 @@ import numpy as np import scipy.sparse as sps - -from gensim.topic_coherence.direct_confirmation_measure import ( - aggregate_segment_sims, log_ratio_measure) +from gensim.models._coherence.direct_confirmation_measure import aggregate_segment_sims, log_ratio_measure logger = logging.getLogger(__name__) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/models/_coherence/probability_estimation.py similarity index 97% rename from gensim/topic_coherence/probability_estimation.py rename to gensim/models/_coherence/probability_estimation.py index f59692bdcc..dbc0067715 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/models/_coherence/probability_estimation.py @@ -11,9 +11,9 @@ import itertools import logging -from gensim.topic_coherence.text_analysis import ( - CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, - WordVectorsAccumulator) +from gensim.models._coherence.text_analysis import ( + CorpusAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, WordVectorsAccumulator +) logger = logging.getLogger(__name__) diff --git a/gensim/topic_coherence/segmentation.py b/gensim/models/_coherence/segmentation.py similarity index 100% rename from gensim/topic_coherence/segmentation.py rename to gensim/models/_coherence/segmentation.py diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/models/_coherence/text_analysis.py similarity index 99% rename from gensim/topic_coherence/text_analysis.py rename to gensim/models/_coherence/text_analysis.py index 340286c8d1..4aa4025da5 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/models/_coherence/text_analysis.py @@ -31,7 +31,7 @@ def _ids_to_words(ids, dictionary): Args: ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). - This is the format returned by the topic_coherence.segmentation functions. + This is the format returned by the models._coherence.segmentation functions. 
""" if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 583e2999b6..3ff41d65b8 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -23,13 +23,11 @@ from collections import namedtuple import numpy as np - -from gensim import interfaces, matutils -from gensim import utils -from gensim.topic_coherence import (segmentation, probability_estimation, - direct_confirmation_measure, indirect_confirmation_measure, - aggregation) -from gensim.topic_coherence.probability_estimation import unique_ids_from_segments +from gensim.models._coherence import ( + segmentation, probability_estimation, direct_confirmation_measure, indirect_confirmation_measure, aggregation +) +from gensim.models._coherence.probability_estimation import unique_ids_from_segments +from gensim import interfaces, matutils, utils logger = logging.getLogger(__name__) diff --git a/gensim/summarization/__init__.py b/gensim/models/summarization/__init__.py similarity index 100% rename from gensim/summarization/__init__.py rename to gensim/models/summarization/__init__.py diff --git a/gensim/summarization/bm25.py b/gensim/models/summarization/bm25.py similarity index 100% rename from gensim/summarization/bm25.py rename to gensim/models/summarization/bm25.py diff --git a/gensim/summarization/commons.py b/gensim/models/summarization/commons.py similarity index 90% rename from gensim/summarization/commons.py rename to gensim/models/summarization/commons.py index 1c467098f9..6478c4625d 100644 --- a/gensim/summarization/commons.py +++ b/gensim/models/summarization/commons.py @@ -3,7 +3,7 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -from gensim.summarization.graph import Graph +from gensim.models.summarization.graph import Graph def build_graph(sequence): diff --git a/gensim/summarization/graph.py b/gensim/models/summarization/graph.py similarity index 100% rename from gensim/summarization/graph.py rename to gensim/models/summarization/graph.py diff --git a/gensim/summarization/keywords.py b/gensim/models/summarization/keywords.py similarity index 94% rename from gensim/summarization/keywords.py rename to gensim/models/summarization/keywords.py index 1630c9389d..b3cadb7308 100644 --- a/gensim/summarization/keywords.py +++ b/gensim/models/summarization/keywords.py @@ -3,17 +3,17 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank -from gensim.summarization.textcleaner import clean_text_by_word as _clean_text_by_word -from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word -from gensim.summarization.commons import build_graph as _build_graph -from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes -from gensim.utils import to_unicode from itertools import combinations as _combinations -from six.moves.queue import Queue as _Queue -from six.moves import xrange -from six import iteritems +from gensim.models.summarization.commons import build_graph as _build_graph +from gensim.models.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes +from gensim.models.summarization.pagerank_weighted import pagerank_weighted as _pagerank +from gensim.models.summarization.textcleaner import 
clean_text_by_word as _clean_text_by_word +from gensim.models.summarization.textcleaner import tokenize_by_word as _tokenize_by_word +from gensim.utils import to_unicode +from six import iteritems +from six.moves import xrange +from six.moves.queue import Queue as _Queue WINDOW_SIZE = 2 diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/models/summarization/pagerank_weighted.py similarity index 100% rename from gensim/summarization/pagerank_weighted.py rename to gensim/models/summarization/pagerank_weighted.py diff --git a/gensim/summarization/summarizer.py b/gensim/models/summarization/summarizer.py similarity index 94% rename from gensim/summarization/summarizer.py rename to gensim/models/summarization/summarizer.py index d187330a58..fb2347e436 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/models/summarization/summarizer.py @@ -4,15 +4,15 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html import logging -from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank -from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences -from gensim.summarization.commons import build_graph as _build_graph -from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes -from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights -from gensim.corpora import Dictionary from math import log10 as _log10 -from six.moves import xrange +from gensim.corpora import Dictionary +from gensim.models.summarization.commons import build_graph as _build_graph +from gensim.models.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes +from gensim.models.summarization.bm25 import get_bm25_weights as _bm25_weights +from gensim.models.summarization.pagerank_weighted import pagerank_weighted as _pagerank +from gensim.models.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences +from six.moves import xrange INPUT_MIN_LENGTH = 10 diff --git a/gensim/summarization/syntactic_unit.py b/gensim/models/summarization/syntactic_unit.py similarity index 100% rename from gensim/summarization/syntactic_unit.py rename to gensim/models/summarization/syntactic_unit.py diff --git a/gensim/summarization/textcleaner.py b/gensim/models/summarization/textcleaner.py similarity index 96% rename from gensim/summarization/textcleaner.py rename to gensim/models/summarization/textcleaner.py index fa6a56b887..80c750d340 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/models/summarization/textcleaner.py @@ -3,12 +3,13 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -from gensim.summarization.syntactic_unit import SyntacticUnit -from gensim.parsing.preprocessing import preprocess_documents +import logging +import re + +from gensim.models.summarization.syntactic_unit import SyntacticUnit +from gensim.utils.text_utils import preprocess_documents from gensim.utils import tokenize from six.moves import xrange -import re -import logging logger = logging.getLogger('summa.preprocessing.cleaner') diff --git a/gensim/nosy.py b/gensim/nosy.py deleted file mode 100644 index 0606166449..0000000000 --- a/gensim/nosy.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python - -""" -A simple testrunner for nose (or anything else). - -Watch for changes in all file types specified in 'EXTENSIONS'. -If changes, run test executable in 'EXECUTABLE', with default -arguments 'DEFAULTARGS'. 
- -The --with-color option needs the "rudolf" nose plugin. See: -http://pypi.python.org/pypi/rudolf/ - -Originally by Jeff Winkler, http://jeffwinkler.net -Forked from wkral http://github.com/wkral/Nosy -""" - -import os -import stat -import time -import datetime -import sys -import fnmatch - - -EXTENSIONS = ['*.py'] -EXECUTABLE = 'nosetests test/' -DEFAULTARGS = '--with-color -exe' # -w tests' - - -def check_sum(): - """ - Return a long which can be used to know if any .py files have changed. - """ - val = 0 - for root, dirs, files in os.walk(os.getcwd()): - for extension in EXTENSIONS: - for f in fnmatch.filter(files, extension): - stats = os.stat(os.path.join(root, f)) - val += stats[stat.ST_SIZE] + stats[stat.ST_MTIME] - return val - - -if __name__ == '__main__': - val = 0 - try: - while True: - if check_sum() != val: - val = check_sum() - os.system('%s %s %s' % (EXECUTABLE, DEFAULTARGS, ' '.join(sys.argv[1:]))) - print(datetime.datetime.now().__str__()) - print('=' * 77) - time.sleep(1) - except KeyboardInterrupt: - print('Goodbye') diff --git a/gensim/parsing/__init__.py b/gensim/parsing/__init__.py deleted file mode 100644 index 5dcc010aec..0000000000 --- a/gensim/parsing/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -This package contains functions to preprocess raw text -""" - -# bring model classes directly into package namespace, to save some typing -from .porter import PorterStemmer # noqa:F401 -from .preprocessing import (remove_stopwords, strip_punctuation, strip_punctuation2, # noqa:F401 - strip_tags, strip_short, strip_numeric, - strip_non_alphanum, strip_multiple_whitespaces, - split_alphanum, stem_text, preprocess_string, - preprocess_documents, read_file, read_files) diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py deleted file mode 100644 index ab25361f60..0000000000 --- a/gensim/parsing/preprocessing.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -import re -import string -import glob - -from gensim import utils -from gensim.parsing.porter import PorterStemmer - - -# improved list from Stone, Denis, Kwantes (2010) -STOPWORDS = """ -a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be -became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can -cannot cant co computer con could couldnt cry de describe -detail did didn do does doesn doing don done down due during -each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen -fify fill find fire first five for former formerly forty found four from front full further get give go -had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie -if in inc indeed interest into is it its itself keep last latter latterly least less ltd -just -kg km -made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely -neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off -often on once one only onto or other others otherwise our ours ourselves out over own part per -perhaps please put rather re -quite -rather really regarding -same 
say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten -than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under -until up unless upon us used using -various very very via -was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you -your yours yourself yourselves -""" -STOPWORDS = frozenset(w for w in STOPWORDS.split() if w) - - -def remove_stopwords(s): - s = utils.to_unicode(s) - return " ".join(w for w in s.split() if w not in STOPWORDS) - - -RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE) - - -def strip_punctuation(s): - s = utils.to_unicode(s) - return RE_PUNCT.sub(" ", s) - - -# unicode.translate cannot delete characters like str can -strip_punctuation2 = strip_punctuation -# def strip_punctuation2(s): -# s = utils.to_unicode(s) -# return s.translate(None, string.punctuation) - - -RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE) - - -def strip_tags(s): - s = utils.to_unicode(s) - return RE_TAGS.sub("", s) - - -def strip_short(s, minsize=3): - s = utils.to_unicode(s) - return " ".join(e for e in s.split() if len(e) >= minsize) - - -RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE) - - -def strip_numeric(s): - s = utils.to_unicode(s) - return RE_NUMERIC.sub("", s) - - -RE_NONALPHA = re.compile(r"\W", re.UNICODE) - - -def strip_non_alphanum(s): - s = utils.to_unicode(s) - return RE_NONALPHA.sub(" ", s) - - -RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) - - -def strip_multiple_whitespaces(s): - s = utils.to_unicode(s) - return RE_WHITESPACE.sub(" ", s) - - -RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE) -RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE) - - -def split_alphanum(s): - s = utils.to_unicode(s) - s = RE_AL_NUM.sub(r"\1 \2", s) - return RE_NUM_AL.sub(r"\1 \2", s) - - -def stem_text(text): - """ - Return lowercase and (porter-)stemmed version of string `text`. 
- """ - text = utils.to_unicode(text) - p = PorterStemmer() - return ' '.join(p.stem(word) for word in text.split()) - - -stem = stem_text - -DEFAULT_FILTERS = [ - lambda x: x.lower(), strip_tags, strip_punctuation, - strip_multiple_whitespaces, strip_numeric, - remove_stopwords, strip_short, stem_text -] - - -def preprocess_string(s, filters=DEFAULT_FILTERS): - s = utils.to_unicode(s) - for f in filters: - s = f(s) - return s.split() - - -def preprocess_documents(docs): - return [preprocess_string(d) for d in docs] - - -def read_file(path): - with utils.smart_open(path) as fin: - return fin.read() - - -def read_files(pattern): - return [read_file(fname) for fname in glob.glob(pattern)] diff --git a/gensim/scripts/make_wiki.py b/gensim/scripts/make_wiki.py deleted file mode 120000 index 85ddf6cc4f..0000000000 --- a/gensim/scripts/make_wiki.py +++ /dev/null @@ -1 +0,0 @@ -make_wikicorpus.py \ No newline at end of file diff --git a/gensim/scripts/make_wiki.py b/gensim/scripts/make_wiki.py new file mode 100755 index 0000000000..55fdd3254c --- /dev/null +++ b/gensim/scripts/make_wiki.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Copyright (C) 2012 Lars Buitinck +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +import logging +import os +import sys +import argparse + +from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus +from gensim.models import TfidfModel + + +# Wiki is first scanned for all distinct word types (~7M). The types that +# appear in more than 10% of articles are removed and from the rest, the +# DEFAULT_DICT_SIZE most frequent types are kept. +DEFAULT_DICT_SIZE = 100000 + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + "Convert articles from a Wikipedia dump to (sparse) vectors", + formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument( + "input", help="bz2-compressed dump of Wikipedia articles, in XML format. " + "Typical filename is wiki--pages-articles.xml.bz2 or " + "wiki-latest-pages-articles.xml.bz2" + ) + parser.add_argument( + "output_prefix", + help="Output prefix.\n" + "This actually creates four files:\n" + "* `output_prefix_wordids.txt`: mapping between words and their integer ids\n" + "* `output_prefix_bow.mm`: bag-of-words (word counts) representation, in Matrix Matrix format\n" + "* `output_prefix_tfidf.mm`: TF-IDF representation\n" + "* `output_prefix.tfidf_model`: TF-IDF model dump\n" + ) + parser.add_argument( + "--online", help="Use HashDictionary instead of Dictionary (default: %(default)s)", + action='store_true', default=False + ) + parser.add_argument( + "--lemma", help="Use lemmatization (default: %(default)s)", action='store_true', default=False + ) + parser.add_argument( + "--nodebug", help="Set debug flag for HashDictionary (default: %(default)s)", + action="store_false", default=True + ) + parser.add_argument( + "--dict-size", help="Size of used dict, (default: %(default)s)", type=int, default=DEFAULT_DICT_SIZE + ) + + args = parser.parse_args() + + if not os.path.isdir(os.path.dirname(args.output_prefix)): + raise SystemExit("Error: The output directory does not exist. 
Create the directory and try again.") + + logger = logging.getLogger(os.path.basename(sys.argv[0])) + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') + logging.root.setLevel(level=logging.INFO) + logger.info("running %s", ' '.join(sys.argv)) + + if args.online: + dictionary = HashDictionary(id_range=args.dict_size, debug=args.nodebug) + dictionary.allow_update = True # start collecting document frequencies + wiki = WikiCorpus(args.input, lemmatize=args.lemma, dictionary=dictionary) + MmCorpus.serialize(args.output_prefix + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) + # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` + dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=args.dict_size) + dictionary.save_as_text(args.output_prefix + '_wordids.txt.bz2') + wiki.save(args.output_prefix + '_corpus.pkl.bz2') + dictionary.allow_update = False + else: + wiki = WikiCorpus(args.input, lemmatize=args.lemma) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) + # only keep the most frequent words (out of total ~8.2m unique tokens) + wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=args.dict_size) + # save dictionary and bag-of-words (term-document frequency matrix) + MmCorpus.serialize(args.output_prefix + '_bow.mm', wiki, progress_cnt=10000) # another ~9h + wiki.dictionary.save_as_text(args.output_prefix + '_wordids.txt.bz2') + # load back the id->word mapping directly from file + # this seems to save more memory, compared to keeping the wiki.dictionary object from above + dictionary = Dictionary.load_from_text(args.output_prefix + '_wordids.txt.bz2') + del wiki + + # initialize corpus reader and word->id mapping + mm = MmCorpus(args.output_prefix + '_bow.mm') + + # build tfidf, ~50min + tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) + tfidf.save(args.output_prefix + '.tfidf_model') + + # save tfidf vectors in matrix market format + # ~4h; result file is 15GB! bzip2'ed down to 4.5GB + MmCorpus.serialize(args.output_prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000) + + logger.info("finished running %s", ' '.join(sys.argv)) diff --git a/gensim/scripts/make_wiki_lemma.py b/gensim/scripts/make_wiki_lemma.py deleted file mode 120000 index 85ddf6cc4f..0000000000 --- a/gensim/scripts/make_wiki_lemma.py +++ /dev/null @@ -1 +0,0 @@ -make_wikicorpus.py \ No newline at end of file diff --git a/gensim/scripts/make_wiki_online.py b/gensim/scripts/make_wiki_online.py deleted file mode 100755 index 37c437f3e1..0000000000 --- a/gensim/scripts/make_wiki_online.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Copyright (C) 2012 Lars Buitinck -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] - -Convert articles from a Wikipedia dump to (sparse) vectors. The input is a -bz2-compressed dump of Wikipedia articles, in XML format. 
- -This actually creates three files: - -* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids -* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in - Matrix Matrix format -* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation -* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump - -The output Matrix Market files can then be compressed (e.g., by bzip2) to save -disk space; gensim's corpus iterators can work with compressed input, too. - -`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after -removing tokens that appear in more than 10%% of all documents). Defaults to -100,000. - -If you have the `pattern` package installed, this script will use a fancy -lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at https://github.com/clips/pattern . - -Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en -""" - - -import logging -import os.path -import sys - -from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus -from gensim.models import TfidfModel - - -# Wiki is first scanned for all distinct word types (~7M). The types that -# appear in more than 10% of articles are removed and from the rest, the -# DEFAULT_DICT_SIZE most frequent types are kept. -DEFAULT_DICT_SIZE = 100000 - - -if __name__ == '__main__': - program = os.path.basename(sys.argv[0]) - logger = logging.getLogger(program) - - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level=logging.INFO) - logger.info("running %s", ' '.join(sys.argv)) - - # check and process input arguments - if len(sys.argv) < 3: - print(globals()['__doc__'] % locals()) - sys.exit(1) - inp, outp = sys.argv[1:3] - - if not os.path.isdir(os.path.dirname(outp)): - raise SystemExit("Error: The output directory does not exist. 
Create the directory and try again.") - - if len(sys.argv) > 3: - keep_words = int(sys.argv[3]) - else: - keep_words = DEFAULT_DICT_SIZE - online = 'online' in program - lemmatize = 'lemma' in program - debug = 'nodebug' not in program - - if online: - dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies - wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) - # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` - dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - dictionary.save_as_text(outp + '_wordids.txt.bz2') - wiki.save(outp + '_corpus.pkl.bz2') - dictionary.allow_update = False - else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) - # only keep the most frequent words (out of total ~8.2m unique tokens) - wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h - wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') - # load back the id->word mapping directly from file - # this seems to save more memory, compared to keeping the wiki.dictionary object from above - dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') - del wiki - - # initialize corpus reader and word->id mapping - mm = MmCorpus(outp + '_bow.mm') - - # build tfidf, ~50min - tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) - tfidf.save(outp + '.tfidf_model') - - # save tfidf vectors in matrix market format - # ~4h; result file is 15GB! bzip2'ed down to 4.5GB - MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - - logger.info("finished running %s", program) diff --git a/gensim/scripts/make_wiki_online_lemma.py b/gensim/scripts/make_wiki_online_lemma.py deleted file mode 100755 index 37c437f3e1..0000000000 --- a/gensim/scripts/make_wiki_online_lemma.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Copyright (C) 2012 Lars Buitinck -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] - -Convert articles from a Wikipedia dump to (sparse) vectors. The input is a -bz2-compressed dump of Wikipedia articles, in XML format. - -This actually creates three files: - -* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids -* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in - Matrix Matrix format -* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation -* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump - -The output Matrix Market files can then be compressed (e.g., by bzip2) to save -disk space; gensim's corpus iterators can work with compressed input, too. - -`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after -removing tokens that appear in more than 10%% of all documents). Defaults to -100,000. - -If you have the `pattern` package installed, this script will use a fancy -lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at https://github.com/clips/pattern . 
- -Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en -""" - - -import logging -import os.path -import sys - -from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus -from gensim.models import TfidfModel - - -# Wiki is first scanned for all distinct word types (~7M). The types that -# appear in more than 10% of articles are removed and from the rest, the -# DEFAULT_DICT_SIZE most frequent types are kept. -DEFAULT_DICT_SIZE = 100000 - - -if __name__ == '__main__': - program = os.path.basename(sys.argv[0]) - logger = logging.getLogger(program) - - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level=logging.INFO) - logger.info("running %s", ' '.join(sys.argv)) - - # check and process input arguments - if len(sys.argv) < 3: - print(globals()['__doc__'] % locals()) - sys.exit(1) - inp, outp = sys.argv[1:3] - - if not os.path.isdir(os.path.dirname(outp)): - raise SystemExit("Error: The output directory does not exist. Create the directory and try again.") - - if len(sys.argv) > 3: - keep_words = int(sys.argv[3]) - else: - keep_words = DEFAULT_DICT_SIZE - online = 'online' in program - lemmatize = 'lemma' in program - debug = 'nodebug' not in program - - if online: - dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies - wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) - # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` - dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - dictionary.save_as_text(outp + '_wordids.txt.bz2') - wiki.save(outp + '_corpus.pkl.bz2') - dictionary.allow_update = False - else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) - # only keep the most frequent words (out of total ~8.2m unique tokens) - wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h - wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') - # load back the id->word mapping directly from file - # this seems to save more memory, compared to keeping the wiki.dictionary object from above - dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') - del wiki - - # initialize corpus reader and word->id mapping - mm = MmCorpus(outp + '_bow.mm') - - # build tfidf, ~50min - tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) - tfidf.save(outp + '.tfidf_model') - - # save tfidf vectors in matrix market format - # ~4h; result file is 15GB! 
bzip2'ed down to 4.5GB - MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - - logger.info("finished running %s", program) diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py deleted file mode 100755 index 37c437f3e1..0000000000 --- a/gensim/scripts/make_wiki_online_nodebug.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Copyright (C) 2012 Lars Buitinck -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] - -Convert articles from a Wikipedia dump to (sparse) vectors. The input is a -bz2-compressed dump of Wikipedia articles, in XML format. - -This actually creates three files: - -* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids -* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in - Matrix Matrix format -* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation -* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump - -The output Matrix Market files can then be compressed (e.g., by bzip2) to save -disk space; gensim's corpus iterators can work with compressed input, too. - -`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after -removing tokens that appear in more than 10%% of all documents). Defaults to -100,000. - -If you have the `pattern` package installed, this script will use a fancy -lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at https://github.com/clips/pattern . - -Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en -""" - - -import logging -import os.path -import sys - -from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus -from gensim.models import TfidfModel - - -# Wiki is first scanned for all distinct word types (~7M). The types that -# appear in more than 10% of articles are removed and from the rest, the -# DEFAULT_DICT_SIZE most frequent types are kept. -DEFAULT_DICT_SIZE = 100000 - - -if __name__ == '__main__': - program = os.path.basename(sys.argv[0]) - logger = logging.getLogger(program) - - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level=logging.INFO) - logger.info("running %s", ' '.join(sys.argv)) - - # check and process input arguments - if len(sys.argv) < 3: - print(globals()['__doc__'] % locals()) - sys.exit(1) - inp, outp = sys.argv[1:3] - - if not os.path.isdir(os.path.dirname(outp)): - raise SystemExit("Error: The output directory does not exist. 
Create the directory and try again.") - - if len(sys.argv) > 3: - keep_words = int(sys.argv[3]) - else: - keep_words = DEFAULT_DICT_SIZE - online = 'online' in program - lemmatize = 'lemma' in program - debug = 'nodebug' not in program - - if online: - dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies - wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) - # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` - dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - dictionary.save_as_text(outp + '_wordids.txt.bz2') - wiki.save(outp + '_corpus.pkl.bz2') - dictionary.allow_update = False - else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) - # only keep the most frequent words (out of total ~8.2m unique tokens) - wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h - wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') - # load back the id->word mapping directly from file - # this seems to save more memory, compared to keeping the wiki.dictionary object from above - dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') - del wiki - - # initialize corpus reader and word->id mapping - mm = MmCorpus(outp + '_bow.mm') - - # build tfidf, ~50min - tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) - tfidf.save(outp + '.tfidf_model') - - # save tfidf vectors in matrix market format - # ~4h; result file is 15GB! bzip2'ed down to 4.5GB - MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - - logger.info("finished running %s", program) diff --git a/gensim/scripts/make_wikicorpus.py b/gensim/scripts/make_wikicorpus.py deleted file mode 100755 index 37c437f3e1..0000000000 --- a/gensim/scripts/make_wikicorpus.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Copyright (C) 2012 Lars Buitinck -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] - -Convert articles from a Wikipedia dump to (sparse) vectors. The input is a -bz2-compressed dump of Wikipedia articles, in XML format. - -This actually creates three files: - -* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids -* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in - Matrix Matrix format -* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation -* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump - -The output Matrix Market files can then be compressed (e.g., by bzip2) to save -disk space; gensim's corpus iterators can work with compressed input, too. - -`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after -removing tokens that appear in more than 10%% of all documents). Defaults to -100,000. - -If you have the `pattern` package installed, this script will use a fancy -lemmatization to get a lemma of each token (instead of plain alphabetic -tokenizer). The package is available at https://github.com/clips/pattern . 
- -Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en -""" - - -import logging -import os.path -import sys - -from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus -from gensim.models import TfidfModel - - -# Wiki is first scanned for all distinct word types (~7M). The types that -# appear in more than 10% of articles are removed and from the rest, the -# DEFAULT_DICT_SIZE most frequent types are kept. -DEFAULT_DICT_SIZE = 100000 - - -if __name__ == '__main__': - program = os.path.basename(sys.argv[0]) - logger = logging.getLogger(program) - - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') - logging.root.setLevel(level=logging.INFO) - logger.info("running %s", ' '.join(sys.argv)) - - # check and process input arguments - if len(sys.argv) < 3: - print(globals()['__doc__'] % locals()) - sys.exit(1) - inp, outp = sys.argv[1:3] - - if not os.path.isdir(os.path.dirname(outp)): - raise SystemExit("Error: The output directory does not exist. Create the directory and try again.") - - if len(sys.argv) > 3: - keep_words = int(sys.argv[3]) - else: - keep_words = DEFAULT_DICT_SIZE - online = 'online' in program - lemmatize = 'lemma' in program - debug = 'nodebug' not in program - - if online: - dictionary = HashDictionary(id_range=keep_words, debug=debug) - dictionary.allow_update = True # start collecting document frequencies - wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012) - # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize` - dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - dictionary.save_as_text(outp + '_wordids.txt.bz2') - wiki.save(outp + '_corpus.pkl.bz2') - dictionary.allow_update = False - else: - wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) - # only keep the most frequent words (out of total ~8.2m unique tokens) - wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) - # save dictionary and bag-of-words (term-document frequency matrix) - MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h - wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') - # load back the id->word mapping directly from file - # this seems to save more memory, compared to keeping the wiki.dictionary object from above - dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') - del wiki - - # initialize corpus reader and word->id mapping - mm = MmCorpus(outp + '_bow.mm') - - # build tfidf, ~50min - tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) - tfidf.save(outp + '.tfidf_model') - - # save tfidf vectors in matrix market format - # ~4h; result file is 15GB! 
bzip2'ed down to 4.5GB - MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) - - logger.info("finished running %s", program) diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py deleted file mode 100644 index 878e588613..0000000000 --- a/gensim/scripts/word2vec_standalone.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW --cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER --min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE - -Trains a neural embedding model on text file CORPUS. -Parameters essentially reproduce those used by the original C tool -(see https://code.google.com/archive/p/word2vec/). - -Parameters for training: - -train - Use text data from to train the model - -output - Use to save the resulting word vectors / word clusters - -size - Set size of word vectors; default is 100 - -window - Set max skip length between words; default is 5 - -sample - Set threshold for occurrence of words. Those that appear with higher frequency in the training data - will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5) - -hs - Use Hierarchical Softmax; default is 0 (not used) - -negative - Number of negative examples; default is 5, common values are 3 - 10 (0 = not used) - -threads - Use threads (default 3) - -iter - Run more training iterations (default 5) - -min_count - This will discard words that appear less than times; default is 5 - -alpha - Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW - -binary - Save the resulting vectors in binary moded; default is 0 (off) - -cbow - Use the continuous bag of words model; default is 1 (use 0 for skip-gram model) - -accuracy - Compute accuracy of the resulting model analogical inference power on questions file - See an example of questions file at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt - -Example: python -m gensim.scripts.word2vec_standalone -train data.txt -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -""" - - -import logging -import os.path -import sys -import argparse -from numpy import seterr - -from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle - -logger = logging.getLogger(__name__) - - -if __name__ == "__main__": - logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO) - logger.info("running %s", " ".join(sys.argv)) - seterr(all='raise') # don't ignore numpy errors - - parser = argparse.ArgumentParser() - parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) - parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") - parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) - parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument("-sample", help="Set threshold for occurrence of words. 
Those that appear with higher frequency in the training data will be randomly down-sampled; " - "default is 1e-3, useful range is (0, 1e-5)", type=float, default=1e-3) - parser.add_argument("-hs", help="Use Hierarchical Softmax; default is 0 (not used)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", type=int, default=5) - parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3) - parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument("-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", type=int, default=5) - parser.add_argument("-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW", type=float) - parser.add_argument("-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", type=int, default=1, choices=[0, 1]) - parser.add_argument("-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", type=int, default=0, choices=[0, 1]) - parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") - - args = parser.parse_args() - - if args.cbow == 0: - skipgram = 1 - if not args.alpha: - args.alpha = 0.025 - else: - skipgram = 0 - if not args.alpha: - args.alpha = 0.05 - - corpus = LineSentence(args.train) - - model = Word2Vec( - corpus, size=args.size, min_count=args.min_count, workers=args.threads, - window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram, - hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter - ) - - if args.output: - outfile = args.output - model.wv.save_word2vec_format(outfile, binary=args.binary) - else: - outfile = args.train.split('.')[0] - model.save(outfile + '.model') - if args.binary == 1: - model.wv.save_word2vec_format(outfile + '.model.bin', binary=True) - else: - model.wv.save_word2vec_format(outfile + '.model.txt', binary=False) - - if args.accuracy: - questions_file = args.accuracy - model.accuracy(questions_file) - - logger.info("finished running %s", os.path.basename(sys.argv[0])) diff --git a/gensim/test/test_aggregation.py b/gensim/test/test_aggregation.py index 5f09c30ccd..575075dcb1 100644 --- a/gensim/test/test_aggregation.py +++ b/gensim/test/test_aggregation.py @@ -11,7 +11,7 @@ import logging import unittest -from gensim.topic_coherence import aggregation +from gensim.models._coherence import aggregation class TestAggregation(unittest.TestCase): diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py index 170aa7288a..c7349847e4 100644 --- a/gensim/test/test_direct_confirmation.py +++ b/gensim/test/test_direct_confirmation.py @@ -12,8 +12,7 @@ import unittest from collections import namedtuple -from gensim.topic_coherence import direct_confirmation_measure -from gensim.topic_coherence import text_analysis +from gensim.models._coherence import text_analysis, direct_confirmation_measure class TestDirectConfirmationMeasure(unittest.TestCase): diff --git a/gensim/test/test_indirect_confirmation.py b/gensim/test/test_indirect_confirmation.py index 007e35a50b..e0b96fd064 100644 --- a/gensim/test/test_indirect_confirmation.py +++ b/gensim/test/test_indirect_confirmation.py @@ -12,10 +12,8 @@ import unittest import numpy as np - from gensim.corpora.dictionary import Dictionary -from gensim.topic_coherence 
import indirect_confirmation_measure -from gensim.topic_coherence import text_analysis +from gensim.models._coherence import indirect_confirmation_measure, text_analysis class TestIndirectConfirmation(unittest.TestCase): diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py index 76bd448d5c..2e099c5267 100644 --- a/gensim/test/test_keywords.py +++ b/gensim/test/test_keywords.py @@ -12,12 +12,12 @@ """ -import os.path import logging +import os.path import unittest from gensim import utils -from gensim.summarization import keywords +from gensim.models.summarization import keywords class TestKeywordsTest(unittest.TestCase): diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py index 33cce71e52..b079c59d21 100644 --- a/gensim/test/test_lee.py +++ b/gensim/test/test_lee.py @@ -31,8 +31,7 @@ import numpy as np from gensim import corpora, models, utils, matutils -from gensim.parsing.preprocessing import preprocess_documents, preprocess_string, DEFAULT_FILTERS - +from gensim.utils.text_utils import DEFAULT_FILTERS, preprocess_string, preprocess_documents bg_corpus = None corpus = None diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py index 02ca13fb6b..4aaafb6bd6 100644 --- a/gensim/test/test_parsing.py +++ b/gensim/test/test_parsing.py @@ -8,8 +8,8 @@ import logging import unittest import numpy as np -from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation2, strip_tags, strip_short, strip_numeric, strip_non_alphanum, strip_multiple_whitespaces, split_alphanum, stem_text - +from gensim.utils.text_utils import remove_stopwords, strip_punctuation2, strip_tags, strip_short, strip_numeric, \ + strip_non_alphanum, strip_multiple_whitespaces, split_alphanum, stem_text # several documents doc1 = """C'est un trou de verdure où chante une rivière, diff --git a/gensim/test/test_probability_estimation.py b/gensim/test/test_probability_estimation.py index 1e674415f3..ea91e3bde9 100644 --- a/gensim/test/test_probability_estimation.py +++ b/gensim/test/test_probability_estimation.py @@ -13,7 +13,7 @@ from gensim.corpora.dictionary import Dictionary from gensim.corpora.hashdictionary import HashDictionary -from gensim.topic_coherence import probability_estimation +from gensim.models._coherence import probability_estimation class BaseTestCases(object): diff --git a/gensim/test/test_segmentation.py b/gensim/test/test_segmentation.py index 512121a055..6757ca1902 100644 --- a/gensim/test/test_segmentation.py +++ b/gensim/test/test_segmentation.py @@ -8,13 +8,11 @@ Automated tests for segmentation algorithms in the segmentation module. 
""" - import logging import unittest import numpy as np - -from gensim.topic_coherence import segmentation +from gensim.models._coherence import segmentation from numpy import array diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index b20a59e512..82fcca765c 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -12,13 +12,13 @@ """ -import os.path import logging +import os.path import unittest from gensim import utils from gensim.corpora import Dictionary -from gensim.summarization import summarize, summarize_corpus, keywords +from gensim.models.summarization import summarize, summarize_corpus, keywords class TestSummarizationTest(unittest.TestCase): diff --git a/gensim/test/test_text_analysis.py b/gensim/test/test_text_analysis.py index 93f00ae3a8..ccb1b691fd 100644 --- a/gensim/test/test_text_analysis.py +++ b/gensim/test/test_text_analysis.py @@ -2,7 +2,7 @@ import unittest from gensim.corpora.dictionary import Dictionary -from gensim.topic_coherence.text_analysis import ( +from gensim.models._coherence.text_analysis import ( InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator, CorpusAccumulator) diff --git a/gensim/utils/__init__.py b/gensim/utils/__init__.py new file mode 100644 index 0000000000..ec6fe4b917 --- /dev/null +++ b/gensim/utils/__init__.py @@ -0,0 +1,2 @@ +from .utils import * # noqa:F401 +from . import text_utils # noqa:F401 diff --git a/gensim/parsing/porter.py b/gensim/utils/text_utils.py similarity index 66% rename from gensim/parsing/porter.py rename to gensim/utils/text_utils.py index 048e056418..ff35d4e84e 100644 --- a/gensim/parsing/porter.py +++ b/gensim/utils/text_utils.py @@ -1,37 +1,13 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Porter Stemming Algorithm -This is the Porter stemming algorithm, ported to Python from the -version coded up in ANSI C by the author. It may be be regarded -as canonical, in that it follows the algorithm presented in - -Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, -no. 3, pp 130-137, - -only differing from it at the points maked --DEPARTURE-- below. - -See also http://www.tartarus.org/~martin/PorterStemmer - -The algorithm as described in the paper could be exactly replicated -by adjusting the points of DEPARTURE, but this is barely necessary, -because (a) the points of DEPARTURE are definitely improvements, and -(b) no encoding of the Porter stemmer I have seen is anything like -as exact as this version, even with the points of DEPARTURE! - -Vivake Gupta (v@nano.com) - -Release 1: January 2001 - -Further adjustments by Santiago Bruno (bananabruno@gmail.com) -to allow word input not restricted to one word per line, leading -to: - -Release 2: July 2008 - -Optimizations and cleanup of the code by Lars Buitinck, July 2012. 
-""" - +import re +import string +from gensim.utils import utils from six.moves import xrange @@ -369,12 +345,122 @@ def stem_documents(self, docs): return [self.stem_sentence(x) for x in docs] -if __name__ == '__main__': - import sys +STOPWORDS = frozenset({ + 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', + 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', + 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', + 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', + 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', + 'computer', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', + 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', + 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', + 'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', + 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', + 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', + 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'just', + 'keep', 'kg', 'km', 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'make', 'many', 'may', 'me', + 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', + 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', + 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', + 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', + 'please', 'put', 'quite', 'rather', 're', 'really', 'regarding', 'same', 'say', 'see', 'seem', 'seemed', + 'seeming', 'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', + 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', + 'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', + 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', + 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', + 'twelve', 'twenty', 'two', 'un', 'under', 'unless', 'until', 'up', 'upon', 'us', 'used', 'using', 'various', + 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', + 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', + 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', + 'your', 'yours', 'yourself', 'yourselves' +}) + + +def remove_stopwords(s): + s = utils.to_unicode(s) + return " ".join(w for w in s.split() if w not in STOPWORDS) + + +RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE) + + 
+def strip_punctuation(s):
+    s = utils.to_unicode(s)
+    return RE_PUNCT.sub(" ", s)
+
+
+strip_punctuation2 = strip_punctuation
+RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE)
+
+
+def strip_tags(s):
+    s = utils.to_unicode(s)
+    return RE_TAGS.sub("", s)
+
+
+def strip_short(s, minsize=3):
+    s = utils.to_unicode(s)
+    return " ".join(e for e in s.split() if len(e) >= minsize)
+
+RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
+
+
+def strip_numeric(s):
+    s = utils.to_unicode(s)
+    return RE_NUMERIC.sub("", s)
+
+
+RE_NONALPHA = re.compile(r"\W", re.UNICODE)
+
+
+def strip_non_alphanum(s):
+    s = utils.to_unicode(s)
+    return RE_NONALPHA.sub(" ", s)
+
+
+RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)
+
+
+def strip_multiple_whitespaces(s):
+    s = utils.to_unicode(s)
+    return RE_WHITESPACE.sub(" ", s)
+
+
+RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE)
+RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE)
+
+
+def split_alphanum(s):
+    s = utils.to_unicode(s)
+    s = RE_AL_NUM.sub(r"\1 \2", s)
+    return RE_NUM_AL.sub(r"\1 \2", s)
+
+
+def stem_text(text):
+    """
+    Return lowercase and (porter-)stemmed version of string `text`.
+    """
+    text = utils.to_unicode(text)
     p = PorterStemmer()
+    return ' '.join(p.stem(word) for word in text.split())
+
+
+stem = stem_text
+DEFAULT_FILTERS = [
+    lambda x: x.lower(), strip_tags, strip_punctuation,
+    strip_multiple_whitespaces, strip_numeric,
+    remove_stopwords, strip_short, stem_text
+]
+
+
+def preprocess_string(s, filters=DEFAULT_FILTERS):
+    s = utils.to_unicode(s)
+    for f in filters:
+        s = f(s)
+    return s.split()
+
-    for f in sys.argv[1:]:
-        with open(f) as infile:
-            for line in infile:
-                print(p.stem_sentence(line))
+def preprocess_documents(docs):
+    return [preprocess_string(d) for d in docs]
diff --git a/gensim/utils.py b/gensim/utils/utils.py
similarity index 100%
rename from gensim/utils.py
rename to gensim/utils/utils.py
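
Note (not part of the patch): the snippet below is a minimal usage sketch of the relocated helpers, assuming only what this diff introduces -- `gensim.utils.text_utils` exposing `remove_stopwords`, `preprocess_string` and `DEFAULT_FILTERS`, with the coherence and summarization code moved under `gensim.models`. The sample sentence and variable names are made up for illustration.

    # Illustrative only: these import paths exist on this branch, not in released gensim.
    from gensim.utils.text_utils import remove_stopwords, preprocess_string, DEFAULT_FILTERS

    raw = "the quick brown foxes were running over some lazy dogs"

    # Drop words from the STOPWORDS frozenset defined in the patch above.
    print(remove_stopwords(raw))    # -> "quick brown foxes running lazy dogs"

    # Full pipeline: lowercase, strip tags/punctuation/numbers, remove stop words,
    # drop short tokens, then Porter-stem (the DEFAULT_FILTERS chain above).
    print(preprocess_string(raw))   # -> stemmed tokens, e.g. ['quick', 'brown', 'fox', ...]

    # A custom filter list can replace DEFAULT_FILTERS, e.g. skipping the final stemmer.
    no_stem = DEFAULT_FILTERS[:-1]
    print(preprocess_string(raw, filters=no_stem))

    # The relocated coherence and summarization modules follow the same pattern:
    # from gensim.models._coherence import segmentation, probability_estimation
    # from gensim.models.summarization import summarize, keywords

Callers that previously imported from `gensim.parsing.preprocessing`, `gensim.topic_coherence` or `gensim.summarization` would need the same one-line import changes shown in the test-file hunks above.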