From 40989bac545ee5ad2c1444fab06bff09a33df0f3 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Mon, 29 Apr 2019 20:29:50 -0300 Subject: [PATCH 01/16] adding type check for corpus_file argument --- gensim/models/doc2vec.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 9812dc5ef4..6a2f553626 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -794,6 +794,11 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor """ kwargs = {} + + # Check the type of corpus_file + if not isinstance(corpus_file, string_types): + raise TypeError("Parameter corpus_file of train() must be a string (path to a file).") + if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) offsets, start_doctags = self._get_offsets_and_start_doctags_for_corpusfile(corpus_file, self.workers) From ad76b83653e23d760e378d471e5fc1f32e55418a Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Tue, 30 Apr 2019 20:33:48 -0300 Subject: [PATCH 02/16] fixes to handle different typeerror in train parameters, adding unittests --- gensim/test/test_doc2vec.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index d9af7070d3..279f7b0ef4 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -173,6 +173,23 @@ def testLoadOldModel(self): sims_to_infer = loaded_model.docvecs.most_similar([doc0_inferred], topn=len(loaded_model.docvecs)) self.assertTrue(sims_to_infer) + def testDoc2vecTrainParameters(self): + + model = doc2vec.Doc2Vec(vector_size=50) + model.build_vocab(documents=list_corpus) + + # check if corpus_file is not a string + self.assertRaises(TypeError, model.train, corpus_file=11111) + # check if documents is an iterable (but not string) + self.assertRaises(TypeError, model.train, documents="blabla") 
+ # check is both the parameters are provided + self.assertRaises(TypeError, model.train, documents=sentences, corpus_file='test') + # check if both the parameters are left empty + self.assertRaises(TypeError, model.train, documents=None, corpus_file=None) + # check if corpus_file is an iterable + self.assertRaises(TypeError, model.train, corpus_file=sentences) + + @unittest.skipIf(os.name == 'nt', "See another test for Windows below") def test_get_offsets_and_start_doctags(self): # Each line takes 6 bytes (including '\n' character) @@ -387,7 +404,7 @@ def model_sanity(self, model, keep_training=True): tmpf = get_tmpfile('gensim_doc2vec.tst') model.save(tmpf) loaded = doc2vec.Doc2Vec.load(tmpf) - loaded.train(sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs) + loaded.train(documents=sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs) def test_training(self): """Test doc2vec training.""" From e1f32a2accdb49fa12d0fef62f34bba2e74b3cfc Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Tue, 30 Apr 2019 20:41:48 -0300 Subject: [PATCH 03/16] adding doc2vec with more typeerror checks --- gensim/models/doc2vec.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 6a2f553626..7766d450ba 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -70,7 +70,7 @@ except ImportError: from Queue import Queue # noqa:F401 -from collections import namedtuple, defaultdict +from collections import namedtuple, defaultdict, Iterable from timeit import default_timer from numpy import zeros, float32 as REAL, empty, ones, \ @@ -795,9 +795,21 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor """ kwargs = {} + # Check if both documents and corpus_file are None + if corpus_file is None and documents is None: + raise TypeError("Either one of corpus_file or documents value must be provided.") + + # Check if both 
documents and corpus_file are not None + if not corpus_file is None and not documents is None: + raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") + + # Check if documents is not None and iterable but not string type + if not documents is None and isinstance(documents, Iterable) and isinstance(documents, string_types): + raise TypeError("Documents must be an iterable of list and not a string type.") + # Check the type of corpus_file - if not isinstance(corpus_file, string_types): - raise TypeError("Parameter corpus_file of train() must be a string (path to a file).") + if corpus_file is not None and not isinstance(corpus_file, string_types): + raise TypeError(f"Parameter corpus_file of train() must be a string (path to a file) got {corpus_file} instead.") if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) From a9eeddfd579e83b92f2d7a88101500ff1c4269d3 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Tue, 30 Apr 2019 20:53:15 -0300 Subject: [PATCH 04/16] fixing lint errors --- gensim/models/doc2vec.py | 7 ++++--- gensim/test/test_doc2vec.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 7766d450ba..61521c98ff 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -800,16 +800,17 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor raise TypeError("Either one of corpus_file or documents value must be provided.") # Check if both documents and corpus_file are not None - if not corpus_file is None and not documents is None: + if corpus_file is not None and documents is not None: raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") # Check if documents is not None and iterable but not string type - if not documents is None and isinstance(documents, Iterable) and 
isinstance(documents, string_types): + if documents is not None and isinstance(documents, Iterable) and isinstance(documents, string_types): raise TypeError("Documents must be an iterable of list and not a string type.") # Check the type of corpus_file if corpus_file is not None and not isinstance(corpus_file, string_types): - raise TypeError(f"Parameter corpus_file of train() must be a string (path to a file) got {corpus_file} instead.") + raise TypeError(f"""Parameter corpus_file of train() must be a + string (path to a file) got {corpus_file} instead.""") if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 279f7b0ef4..7301f808cc 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -189,7 +189,6 @@ def testDoc2vecTrainParameters(self): # check if corpus_file is an iterable self.assertRaises(TypeError, model.train, corpus_file=sentences) - @unittest.skipIf(os.name == 'nt', "See another test for Windows below") def test_get_offsets_and_start_doctags(self): # Each line takes 6 bytes (including '\n' character) From 6639089fdc26e94751b133a365271e344e31c055 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Tue, 30 Apr 2019 21:21:58 -0300 Subject: [PATCH 05/16] removing f-string use --- gensim/models/doc2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 61521c98ff..bebf261fa8 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -809,8 +809,8 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor # Check the type of corpus_file if corpus_file is not None and not isinstance(corpus_file, string_types): - raise TypeError(f"""Parameter corpus_file of train() must be a - string (path to a file) got {corpus_file} instead.""") + raise TypeError("""Parameter 
corpus_file of train() must be a + string (path to an existing file) got %s instead.""" % corpus_file) if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) From 2ca51ca4a18c7811f9e32115e2649d5906d63916 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 1 May 2019 08:07:24 -0300 Subject: [PATCH 06/16] fixes as suggested --- gensim/models/doc2vec.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index bebf261fa8..90197c9cf1 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -803,14 +803,13 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor if corpus_file is not None and documents is not None: raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") - # Check if documents is not None and iterable but not string type - if documents is not None and isinstance(documents, Iterable) and isinstance(documents, string_types): + # Check if documents is not None and not string type either + if documents is not None and isinstance(documents, string_types): raise TypeError("Documents must be an iterable of list and not a string type.") # Check the type of corpus_file - if corpus_file is not None and not isinstance(corpus_file, string_types): - raise TypeError("""Parameter corpus_file of train() must be a - string (path to an existing file) got %s instead.""" % corpus_file) + if not isinstance(corpus_file, string_types) and documents is None: + raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." 
% corpus_file) if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) From 6458aea3a1dfc8d8c400ccdc7543434dfaf9f542 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 1 May 2019 08:51:05 -0300 Subject: [PATCH 07/16] remove unused imports --- gensim/models/doc2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 90197c9cf1..acff036577 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -70,7 +70,7 @@ except ImportError: from Queue import Queue # noqa:F401 -from collections import namedtuple, defaultdict, Iterable +from collections import namedtuple, defaultdict from timeit import default_timer from numpy import zeros, float32 as REAL, empty, ones, \ From bc9dce66119d70efa372484835556ae4bc646bb2 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 1 May 2019 21:36:20 -0300 Subject: [PATCH 08/16] using xor as suggested --- gensim/models/doc2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index acff036577..b82da0afe2 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -800,12 +800,12 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor raise TypeError("Either one of corpus_file or documents value must be provided.") # Check if both documents and corpus_file are not None - if corpus_file is not None and documents is not None: + if not ((corpus_file is None) ^ (documents is None)): raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") # Check if documents is not None and not string type either if documents is not None and isinstance(documents, string_types): - raise TypeError("Documents must be an iterable of list and not a string type.") + raise TypeError("documents must be an iterable of list and not a 
string type.") # Check the type of corpus_file if not isinstance(corpus_file, string_types) and documents is None: From 97d56197833a428b99973ecfb24f457862c4f75d Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 1 May 2019 22:22:55 -0300 Subject: [PATCH 09/16] minor fixes --- gensim/models/doc2vec.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index b82da0afe2..961a0b661b 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -70,7 +70,7 @@ except ImportError: from Queue import Queue # noqa:F401 -from collections import namedtuple, defaultdict +from collections import namedtuple, defaultdict, Iterable from timeit import default_timer from numpy import zeros, float32 as REAL, empty, ones, \ @@ -803,14 +803,14 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor if not ((corpus_file is None) ^ (documents is None)): raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") - # Check if documents is not None and not string type either - if documents is not None and isinstance(documents, string_types): - raise TypeError("documents must be an iterable of list and not a string type.") - - # Check the type of corpus_file + # Check if corpus_file is string type if not isinstance(corpus_file, string_types) and documents is None: raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." 
% corpus_file) + # Check if documents is iterable and not string type + if isinstance(documents, Iterable) and isinstance(documents, string_types): + raise TypeError("documents must be an iterable of list and not a string type.") + if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) offsets, start_doctags = self._get_offsets_and_start_doctags_for_corpusfile(corpus_file, self.workers) From cdaff9ff3985d497f540ea38d332dbbac1db8044 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 1 May 2019 22:33:06 -0300 Subject: [PATCH 10/16] only check for iterable --- gensim/models/doc2vec.py | 6 +++--- gensim/test/test_doc2vec.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 961a0b661b..84959a59a3 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -807,9 +807,9 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor if not isinstance(corpus_file, string_types) and documents is None: raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." % corpus_file) - # Check if documents is iterable and not string type - if isinstance(documents, Iterable) and isinstance(documents, string_types): - raise TypeError("documents must be an iterable of list and not a string type.") + # Check if documents is iterable + if not isinstance(documents, Iterable): + raise TypeError("documents must be an iterable of list, got %s instead." 
% documents) if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 7301f808cc..60e59e278f 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -180,8 +180,8 @@ def testDoc2vecTrainParameters(self): # check if corpus_file is not a string self.assertRaises(TypeError, model.train, corpus_file=11111) - # check if documents is an iterable (but not string) - self.assertRaises(TypeError, model.train, documents="blabla") + # check if documents is an iterable + self.assertRaises(TypeError, model.train, documents=11111) # check is both the parameters are provided self.assertRaises(TypeError, model.train, documents=sentences, corpus_file='test') # check if both the parameters are left empty From b04629229c8d975ae6cb63b249b0c577c93f1169 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Wed, 1 May 2019 23:42:06 -0300 Subject: [PATCH 11/16] minor fix - 2 --- gensim/models/doc2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 84959a59a3..4f68bbb6a7 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -804,11 +804,11 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") # Check if corpus_file is string type - if not isinstance(corpus_file, string_types) and documents is None: + if documents is None and not isinstance(corpus_file, string_types): raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." % corpus_file) # Check if documents is iterable - if not isinstance(documents, Iterable): + if documents is not None and not isinstance(documents, Iterable): raise TypeError("documents must be an iterable of list, got %s instead." 
% documents) if corpus_file is not None: From 298e8f0ea9e915fed80a8adcf631f86fa887e056 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Thu, 2 May 2019 09:38:41 -0300 Subject: [PATCH 12/16] checking corpus_file path, removing xor --- gensim/models/doc2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 4f68bbb6a7..5a901b5962 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -800,11 +800,11 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor raise TypeError("Either one of corpus_file or documents value must be provided.") # Check if both documents and corpus_file are not None - if not ((corpus_file is None) ^ (documents is None)): + if corpus_file is not None and documents is not None: raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") # Check if corpus_file is string type - if documents is None and not isinstance(corpus_file, string_types): + if documents is None and not os.path.isfile(corpus_file): raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." 
% corpus_file) # Check if documents is iterable From 4fb01f420cda7b12af4fb7bf046fbe23fca3e1d6 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Sat, 4 May 2019 07:48:11 -0300 Subject: [PATCH 13/16] fixing nitpicks --- gensim/models/doc2vec.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 5a901b5962..99954bba93 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -797,19 +797,19 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor # Check if both documents and corpus_file are None if corpus_file is None and documents is None: - raise TypeError("Either one of corpus_file or documents value must be provided.") + raise TypeError("Either one of corpus_file or documents value must be provided") # Check if both documents and corpus_file are not None if corpus_file is not None and documents is not None: - raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.") + raise TypeError("Both corpus_file and documents must not be provided at the same time") # Check if corpus_file is string type if documents is None and not os.path.isfile(corpus_file): - raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." % corpus_file) + raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) # Check if documents is iterable if documents is not None and not isinstance(documents, Iterable): - raise TypeError("documents must be an iterable of list, got %s instead." 
% documents) + raise TypeError("documents must be an iterable of list, got %r instead" % documents) if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) From 07a7d5d3a0b28363379d095e6fbc26d9ec3edd54 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Sat, 4 May 2019 14:36:29 -0300 Subject: [PATCH 14/16] parameters check in fasttext module --- gensim/models/fasttext.py | 19 +++++++++++++++++++ gensim/test/test_fasttext.py | 16 ++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 8978d3d2ff..53a7c96162 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -280,10 +280,12 @@ """ import logging +import os import numpy as np from numpy import ones, vstack, float32 as REAL, sum as np_sum import six +from collections import Iterable import gensim.models._fasttext_bin @@ -901,6 +903,23 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) """ + + # Check if both sentences and corpus_file are None + if corpus_file is None and sentences is None: + raise TypeError("Either one of corpus_file or sentences value must be provided") + + # Check if both sentences and corpus_file are not None + if corpus_file is not None and sentences is not None: + raise TypeError("Both corpus_file and sentences must not be provided at the same time") + + # Check if corpus_file is string type + if sentences is None and not os.path.isfile(corpus_file): + raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) + + # Check if sentences is iterable + if sentences is not None and not isinstance(sentences, Iterable): + raise TypeError("sentences must be an iterable of list, got %r instead" % sentences) + super(FastText, self).train( sentences=sentences, corpus_file=corpus_file, 
total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 50cacaefdf..d6cb7e51b9 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -95,6 +95,22 @@ def test_training(self): oov_vec = model.wv['minor'] # oov word self.assertEqual(len(oov_vec), 10) + def testFastTextTrainParameters(self): + + model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) + model.build_vocab(sentences=sentences) + + # check if corpus_file is not a string + self.assertRaises(TypeError, model.train, corpus_file=11111) + # check if sentences is an iterable + self.assertRaises(TypeError, model.train, sentences=11111) + # check is both the parameters are provided + self.assertRaises(TypeError, model.train, sentences=sentences, corpus_file='test') + # check if both the parameters are left empty + self.assertRaises(TypeError, model.train, sentences=None, corpus_file=None) + # check if corpus_file is an iterable + self.assertRaises(TypeError, model.train, corpus_file=sentences) + @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: From 8821b92c27a9274ce4530d2e87d9aab076174ab4 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Sat, 4 May 2019 14:53:04 -0300 Subject: [PATCH 15/16] extra space fix --- gensim/test/test_fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index d6cb7e51b9..1e200ffdc7 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -97,7 +97,7 @@ def test_training(self): def testFastTextTrainParameters(self): - model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) + model = 
FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(sentences=sentences) # check if corpus_file is not a string From bfc43605147ca29203a1aba58f690f70c70281c9 Mon Sep 17 00:00:00 2001 From: manish saraswat Date: Sat, 4 May 2019 23:25:54 -0300 Subject: [PATCH 16/16] remove comments --- gensim/models/doc2vec.py | 4 ---- gensim/models/fasttext.py | 4 ---- gensim/test/test_doc2vec.py | 5 ----- gensim/test/test_fasttext.py | 5 ----- 4 files changed, 18 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 99954bba93..3e690380f9 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -795,19 +795,15 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor """ kwargs = {} - # Check if both documents and corpus_file are None if corpus_file is None and documents is None: raise TypeError("Either one of corpus_file or documents value must be provided") - # Check if both documents and corpus_file are not None if corpus_file is not None and documents is not None: raise TypeError("Both corpus_file and documents must not be provided at the same time") - # Check if corpus_file is string type if documents is None and not os.path.isfile(corpus_file): raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) - # Check if documents is iterable if documents is not None and not isinstance(documents, Iterable): raise TypeError("documents must be an iterable of list, got %r instead" % documents) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 53a7c96162..9dce98083e 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -904,19 +904,15 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor """ - # Check if both sentences and corpus_file are None if corpus_file is None and sentences is None: raise TypeError("Either one of corpus_file or sentences value must be 
provided") - # Check if both sentences and corpus_file are not None if corpus_file is not None and sentences is not None: raise TypeError("Both corpus_file and sentences must not be provided at the same time") - # Check if corpus_file is string type if sentences is None and not os.path.isfile(corpus_file): raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) - # Check if sentences is iterable if sentences is not None and not isinstance(sentences, Iterable): raise TypeError("sentences must be an iterable of list, got %r instead" % sentences) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 60e59e278f..6ac510ea8e 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -178,15 +178,10 @@ def testDoc2vecTrainParameters(self): model = doc2vec.Doc2Vec(vector_size=50) model.build_vocab(documents=list_corpus) - # check if corpus_file is not a string self.assertRaises(TypeError, model.train, corpus_file=11111) - # check if documents is an iterable self.assertRaises(TypeError, model.train, documents=11111) - # check is both the parameters are provided self.assertRaises(TypeError, model.train, documents=sentences, corpus_file='test') - # check if both the parameters are left empty self.assertRaises(TypeError, model.train, documents=None, corpus_file=None) - # check if corpus_file is an iterable self.assertRaises(TypeError, model.train, corpus_file=sentences) @unittest.skipIf(os.name == 'nt', "See another test for Windows below") diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 1e200ffdc7..f2c8f39c5c 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -100,15 +100,10 @@ def testFastTextTrainParameters(self): model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(sentences=sentences) - # check if corpus_file is not a string self.assertRaises(TypeError, model.train, corpus_file=11111) 
- # check if sentences is an iterable self.assertRaises(TypeError, model.train, sentences=11111) - # check is both the parameters are provided self.assertRaises(TypeError, model.train, sentences=sentences, corpus_file='test') - # check if both the parameters are left empty self.assertRaises(TypeError, model.train, sentences=None, corpus_file=None) - # check if corpus_file is an iterable self.assertRaises(TypeError, model.train, corpus_file=sentences) @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27")