From b3fc152117757fe7a3a6215776000bb909ec5067 Mon Sep 17 00:00:00 2001 From: samyak jain Date: Fri, 16 Feb 2018 21:33:54 +0530 Subject: [PATCH] Fixes #1869 , Mmcorpus file-like object behaviour fixed --- gensim/matutils.py | 67 +++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/gensim/matutils.py b/gensim/matutils.py index 665f5d73a7..22299d0cc5 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -25,7 +25,7 @@ from six import iteritems, itervalues, string_types from six.moves import xrange, zip as izip - +from contextlib import contextmanager logger = logging.getLogger(__name__) @@ -1328,10 +1328,23 @@ class MmReader(object): This allows us to process corpora which are larger than the available RAM. """ + @contextmanager + def open_file(self, input): + #Generates 'with' like behaviour excepting closing the file object + mgr = utils.file_or_filename(self.input) + exc = False + try: + yield mgr + except StandardError: + exc = True + if not exit(mgr, *sys.exc_info()): + raise + finally: + if not exc and isinstance(self.input, string_types): + exit(mgr, None, None, None) def __init__(self, input, transposed=True): """ - Parameters ---------- input : {str, file-like object} @@ -1344,39 +1357,25 @@ def __init__(self, input, transposed=True): logger.info("initializing corpus reader from %s", input) self.input, self.transposed = input, transposed - # 'with' statement behaviour without closing the file object - mgr = (utils.file_or_filename(self.input)) - exit = type(mgr).__exit__ - value = type(mgr).__enter__(mgr) - exc = True - try: + with self.open_file(self.input) as lines: try: - lines = value - try: - header = utils.to_unicode(next(lines)).strip() - if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): - raise ValueError( - "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % - (self.input, header) - ) - except StopIteration: - pass - - self.num_docs = self.num_terms = self.num_nnz = 0 - for lineno, line in enumerate(lines): - line = utils.to_unicode(line) - if not line.startswith('%'): - self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) - if not self.transposed: - self.num_docs, self.num_terms = self.num_terms, self.num_docs - break - except RuntimeError: - exc = False - if not exit(mgr, *sys.exc_info()): - raise - finally: - if exc and isinstance(self.input, string_types): - exit(mgr, None, None, None) + header = utils.to_unicode(next(lines)).strip() + if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): + raise ValueError( + "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % + (self.input, header) + ) + except StopIteration: + pass + + self.num_docs = self.num_terms = self.num_nnz = 0 + for lineno, line in enumerate(lines): + line = utils.to_unicode(line) + if not line.startswith('%'): + self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) + if not self.transposed: + self.num_docs, self.num_terms = self.num_terms, self.num_docs + break logger.info( "accepted corpus with %i documents, %i features, %i non-zero entries",