Skip to content

Commit

Permalink
Fixes piskvorky#1869: MmCorpus file-like object behaviour fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
sj29-innovate committed Feb 16, 2018
1 parent 33a88f4 commit 4c6c5a2
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 18 deletions.
51 changes: 33 additions & 18 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,25 +1272,40 @@ def __init__(self, input, transposed=True):
"""
logger.info("initializing corpus reader from %s", input)
self.input, self.transposed = input, transposed
with utils.file_or_filename(self.input) as lines:

# 'with' statement behaviour without closing the file object
mgr = (utils.file_or_filename(self.input))
exit = type(mgr).__exit__
value = type(mgr).__enter__(mgr)
exc = True
try:
try:
header = utils.to_unicode(next(lines)).strip()
if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
raise ValueError(
"File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
(self.input, header)
)
except StopIteration:
pass

self.num_docs = self.num_terms = self.num_nnz = 0
for lineno, line in enumerate(lines):
line = utils.to_unicode(line)
if not line.startswith('%'):
self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
if not self.transposed:
self.num_docs, self.num_terms = self.num_terms, self.num_docs
break
lines = value
try:
header = utils.to_unicode(next(lines)).strip()
if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
raise ValueError(
"File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
(self.input, header)
)
except StopIteration:
pass

self.num_docs = self.num_terms = self.num_nnz = 0
for lineno, line in enumerate(lines):
line = utils.to_unicode(line)
if not line.startswith('%'):
self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
if not self.transposed:
self.num_docs, self.num_terms = self.num_terms, self.num_docs
break
except:
exc = False
if not exit(mgr, *sys.exc_info()):
raise
finally:
if isinstance(self.input, string_types):
exit(mgr, None, None, None)

logger.info(
"accepted corpus with %i documents, %i features, %i non-zero entries",
Expand Down
8 changes: 8 additions & 0 deletions gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,14 @@ def setUp(self):
def test_serialize_compressed(self):
# MmCorpus needs file write with seek => doesn't support compressed output (only input)
pass

def test_closed_file_object(self):
    """Check that MmCorpus leaves a caller-supplied file object open.

    The corpus is built from an already-open file handle; the handle must
    still be open once the constructor returns.
    """
    # The `with` block releases the handle even when an assertion fails,
    # so the test itself does not leak an open file.
    with open(datapath('testcorpus.mm')) as file_obj:
        self.assertFalse(file_obj.closed)
        # Only the constructor's effect on `file_obj` is under test, so the
        # resulting corpus object is deliberately discarded.
        mmcorpus.MmCorpus(file_obj)
        self.assertFalse(file_obj.closed)

def test_load(self):
self.assertEqual(self.corpus.num_docs, 9)
Expand Down

0 comments on commit 4c6c5a2

Please sign in to comment.