diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index f0d015bd4b..b43d9d4a30 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -108,7 +108,9 @@ def segment_and_write_all_articles(file_path, output_file, min_article_character Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None. """ - if output_file is not None: + if output_file is None: + outfile = getattr(sys.stdout, 'buffer', sys.stdout) # we want write bytes, so for py3 we used 'buffer' + else: outfile = smart_open(output_file, 'wb') try: @@ -120,10 +122,7 @@ def segment_and_write_all_articles(file_path, output_file, min_article_character output_data["section_texts"].append(section_content) if (idx + 1) % 100000 == 0: logger.info("processed #%d articles (at %r now)", idx + 1, article_title) - if output_file is None: - sys.stdout.write(json.dumps(output_data) + "\n") - else: - outfile.write((json.dumps(output_data) + "\n").encode()) + outfile.write((json.dumps(output_data) + "\n").encode('utf-8')) finally: if output_file is not None: outfile.close()