Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added encoding parameter to TextDirectoryCorpus #3317

Merged
merged 1 commit into from
Apr 15, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions gensim/corpora/textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@

"""


from __future__ import with_statement

import logging
Expand All @@ -50,6 +49,8 @@
)
from gensim.utils import deaccent, simple_tokenize

from smart_open import open

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -399,7 +400,7 @@ class TextDirectoryCorpus(TextCorpus):
"""

def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None,
pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs):
pattern=None, exclude_pattern=None, lines_are_documents=False, encoding='utf-8', **kwargs):
"""

Parameters
Expand All @@ -423,6 +424,8 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept
Regex to use for file name exclusion, all files matching this pattern will be ignored.
lines_are_documents : bool, optional
If True - each line is considered a document, otherwise - each file is one document.
encoding : str, optional
Encoding used to read the specified file or files in the specified directory.
kwargs: keyword arguments passed through to the `TextCorpus` constructor.
See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these.

Expand All @@ -432,6 +435,7 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept
self.pattern = pattern
self.exclude_pattern = exclude_pattern
self.lines_are_documents = lines_are_documents
self.encoding = encoding
super(TextDirectoryCorpus, self).__init__(input, dictionary, metadata, **kwargs)

@property
Expand Down Expand Up @@ -510,7 +514,7 @@ def getstream(self):
"""
num_texts = 0
for path in self.iter_filepaths():
with open(path, 'rt') as f:
with open(path, 'rt', encoding=self.encoding) as f:
if self.lines_are_documents:
for line in f:
yield line.strip()
Expand Down