diff --git a/obonet/io.py b/obonet/io.py index 2651f81..7112377 100644 --- a/obonet/io.py +++ b/obonet/io.py @@ -1,5 +1,6 @@ import importlib import io +import logging import mimetypes import re from urllib.request import urlopen @@ -29,6 +30,7 @@ def open_read_file(path): content = response.read() if opener == io.open: encoding = response.headers.get_content_charset(failobj="utf-8") + logging.info(f"Will decode content from {path} using {encoding} charset.") text = content.decode(encoding) return io.StringIO(text) else: @@ -39,7 +41,7 @@ def open_read_file(path): return opener(path, "rt") -encoding_to_module = { +compression_to_module = { "gzip": "gzip", "bzip2": "bz2", "xz": "lzma", @@ -50,10 +52,10 @@ def get_opener(filename): """ Automatically detect compression and return the file opening function. """ - type_, encoding = mimetypes.guess_type(filename) - if encoding is None: + _type, compression = mimetypes.guess_type(filename) + if compression is None: opener = io.open else: - module = encoding_to_module[encoding] + module = compression_to_module[compression] opener = importlib.import_module(module).open return opener diff --git a/tests/data/brenda-subset.obo b/tests/data/brenda-subset.obo index 9d6439b..7c40871 100755 --- a/tests/data/brenda-subset.obo +++ b/tests/data/brenda-subset.obo @@ -6,7 +6,7 @@ synonymtypedef: GE "LANGUAGE GERMAN" synonymtypedef: SCI "SCIENTIFIC NAME" default-namespace: BrendaTissueOBO remark: www.brenda-enzymes.org -comment: "this is a subset of the BRENDA OBO file for testing purposes" ! See https://github.com/dhimmel/obonet/issues/10 +comment: "this is a subset of the BRENDA OBO file for testing purposes. ™⏸⟟⎞▹◬⽷⹽⫥⠷⩶⥣ⱸ♖⬭⌉⌐⦦" ! 
See https://github.com/dhimmel/obonet/issues/10 and https://github.com/dhimmel/obonet/issues/27 [Term] id: BTO:0000000 diff --git a/tests/test_obo_reading.py b/tests/test_obo_reading.py index 7377dfe..fb59721 100755 --- a/tests/test_obo_reading.py +++ b/tests/test_obo_reading.py @@ -51,13 +51,16 @@ def test_read_taxrank_url(extension): def test_read_brenda_subset(): """ Test reading a subset of the BrendaTissue.obo file. This file does not set - the ontology tag. See https://github.com/dhimmel/obonet/issues/10. + the ontology tag. See https://github.com/dhimmel/obonet/issues/10. + It also contains some unicode characters that should fail if not read as utf-8, + see https://github.com/dhimmel/obonet/issues/27. """ path = os.path.join(directory, "data", "brenda-subset.obo") brenda = obonet.read_obo(path) assert len(brenda) == 1 assert "name" not in brenda.graph assert "ontology" not in brenda.graph + assert "™⏸⟟⎞▹◬⽷⹽⫥⠷⩶⥣ⱸ♖⬭⌉⌐⦦" in brenda.graph["comment"][0] @pytest.mark.parametrize("ontology", ["doid", "go", "pato"])