test_read_brenda_subset: add unicode characters

refs #27
dhimmel · Feb 28, 2023 · e6ff647
1 parent 508e5fd
commit e6ff647
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 6 deletions.
diff --git a/obonet/io.py b/obonet/io.py
@@ -1,5 +1,6 @@
 import importlib
 import io
+import logging
 import mimetypes
 import re
 from urllib.request import urlopen
@@ -29,6 +30,7 @@ def open_read_file(path):
             content = response.read()
         if opener == io.open:
             encoding = response.headers.get_content_charset(failobj="utf-8")
+            logging.info(f"Will decode content from {path} using {encoding} charset.")
             text = content.decode(encoding)
             return io.StringIO(text)
         else:
@@ -39,7 +41,7 @@ def open_read_file(path):
     return opener(path, "rt")
 
 
-encoding_to_module = {
+compression_to_module = {
     "gzip": "gzip",
     "bzip2": "bz2",
     "xz": "lzma",
@@ -50,10 +52,10 @@ def get_opener(filename):
     """
     Automatically detect compression and return the file opening function.
     """
-    type_, encoding = mimetypes.guess_type(filename)
-    if encoding is None:
+    _type, compression = mimetypes.guess_type(filename)
+    if compression is None:
         opener = io.open
     else:
-        module = encoding_to_module[encoding]
+        module = compression_to_module[compression]
         opener = importlib.import_module(module).open
     return opener
diff --git a/tests/data/brenda-subset.obo b/tests/data/brenda-subset.obo
@@ -6,7 +6,7 @@ synonymtypedef: GE "LANGUAGE GERMAN"
 synonymtypedef: SCI "SCIENTIFIC NAME"
 default-namespace: BrendaTissueOBO
 remark: www.brenda-enzymes.org
-comment: "this is a subset of the BRENDA OBO file for testing purposes" ! See https://github.com/dhimmel/obonet/issues/10 
+comment: "this is a subset of the BRENDA OBO file for testing purposes. ™⏸⟟⎞▹◬⽷⹽⫥⠷⩶⥣ⱸ♖⬭⌉⌐⦦" ! See https://github.com/dhimmel/obonet/issues/10 and https://github.com/dhimmel/obonet/issues/27
 
 [Term]
 id: BTO:0000000

diff --git a/tests/test_obo_reading.py b/tests/test_obo_reading.py
@@ -51,13 +51,16 @@ def test_read_taxrank_url(extension):
 def test_read_brenda_subset():
     """
     Test reading a subset of the BrendaTissue.obo file. This file does not set
-    the ontology tag. See https://github.com/dhimmel/obonet/issues/10.
+    the ontology tag. See <https://github.com/dhimmel/obonet/issues/10>.
+    It also contains unicode characters, so the test should fail if the file is
+    not read as utf-8; see <https://github.com/dhimmel/obonet/issues/27>.
     """
     path = os.path.join(directory, "data", "brenda-subset.obo")
     brenda = obonet.read_obo(path)
     assert len(brenda) == 1
     assert "name" not in brenda.graph
     assert "ontology" not in brenda.graph
+    assert "™⏸⟟⎞▹◬⽷⹽⫥⠷⩶⥣ⱸ♖⬭⌉⌐⦦" in brenda.graph["comment"][0]
 
 
 @pytest.mark.parametrize("ontology", ["doid", "go", "pato"])