Skip to content

Commit

Permalink
test_read_brenda_subset: add unicode characters
Browse files (browse the repository at this point in the history)
refs #27
  • Loading branch information
dhimmel committed Feb 28, 2023
1 parent 508e5fd commit e6ff647
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 6 deletions.
10 changes: 6 additions & 4 deletions obonet/io.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import importlib
import io
import logging
import mimetypes
import re
from urllib.request import urlopen
Expand Down Expand Up @@ -29,6 +30,7 @@ def open_read_file(path):
content = response.read()
if opener == io.open:
encoding = response.headers.get_content_charset(failobj="utf-8")
logging.info(f"Will decode content from {path} using {encoding} charset.")
text = content.decode(encoding)
return io.StringIO(text)
else:
Expand All @@ -39,7 +41,7 @@ def open_read_file(path):
return opener(path, "rt")


encoding_to_module = {
compression_to_module = {
"gzip": "gzip",
"bzip2": "bz2",
"xz": "lzma",
Expand All @@ -50,10 +52,10 @@ def get_opener(filename):
"""
Automatically detect compression and return the file opening function.
"""
type_, encoding = mimetypes.guess_type(filename)
if encoding is None:
_type, compression = mimetypes.guess_type(filename)
if compression is None:
opener = io.open
else:
module = encoding_to_module[encoding]
module = compression_to_module[compression]
opener = importlib.import_module(module).open
return opener
2 changes: 1 addition & 1 deletion tests/data/brenda-subset.obo
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ synonymtypedef: GE "LANGUAGE GERMAN"
synonymtypedef: SCI "SCIENTIFIC NAME"
default-namespace: BrendaTissueOBO
remark: www.brenda-enzymes.org
comment: "this is a subset of the BRENDA OBO file for testing purposes" ! See https://github.com/dhimmel/obonet/issues/10
comment: "this is a subset of the BRENDA OBO file for testing purposes. ™⏸⟟⎞▹◬⽷⹽⫥⠷⩶⥣ⱸ♖⬭⌉⌐⦦" ! See https://github.com/dhimmel/obonet/issues/10 and https://github.com/dhimmel/obonet/issues/27

[Term]
id: BTO:0000000
Expand Down
5 changes: 4 additions & 1 deletion tests/test_obo_reading.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,16 @@ def test_read_taxrank_url(extension):
def test_read_brenda_subset():
"""
Test reading a subset of the BrendaTissue.obo file. This file does not set
the ontology tag. See https://github.com/dhimmel/obonet/issues/10.
the ontology tag. See <https://github.com/dhimmel/obonet/issues/10>.
It also contains some Unicode characters, so this test should fail if the file is not read as UTF-8;
see <https://github.com/dhimmel/obonet/issues/27>.
"""
path = os.path.join(directory, "data", "brenda-subset.obo")
brenda = obonet.read_obo(path)
assert len(brenda) == 1
assert "name" not in brenda.graph
assert "ontology" not in brenda.graph
assert "™⏸⟟⎞▹◬⽷⹽⫥⠷⩶⥣ⱸ♖⬭⌉⌐⦦" in brenda.graph["comment"][0]


@pytest.mark.parametrize("ontology", ["doid", "go", "pato"])
Expand Down

0 comments on commit e6ff647

Please sign in to comment.