Skip to content

Commit

Permalink
Merge pull request #122 from ResearchObject/fix_is_url
Browse files Browse the repository at this point in the history
Fix logic to detect URLs
  • Loading branch information
simleo authored May 4, 2022
2 parents 83388af + e5d10ba commit 854ba36
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 14 deletions.
27 changes: 14 additions & 13 deletions rocrate/model/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,20 @@ def write(self, base_path):
mode = 'w' + ('b' if isinstance(self.source, BytesIO) else 't')
with open(out_file_path, mode) as out_file:
out_file.write(self.source.getvalue())
elif is_url(str(self.source)) and (self.fetch_remote or self.validate_url):
with urllib.request.urlopen(self.source) as response:
if self.validate_url:
if isinstance(response, HTTPResponse):
self._jsonld.update({
'contentSize': response.getheader('Content-Length'),
'encodingFormat': response.getheader('Content-Type')
})
if not self.fetch_remote:
self._jsonld['sdDatePublished'] = iso_now()
if self.fetch_remote:
out_file_path.parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(response.url, out_file_path)
elif is_url(str(self.source)):
if self.fetch_remote or self.validate_url:
with urllib.request.urlopen(self.source) as response:
if self.validate_url:
if isinstance(response, HTTPResponse):
self._jsonld.update({
'contentSize': response.getheader('Content-Length'),
'encodingFormat': response.getheader('Content-Type')
})
if not self.fetch_remote:
self._jsonld['sdDatePublished'] = iso_now()
if self.fetch_remote:
out_file_path.parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(response.url, out_file_path)
elif os.path.isfile(self.source):
out_file_path.parent.mkdir(parents=True, exist_ok=True)
if not out_file_path.exists() or not out_file_path.samefile(self.source):
Expand Down
4 changes: 3 additions & 1 deletion rocrate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,9 @@ def as_list(list_or_other):

def is_url(string):
    """Tell whether ``string`` should be treated as a URL.

    The string is split with ``urlsplit`` and is considered a URL when both
    its scheme and its path are non-empty. A network location is *not*
    required, so URIs such as ``mailto:user@example.org`` or ``file:/x`` are
    accepted. Conversely, a string whose path component is empty (for
    example ``http://example.org``) is not accepted.

    :param string: the candidate string.
    :return: ``True`` if ``string`` has a scheme and a path, else ``False``.
    """
    parts = urlsplit(string)
    # NOTE(review): a one-character scheme on Windows is presumably a drive
    # letter (e.g. "C:\\data") rather than a real URL scheme -- confirm.
    if os.name == "nt" and len(parts.scheme) == 1:
        return False
    return all((parts.scheme, parts.path))


def iso_now():
Expand Down
8 changes: 8 additions & 0 deletions test/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,14 @@ def test_contextual_entities_hash(test_data_dir):
}))
wf["hasPart"] = [step]
assert step.id == step_id
email = "jscarberry@example.org"
email_uri = f"mailto:{email}"
contact_point = crate.add(ContextEntity(crate, email_uri, properties={
"@type": "ContactPoint",
"email": email
}))
crate.root_dataset["contactPoint"] = contact_point
assert contact_point.id == email_uri


def test_properties():
Expand Down
54 changes: 54 additions & 0 deletions test/test_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import io
import pytest
import os
import uuid
import zipfile
from itertools import product
Expand Down Expand Up @@ -160,6 +161,59 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip):
assert "sdDatePublished" in props


def test_file_uri(tmpdir):
    """A file added through a ``file:`` URI with ``fetch_remote=True`` must
    end up inside the written crate under its base name."""
    name = uuid.uuid4().hex
    src_path = (tmpdir / name).resolve()
    with open(src_path, "wt") as stream:
        stream.write("FOO\n")
    # extra slash needed on some windows systems
    src_uri = f"file:///{src_path}"

    crate = ROCrate()
    added = crate.add_file(src_uri, fetch_remote=True)
    assert added.id == name

    dest = tmpdir / 'ro_crate_out'
    crate.write(dest)

    reloaded = ROCrate(dest)
    assert reloaded.dereference(name) is not None
    assert (dest / name).is_file()


@pytest.mark.skipif(os.name != "posix", reason="':' not allowed in dir name")
def test_looks_like_file_uri(tmpdir, monkeypatch):
    """A string shaped like ``file:/<name>`` is kept as a URI id and is not
    copied into the crate, even though a relative path with the same
    spelling exists on disk; the same file is copied when added through its
    absolute path."""
    name = uuid.uuid4().hex
    parent_dir = (tmpdir / "file:")
    parent_dir.mkdir()
    real_path = parent_dir / name
    with open(real_path, "wt") as stream:
        stream.write("FOO\n")
    monkeypatch.chdir(tmpdir)

    crate = ROCrate()
    # Missing if interpreted as URI, present if interpreted as path
    uri = f"file:/{name}"
    added = crate.add_file(uri, fetch_remote=False)
    assert added.id == uri

    first_out = tmpdir / 'ro_crate_out'
    crate.write(first_out)

    reloaded = ROCrate(first_out)
    assert reloaded.dereference(uri) is not None
    assert not (first_out / name).is_file()
    assert not (first_out / uri).is_file()

    # Check that the file can be added to the crate using its absolute path
    added = crate.add_file(real_path.resolve())
    assert added.id == name

    second_out = tmpdir / 'ro_crate_out_updated'
    crate.write(second_out)

    reloaded = ROCrate(second_out)
    assert reloaded.dereference(name) is not None
    assert (second_out / name).is_file()


@pytest.mark.slow
@pytest.mark.parametrize("fetch_remote", [False, True])
def test_ftp_uri(tmpdir, fetch_remote):
Expand Down

0 comments on commit 854ba36

Please sign in to comment.