From afcf4c3e2e88a285c445600d774d444f606a047f Mon Sep 17 00:00:00 2001 From: simleo Date: Mon, 2 May 2022 16:35:31 +0200 Subject: [PATCH 1/6] is_url: allow empty netloc --- rocrate/utils.py | 2 +- test/test_model.py | 8 ++++++++ test/test_write.py | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/rocrate/utils.py b/rocrate/utils.py index 7768804..ad02894 100644 --- a/rocrate/utils.py +++ b/rocrate/utils.py @@ -47,7 +47,7 @@ def as_list(list_or_other): def is_url(string): parts = urlsplit(string) - return all((parts.scheme, parts.netloc, parts.path)) + return all((parts.scheme, parts.path)) def iso_now(): diff --git a/test/test_model.py b/test/test_model.py index 3bdc539..6f2620d 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -151,6 +151,14 @@ def test_contextual_entities_hash(test_data_dir): })) wf["hasPart"] = [step] assert step.id == step_id + email = "jscarberry@example.org" + email_uri = f"mailto:{email}" + contact_point = crate.add(ContextEntity(crate, email_uri, properties={ + "@type": "ContactPoint", + "email": email + })) + crate.root_dataset["contactPoint"] = contact_point + assert contact_point.id == email_uri def test_properties(): diff --git a/test/test_write.py b/test/test_write.py index 8aaefc6..c0f51c2 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -160,6 +160,24 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip): assert "sdDatePublished" in props +def test_file_uri(tmpdir): + f_name = uuid.uuid4().hex + f_path = tmpdir / f_name + f_uri = f"file://{f_path}" + with open(f_path, "wt") as f: + f.write("FOO\n") + crate = ROCrate() + f_entity = crate.add_file(f_uri, fetch_remote=True) + assert f_entity.id == f_name + + out_path = tmpdir / 'ro_crate_out' + crate.write(out_path) + + out_crate = ROCrate(out_path) + assert out_crate.dereference(f_name) is not None + assert (out_path / f_name).is_file() + + @pytest.mark.slow @pytest.mark.parametrize("fetch_remote", [False, 
True]) def test_ftp_uri(tmpdir, fetch_remote): From b02e3f25b34d689e37c7d8a6597026faac4e0ddc Mon Sep 17 00:00:00 2001 From: simleo Date: Mon, 2 May 2022 17:37:43 +0200 Subject: [PATCH 2/6] fix possible unwanted file creation --- rocrate/model/file.py | 27 ++++++++++++++------------- test/test_write.py | 25 ++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/rocrate/model/file.py b/rocrate/model/file.py index 3a1d30d..263677d 100644 --- a/rocrate/model/file.py +++ b/rocrate/model/file.py @@ -45,19 +45,20 @@ def write(self, base_path): mode = 'w' + ('b' if isinstance(self.source, BytesIO) else 't') with open(out_file_path, mode) as out_file: out_file.write(self.source.getvalue()) - elif is_url(str(self.source)) and (self.fetch_remote or self.validate_url): - with urllib.request.urlopen(self.source) as response: - if self.validate_url: - if isinstance(response, HTTPResponse): - self._jsonld.update({ - 'contentSize': response.getheader('Content-Length'), - 'encodingFormat': response.getheader('Content-Type') - }) - if not self.fetch_remote: - self._jsonld['sdDatePublished'] = iso_now() - if self.fetch_remote: - out_file_path.parent.mkdir(parents=True, exist_ok=True) - urllib.request.urlretrieve(response.url, out_file_path) + elif is_url(str(self.source)): + if self.fetch_remote or self.validate_url: + with urllib.request.urlopen(self.source) as response: + if self.validate_url: + if isinstance(response, HTTPResponse): + self._jsonld.update({ + 'contentSize': response.getheader('Content-Length'), + 'encodingFormat': response.getheader('Content-Type') + }) + if not self.fetch_remote: + self._jsonld['sdDatePublished'] = iso_now() + if self.fetch_remote: + out_file_path.parent.mkdir(parents=True, exist_ok=True) + urllib.request.urlretrieve(response.url, out_file_path) elif os.path.isfile(self.source): out_file_path.parent.mkdir(parents=True, exist_ok=True) if not out_file_path.exists() or not out_file_path.samefile(self.source): diff 
--git a/test/test_write.py b/test/test_write.py index c0f51c2..d66275a 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -162,7 +162,7 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip): def test_file_uri(tmpdir): f_name = uuid.uuid4().hex - f_path = tmpdir / f_name + f_path = (tmpdir / f_name).resolve() f_uri = f"file://{f_path}" with open(f_path, "wt") as f: f.write("FOO\n") @@ -178,6 +178,29 @@ def test_file_uri(tmpdir): assert (out_path / f_name).is_file() +def test_looks_like_file_uri(tmpdir, monkeypatch): + f_name = uuid.uuid4().hex + f_parent = (tmpdir / "file:") + f_parent.mkdir() + f_path = f_parent / f_name + with open(f_path, "wt") as f: + f.write("FOO\n") + monkeypatch.chdir(tmpdir) + crate = ROCrate() + # Missing if interpreted as URI, present if interpreted as path + uri = f"file:/{f_name}" + entity = crate.add_file(uri, fetch_remote=False) + assert entity.id == uri + + out_path = tmpdir / 'ro_crate_out' + crate.write(out_path) + + out_crate = ROCrate(out_path) + assert out_crate.dereference(uri) is not None + assert not (out_path / f_name).is_file() + assert not (out_path / uri).is_file() + + @pytest.mark.slow @pytest.mark.parametrize("fetch_remote", [False, True]) def test_ftp_uri(tmpdir, fetch_remote): From 17a0737e3f083b8644ea1eed32eca21b520148f7 Mon Sep 17 00:00:00 2001 From: simleo Date: Mon, 2 May 2022 17:46:49 +0200 Subject: [PATCH 3/6] check that paths starting with "file:" can be added --- test/test_write.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/test_write.py b/test/test_write.py index d66275a..c5e4b46 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -200,6 +200,17 @@ def test_looks_like_file_uri(tmpdir, monkeypatch): assert not (out_path / f_name).is_file() assert not (out_path / uri).is_file() + # Check that the file can be added to the crate using its absolute path + entity = crate.add_file(f_path.resolve()) + assert entity.id == f_name + + out_path = tmpdir / 
'ro_crate_out_updated' + crate.write(out_path) + + out_crate = ROCrate(out_path) + assert out_crate.dereference(f_name) is not None + assert (out_path / f_name).is_file() + @pytest.mark.slow @pytest.mark.parametrize("fetch_remote", [False, True]) From 36452934b00bff0df34e2f0cc4e383be00b82411 Mon Sep 17 00:00:00 2001 From: simleo Date: Wed, 4 May 2022 16:06:59 +0200 Subject: [PATCH 4/6] windows-specific fixes --- rocrate/utils.py | 2 ++ test/test_write.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/rocrate/utils.py b/rocrate/utils.py index ad02894..010b99a 100644 --- a/rocrate/utils.py +++ b/rocrate/utils.py @@ -47,6 +47,8 @@ def as_list(list_or_other): def is_url(string): parts = urlsplit(string) + if os.name == "nt" and len(parts.scheme) == 1: + return False return all((parts.scheme, parts.path)) diff --git a/test/test_write.py b/test/test_write.py index c5e4b46..d249b8b 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -18,6 +18,7 @@ import io import pytest +import os import uuid import zipfile from itertools import product @@ -163,7 +164,7 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip): def test_file_uri(tmpdir): f_name = uuid.uuid4().hex f_path = (tmpdir / f_name).resolve() - f_uri = f"file://{f_path}" + f_uri = f"file://{f_path.as_posix()}" with open(f_path, "wt") as f: f.write("FOO\n") crate = ROCrate() @@ -178,6 +179,7 @@ def test_file_uri(tmpdir): assert (out_path / f_name).is_file() +@pytest.mark.skipif(os.name != "posix", reason="':' not allowed in dir name") def test_looks_like_file_uri(tmpdir, monkeypatch): f_name = uuid.uuid4().hex f_parent = (tmpdir / "file:") From ffec5d6858fd467dedc4bbbe08fc2b1175121649 Mon Sep 17 00:00:00 2001 From: simleo Date: Wed, 4 May 2022 16:33:23 +0200 Subject: [PATCH 5/6] more tweaking for windows --- test/test_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_write.py b/test/test_write.py index d249b8b..4615c6b 100644 --- 
a/test/test_write.py +++ b/test/test_write.py @@ -164,7 +164,7 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip): def test_file_uri(tmpdir): f_name = uuid.uuid4().hex f_path = (tmpdir / f_name).resolve() - f_uri = f"file://{f_path.as_posix()}" + f_uri = f"file://{f_path.as_posix().split(':', 1)[-1]}" with open(f_path, "wt") as f: f.write("FOO\n") crate = ROCrate() From e5d10ba940c82fc806ac3691b8ff1af25abf587a Mon Sep 17 00:00:00 2001 From: simleo Date: Wed, 4 May 2022 17:29:07 +0200 Subject: [PATCH 6/6] one more for windows --- test/test_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_write.py b/test/test_write.py index 4615c6b..b806ae5 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -164,7 +164,7 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip): def test_file_uri(tmpdir): f_name = uuid.uuid4().hex f_path = (tmpdir / f_name).resolve() - f_uri = f"file://{f_path.as_posix().split(':', 1)[-1]}" + f_uri = f"file:///{f_path}" # extra slash needed on some windows systems with open(f_path, "wt") as f: f.write("FOO\n") crate = ROCrate()