diff --git a/.github/workflows/tests-live.yml b/.github/workflows/tests-live.yml index adb4b976..787bfd36 100644 --- a/.github/workflows/tests-live.yml +++ b/.github/workflows/tests-live.yml @@ -43,3 +43,7 @@ jobs: LIVE_S3_BUCKET: ${{ secrets.LIVE_S3_BUCKET }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + CUSTOM_S3_BUCKET: ${{ secrets.CUSTOM_S3_BUCKET }} + CUSTOM_S3_KEY_ID: ${{ secrets.CUSTOM_S3_KEY_ID }} + CUSTOM_S3_SECRET_KEY: ${{ secrets.CUSTOM_S3_SECRET_KEY }} + CUSTOM_S3_ENDPOINT: ${{ secrets.CUSTOM_S3_ENDPOINT }} diff --git a/HISTORY.md b/HISTORY.md index 5bb63c31..37945a78 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,10 +1,14 @@ # cloudpathlib Changelog +## v0.4.1 (unreleased) + +- Added support for custom S3-compatible object stores. This functionality is available via the `endpoint_url` keyword argument when instantiating an `S3Client` instance. See [documentation](https://cloudpathlib.drivendata.org/authentication/#accessing-custom-s3-compatible-object-stores) for more details. ([#138](https://github.com/drivendataorg/cloudpathlib/pull/138) thanks to [@YevheniiSemendiak](https://github.com/YevheniiSemendiak)) + ## v0.4.0 (2021-03-13) - Added rich comparison operator support to cloud paths, which means you can now use them with `sorted`. ([#129](https://github.com/drivendataorg/cloudpathlib/pull/129)) -- Added polymorphic class `AnyPath` which creates a cloud path or `pathlib.Path` instance appropriately for an input filepath. See new [documentation](http://https://cloudpathlib.drivendata.org/anypath-polymorphism/) for details and example usage. ([#130](https://github.com/drivendataorg/cloudpathlib/pull/130)) -- Added integration with [Pydantic](https://pydantic-docs.helpmanual.io/). See new [documentation](http://https://cloudpathlib.drivendata.org/integrations/#pydantic) for details and example usage. 
([#130](https://github.com/drivendataorg/cloudpathlib/pull/130)) +- Added polymorphic class `AnyPath` which creates a cloud path or `pathlib.Path` instance appropriately for an input filepath. See new [documentation](https://cloudpathlib.drivendata.org/anypath-polymorphism/) for details and example usage. ([#130](https://github.com/drivendataorg/cloudpathlib/pull/130)) +- Added integration with [Pydantic](https://pydantic-docs.helpmanual.io/). See new [documentation](https://cloudpathlib.drivendata.org/integrations/#pydantic) for details and example usage. ([#130](https://github.com/drivendataorg/cloudpathlib/pull/130)) - Exceptions: ([#131](https://github.com/drivendataorg/cloudpathlib/pull/131)) - Changed all custom `cloudpathlib` exceptions to be located in new `cloudpathlib.exceptions` module. - Changed all custom `cloudpathlib` exceptions to subclass from new base `CloudPathException`. This allows for easy catching of any custom exception from `cloudpathlib`. diff --git a/cloudpathlib/s3/s3client.py b/cloudpathlib/s3/s3client.py index 2f4b35f5..11948987 100644 --- a/cloudpathlib/s3/s3client.py +++ b/cloudpathlib/s3/s3client.py @@ -28,6 +28,7 @@ def __init__( profile_name: Optional[str] = None, boto3_session: Optional["Session"] = None, local_cache_dir: Optional[Union[str, os.PathLike]] = None, + endpoint_url: Optional[str] = None, ): """Class constructor. Sets up a boto3 [`Session`]( https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html). @@ -49,6 +50,8 @@ def __init__( boto3_session (Optional[Session]): An already instantiated boto3 Session. local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache for downloaded files. If None, will use a temporary directory. + endpoint_url (Optional[str]): S3 server endpoint URL to use for the constructed boto3 S3 resource and client. + Set this to access a custom-deployed S3-compatible object store such as MinIO or Ceph. 
""" if boto3_session is not None: self.sess = boto3_session @@ -60,8 +63,8 @@ def __init__( botocore_session=botocore_session, profile_name=profile_name, ) - self.s3 = self.sess.resource("s3") - self.client = self.sess.client("s3") + self.s3 = self.sess.resource("s3", endpoint_url=endpoint_url) + self.client = self.sess.client("s3", endpoint_url=endpoint_url) super().__init__(local_cache_dir=local_cache_dir) diff --git a/docs/docs/authentication.md b/docs/docs/authentication.md index 2ed30fbb..ad0bb51d 100644 --- a/docs/docs/authentication.md +++ b/docs/docs/authentication.md @@ -32,7 +32,7 @@ cloud_path.client All subsequent instances of that service's cloud paths (in the example, all subsequent `S3Path` instances) will reference the same client instance. -You can also explicitly instantiate a client instance. You will need to do so if you want to authenticate using any option other than the environment variables from the table in the previous section. (To see what those options are, check out the API documentation pages linked to in the table above.) You can then use that client instance's cloud path factory method, or pass it into a cloud path instantiation +You can also explicitly instantiate a client instance. You will need to do so if you want to authenticate using any option other than the environment variables from the table in the previous section. (To see what those options are, check out the API documentation pages linked to in the table above.) You can then use that client instance's cloud path factory method, or pass it into a cloud path instantiation. ```python from cloudpathlib import S3Client @@ -59,3 +59,26 @@ If you need a reference to the default client: S3Client.get_default_client() #> ``` + +## Accessing custom S3-compatible object stores +It might happen so that you need to access a customly deployed S3 object store ([MinIO](https://min.io/), [Ceph](https://ceph.io/ceph-storage/object-storage/) or any other). 
+In such cases, the service endpoint will be different from the AWS object store endpoints (used by default). +To specify a custom endpoint address, you will need to manually instantiate `Client` with the `endpoint_url` parameter, +providing an http/https URL including the port. + +```python +from cloudpathlib import S3Client, CloudPath + +# create a client pointing to the endpoint +client = S3Client(endpoint_url="http://my.s3.server:1234") + +# option 1: use the client to create paths +cp1 = client.CloudPath("s3://cloudpathlib-test-bucket/") + +# option 2: pass the client as keyword argument +cp2 = CloudPath("s3://cloudpathlib-test-bucket/", client=client) + +# option 3: set this client as the default so it is used in any future paths +client.set_as_default_client() +cp3 = CloudPath("s3://cloudpathlib-test-bucket/") +``` diff --git a/tests/conftest.py b/tests/conftest.py index ae3ccb12..9e6e8f68 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from azure.storage.blob import BlobServiceClient import boto3 +import botocore from dotenv import find_dotenv, load_dotenv from google.cloud import storage as google_storage from pytest_cases import fixture, fixture_union @@ -180,7 +181,8 @@ def s3_rig(request, monkeypatch, assets_dir): if os.getenv("USE_LIVE_CLOUD") == "1": # Set up test assets - bucket = boto3.resource("s3").Bucket(drive) + session = boto3.Session() # Fresh session to ensure isolation + bucket = session.resource("s3").Bucket(drive) test_files = [ f for f in assets_dir.glob("**/*") if f.is_file() and f.name not in UPLOAD_IGNORE_LIST ] @@ -212,6 +214,70 @@ bucket.objects.filter(Prefix=test_dir).delete() +@fixture() +def custom_s3_rig(request, monkeypatch, assets_dir): + """ + Custom S3 rig used to test the integration with non-AWS S3-compatible object stores such as + - MinIO (https://min.io/) + - CEPH (https://ceph.io/ceph-storage/object-storage/) + - others + """ + drive = 
os.getenv("CUSTOM_S3_BUCKET", "bucket") + test_dir = create_test_dir_name(request) + custom_endpoint_url = os.getenv("CUSTOM_S3_ENDPOINT", "https://s3.us-west-1.drivendatabws.com") + + if os.getenv("USE_LIVE_CLOUD") == "1": + monkeypatch.setenv("AWS_ACCESS_KEY_ID", os.getenv("CUSTOM_S3_KEY_ID")) + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", os.getenv("CUSTOM_S3_SECRET_KEY")) + + # Upload test assets + session = boto3.Session() # Fresh session to ensure isolation from AWS S3 auth + s3 = session.resource("s3", endpoint_url=custom_endpoint_url) + + # idempotent and our test server on heroku only has ephemeral storage + # so we need to try to create each time + try: + s3.meta.client.head_bucket(Bucket=drive) + except botocore.exceptions.ClientError: + s3.create_bucket(Bucket=drive) + + bucket = s3.Bucket(drive) + + test_files = [ + f for f in assets_dir.glob("**/*") if f.is_file() and f.name not in UPLOAD_IGNORE_LIST + ] + for test_file in test_files: + bucket.upload_file( + str(test_file), + str(f"{test_dir}/{PurePosixPath(test_file.relative_to(assets_dir))}"), + ) + else: + # Mock cloud SDK + monkeypatch.setattr( + cloudpathlib.s3.s3client, + "Session", + mocked_session_class_factory(test_dir), + ) + + rig = CloudProviderTestRig( + path_class=S3Path, client_class=S3Client, drive=drive, test_dir=test_dir + ) + + rig.client_class( + endpoint_url=custom_endpoint_url + ).set_as_default_client() # set default client + + # add flag for custom_s3 rig to skip some tests + rig.is_custom_s3 = True + + yield rig + + rig.client_class._default_client = None # reset default client + + if os.getenv("USE_LIVE_CLOUD") == "1": + bucket.objects.filter(Prefix=test_dir).delete() + + @fixture() def local_azure_rig(request, monkeypatch, assets_dir): drive = os.getenv("LIVE_AZURE_CONTAINER", "container") @@ -294,6 +360,7 @@ def local_s3_rig(request, monkeypatch, assets_dir): azure_rig, gs_rig, s3_rig, + custom_s3_rig, local_azure_rig, local_s3_rig, local_gs_rig, diff --git 
a/tests/mock_clients/mock_s3.py b/tests/mock_clients/mock_s3.py index aaa93e29..392dda93 100644 --- a/tests/mock_clients/mock_s3.py +++ b/tests/mock_clients/mock_s3.py @@ -29,10 +29,10 @@ def __init__(self, *args, **kwargs): def __del__(self): self.tmp.cleanup() - def resource(self, item): + def resource(self, item, endpoint_url): return MockBoto3Resource(self.tmp_path) - def client(self, item): + def client(self, item, endpoint_url): return MockBoto3Client(self.tmp_path) return MockBoto3Session diff --git a/tests/test_client.py b/tests/test_client.py index 2476296a..96e8b4dd 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -2,7 +2,10 @@ def test_default_client_instantiation(rig): - rig.client_class._default_client = None + if not getattr(rig, "is_custom_s3", False): + # Skip resetting the default client for custom S3 endpoint, but keep the other tests, + # since they're still useful. + rig.client_class._default_client = None # CloudPath dispatch p = CloudPath(f"{rig.cloud_prefix}{rig.drive}/{rig.test_dir}/dir_0/file0_0.txt") diff --git a/tests/test_cloudpath_file_io.py b/tests/test_cloudpath_file_io.py index eeb7781c..1644c2af 100644 --- a/tests/test_cloudpath_file_io.py +++ b/tests/test_cloudpath_file_io.py @@ -61,7 +61,9 @@ def test_file_read_writes(rig, tmp_path): before_touch = datetime.now() sleep(1) p.touch() - assert datetime.fromtimestamp(p.stat().st_mtime) > before_touch + if not getattr(rig, "is_custom_s3", False): + # Our S3Path.touch implementation does not update mod time for MinIO + assert datetime.fromtimestamp(p.stat().st_mtime) > before_touch # no-op p.mkdir()