diff --git a/.env.dev b/.env.dev
index 9aaaea52..02a8f29c 100644
--- a/.env.dev
+++ b/.env.dev
@@ -21,7 +21,7 @@ AZURE_STORAGE_ACCOUNT_KEY=
 # default auth setting, which requires authenticating
 # via the gcloud cli.

-# Rstudio Connect license ----
+# Posit Connect license ----
 RSC_LICENSE=

 # Uncomment and change the variables below to specify the bucket (directory) the buckets
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9575fda2..72025c42 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,13 +20,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
         os: ["ubuntu-latest"]
         pytest_opts: ["--workers 4 --tests-per-worker 1"]
         requirements: [""]
         include:
           - os: "ubuntu-latest"
-            python: "3.8"
+            python: "3.9"
             requirements: "requirements/minimum.txt"
           - os: "macos-latest"
             python: "3.10"
@@ -91,8 +91,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements/dev.txt
-         python -m pip install rdata
-         python -m pip install -e .[test]
+         python -m pip install -e .

      - name: run Posit Connect
        run: |
@@ -149,7 +148,7 @@ jobs:
        run: |
          make docs-build
      - name: Save docs artifact
-       uses: actions/upload-artifact@v3
+       uses: actions/upload-artifact@v4
        with:
          name: docs-html
          path: docs/_site
@@ -161,7 +160,7 @@ jobs:
     if: "${{github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork }}"
     steps:
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4
        with:
          name: docs-html
          path: docs/_site
@@ -220,7 +219,7 @@ jobs:
     needs: ["build-docs", "tests", "test-rsconnect"]
     if: github.ref == 'refs/heads/main'
     steps:
-      - uses: actions/download-artifact@v3
+      - uses: actions/download-artifact@v4
        with:
          name: docs-html
          path: docs/_site
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index 59d01343..3774dd86 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -24,7 +24,7 @@ jobs:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
-          python-version: 3.12 # Use the maximum version supported by python-pins
+          python-version: 3.13 # Use the maximum version supported by python-pins
      - name: Install dependencies
        shell: bash
        run: |
@@ -32,4 +32,4 @@ jobs:
          python -m pip install -e .[check]
      - uses: jakebailey/pyright-action@v2
        with:
-          version: 1.1.372 # Manually sync with setup.cfg
+          version: 1.1.372
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 127485a2..1faadb7e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,7 +44,7 @@ Tests can be run using pytest:
 ```shell
 pytest pins

-# run all tests except those for Rstudio Connect
+# run all tests except those for Posit Connect
 pytest pins -m 'not fs_rsc'

 # run only local filesystem backend tests
@@ -60,7 +60,7 @@ pytest pins -m 'not skip_on_github' -k 'not pins.boards.BoardManual'

 There are two important details to note for testing:

-* **Backends**. pins can write to backends like s3, azure, and RStudio Connect, so you
+* **Backends**. pins can write to backends like s3, azure, and Posit Connect, so you
   will need to set credentials to test against them.
 * **Pytest Marks**. You can disable tests over a specific backend through pytest's
   `-m` flag. For example...
@@ -74,7 +74,7 @@ There are two important details to note for testing:

 * Modify `.env` to fill in environment variables (e.g. AWS_ACCESS_KEY_ID)
 * Be careful not to put any sensitive information in `.env.dev`!

-### Setting up RStudio Connect tests
+### Setting up Posit Connect tests

 ```
 # Be sure to set RSC_LICENSE in .env
diff --git a/Makefile b/Makefile
index b46883fc..10fbf703 100644
--- a/Makefile
+++ b/Makefile
@@ -38,10 +38,10 @@ docs-build:
 docs-clean:
 	rm -rf docs/_build docs/api/api_card

-requirements/dev.txt: setup.cfg
+requirements/dev.txt: pyproject.toml
 	@# allows you to do this...
 	@# make requirements | tee > requirements/some_file.txt
-	@pip-compile setup.cfg --rebuild --extra doc --extra test --extra check --output-file=- > $@
+	@pip-compile pyproject.toml --rebuild --extra doc --extra test --extra check --output-file=- > $@

 binder/requirements.txt: requirements/dev.txt
 	cp $< $@
diff --git a/docs/get_started.qmd b/docs/get_started.qmd
index 2f658d7f..6ad58b75 100644
--- a/docs/get_started.qmd
+++ b/docs/get_started.qmd
@@ -4,7 +4,8 @@ jupyter: python3
 ---

 ```{python}
-#| include: false
+# | include: false
+import time
 import pandas as pd
 pd.options.display.max_rows = 25
 ```
@@ -126,7 +127,7 @@ While we’ll do our best to keep the automatically generated metadata consistent

 ## Versioning

-Every [](`~pins.boards.BaseBoard.pin_write`) will create a new version:
+By default, each call to [](`~pins.boards.BaseBoard.pin_write`) creates a new version:

 ```{python}
 board2 = board_temp()
@@ -136,6 +137,23 @@ board2.pin_write([1,2], name = "x", type = "json")
 board2.pin_versions("x")
 ```

+The only exception is if the data is identical to the most recent version (compared via file hash):
+
+```{python}
+board2.pin_write([1], name = "x", type = "json")
+time.sleep(1.1)  # later, let's try to write a new version of the same data...
+board2.pin_write([1], name = "x", type = "json")
+board2.pin_versions("x")
+```
+
+However, you can opt out of this behaviour with `force_identical_write=True`:
+
+```{python}
+time.sleep(1.1)  # try again...
+board2.pin_write([1], name = "x", type = "json", force_identical_write=True)
+board2.pin_versions("x")
+```
+
 By default, [](`~pins.boards.BaseBoard.pin_read`) will return the most recent version:

 ```{python}
diff --git a/pins/boards.py b/pins/boards.py
index 503e8fbd..2bc62803 100644
--- a/pins/boards.py
+++ b/pins/boards.py
@@ -6,17 +6,19 @@
 import re
 import shutil
 import tempfile
+from collections.abc import Mapping, Sequence
 from datetime import datetime, timedelta
 from io import IOBase
 from pathlib import Path
-from typing import Mapping, Protocol, Sequence
+from typing import Protocol

 from importlib_resources import files
+from importlib_resources.abc import Traversable

 from ._adaptors import _Adaptor, _create_adaptor
 from .cache import PinsCache
 from .config import get_allow_rsc_short_name
-from .drivers import default_title, load_data, load_file, save_data
+from .drivers import REQUIRES_SINGLE_FILE, default_title, load_data, load_file, save_data
 from .errors import PinsError, PinsVersionError
 from .meta import Meta, MetaFactory, MetaRaw
 from .utils import ExtendMethodDoc, inform, warn_deprecated
@@ -228,36 +230,54 @@ def _pin_store(
         metadata: Mapping | None = None,
         versioned: bool | None = None,
         created: datetime | None = None,
+        *,
+        force_identical_write: bool = False,
     ) -> Meta:
-        if type == "feather":
+        _type = type
+        if _type == "feather":
             warn_deprecated(
                 'Writing pin type "feather" is unsupported. Switching type to "arrow".'
                 " This produces the exact same behavior, and also works with R pins."
                 ' Please switch to pin_write using type="arrow".'
             )
-            type = "arrow"
+            _type = "arrow"

-        if type == "file":
+        if _type == "file":
             # the file type makes the name of the data the exact filename, rather
             # than the pin name + a suffix (e.g. my_pin.csv).
             if isinstance(x, (tuple, list)) and len(x) == 1:
                 x = x[0]

-            _p = Path(x)
-            _base_len = len(_p.name) - len("".join(_p.suffixes))
-            object_name = _p.name[:_base_len]
+            if not isinstance(x, (list, tuple)):
+                _p = Path(x)
+                _base_len = len(_p.name) - len("".join(_p.suffixes))
+                object_name = _p.name[:_base_len]
+            else:
+                # multifile upload, keep list of filenames
+                object_name = []
+                for file in x:
+                    _p = Path(file)
+                    object_name.append(_p.name)
         else:
             object_name = None

         pin_name = self.path_to_pin(name)

+        # Pre-emptively fetch the most recent pin's meta if it exists - this is used
+        # for the force_identical_write check
+        abort_if_identical = not force_identical_write and self.pin_exists(name)
+        if abort_if_identical:
+            last_meta = self.pin_meta(name)
+
         with tempfile.TemporaryDirectory() as tmp_dir:
-            # create all pin data (e.g. data.txt, save object)
+            # create all pin data (e.g. data.txt, save object) to get the metadata.
+            # For unversioned boards, this will also delete the most recent pin version,
+            # ready for it to be replaced with a new one.
             meta = self.prepare_pin_version(
                 tmp_dir,
                 x,
                 pin_name,
-                type,
+                _type,
                 title,
                 description,
                 metadata,
@@ -266,6 +287,18 @@ def _pin_store(
                 object_name=object_name,
             )

+            # force_identical_write check
+            if abort_if_identical:
+                last_hash = last_meta.pin_hash
+
+                if last_hash == meta.pin_hash:
+                    msg = (
+                        f'The hash of pin "{name}" has not changed. Your pin will not '
+                        f"be stored."
+                    )
+                    inform(log=_log, msg=msg)
+                    return last_meta
+
             # move pin to destination ----
             # create pin version folder
             dst_pin_path = self.construct_path([pin_name])
@@ -313,6 +346,8 @@ def pin_write(
         metadata: Mapping | None = None,
         versioned: bool | None = None,
         created: datetime | None = None,
+        *,
+        force_identical_write: bool = False,
     ) -> Meta:
         """Write a pin object to the board.

@@ -339,6 +374,17 @@ def pin_write(
         created:
             A date to store in the Meta.created field. This field may be used as
             part of the pin version name.
+        force_identical_write:
+            Store the pin even if the pin contents are identical to the last version
+            (compared using the hash). Only the pin contents are compared, not the pin
+            metadata. Defaults to False.
+
+        Returns
+        -------
+        Meta:
+            Metadata about the stored pin. If `force_identical_write` is False and the
+            pin contents are identical to the last version, the last version's metadata
+            is returned.
""" if type == "file": @@ -348,7 +394,15 @@ def pin_write( ) return self._pin_store( - x, name, type, title, description, metadata, versioned, created + x, + name, + type, + title, + description, + metadata, + versioned, + created, + force_identical_write=force_identical_write, ) def pin_download(self, name, version=None, hash=None) -> Sequence[str]: @@ -374,20 +428,32 @@ def pin_download(self, name, version=None, hash=None) -> Sequence[str]: if hash is not None: raise NotImplementedError("TODO: validate hash") + fnames = [meta.file] if isinstance(meta.file, str) else meta.file + pin_type = meta.type + + if len(fnames) > 1 and pin_type in REQUIRES_SINGLE_FILE: + raise ValueError("Cannot load data when more than 1 file") + pin_name = self.path_to_pin(name) + files = [] + + for fname in fnames: + # fetch file + with load_file( + fname, + self.fs, + self.construct_path([pin_name, meta.version.version]), + pin_type, + ) as f: + # could also check whether f isinstance of PinCache + fname = getattr(f, "name", None) - # TODO: raise for multiple files - # fetch file - with load_file( - meta, self.fs, self.construct_path([pin_name, meta.version.version]) - ) as f: - # could also check whether f isinstance of PinCache - fname = getattr(f, "name", None) + if fname is None: + raise PinsError("pin_download requires a cache.") - if fname is None: - raise PinsError("pin_download requires a cache.") + files.append(str(Path(fname).absolute())) - return [str(Path(fname).absolute())] + return files def pin_upload( self, @@ -420,6 +486,12 @@ def pin_upload( This gets stored on the Meta.user field. """ + if isinstance(paths, (list, tuple)): + # check if all paths exist + for path in paths: + if not Path(path).is_file(): + raise PinsError(f"Path is not a valid file: {path}") + return self._pin_store( paths, name, @@ -478,7 +550,8 @@ def pin_versions_prune(self, name, n: int | None = None, days: int | None = None raise ValueError("Argument days is {days}, but must be greater than 0.") date_cutoff = datetime.today() - timedelta(days=days) - to_delete = [v for v in versions if v.created < date_cutoff] + # Avoid deleting the most recent version + to_delete = [v for v in versions[:-1] if v.created < date_cutoff] # message user about deletions ---- # TODO(question): how to pin_inform? Log or warning? 
@@ -623,7 +696,7 @@ def prepare_pin_version( metadata: Mapping | None = None, versioned: bool | None = None, created: datetime | None = None, - object_name: str | None = None, + object_name: str | list[str] | None = None, ): x = _create_adaptor(x) @@ -670,14 +743,18 @@ def _create_meta( # create metadata from object on disk --------------------------------- # save all pin data to a temporary folder (including data.txt), so we # can fs.put it all straight onto the backend filesystem - - if object_name is None: - p_obj = Path(pin_dir_path) / name + apply_suffix = True + if isinstance(object_name, (list, tuple)): + apply_suffix = False + p_obj = [] + for obj in object_name: + p_obj.append(str(Path(pin_dir_path) / obj)) + elif object_name is None: + p_obj = str(Path(pin_dir_path) / name) else: - p_obj = Path(pin_dir_path) / object_name - + p_obj = str(Path(pin_dir_path) / object_name) # file is saved locally in order to hash, calc size - file_names = save_data(x._d, str(p_obj), type) + file_names = save_data(x._d, p_obj, type, apply_suffix) meta = self.meta_factory.create( pin_dir_path, @@ -870,7 +947,7 @@ def pin_download(self, name, version=None, hash=None) -> Sequence[str]: meta = self.pin_meta(name, version) if isinstance(meta, MetaRaw): - f = load_file(meta, self.fs, None) + f = load_file(meta.file, self.fs, None, meta.type) else: raise NotImplementedError( "TODO: pin_download currently can only read a url to a single file." @@ -924,14 +1001,14 @@ class BoardRsConnect(BaseBoard): # TODO: note that board is unused in this class (e.g. it's not in construct_path()) # TODO: should read template dynamically, not at class def'n time - html_assets_dir: Path = files("pins") / "rsconnect/html" - html_template: Path = files("pins") / "rsconnect/html/index.html" + html_assets_dir: Traversable = files("pins") / "rsconnect/html" + html_template: Traversable = files("pins") / "rsconnect/html/index.html" # defaults work ---- @ExtendMethodDoc def pin_list(self): - # lists all pin content on RStudio Connect server + # lists all pin content on Posit Connect server # we can't use fs.ls, because it will list *all content* paged_res = self.fs.api.misc_get_applications("content_type:pin") results = paged_res.results @@ -1056,14 +1133,14 @@ def pin_version_delete(self, *args, **kwargs): if e.args[0]["code"] != 75: raise e - raise PinsError("RStudio Connect cannot delete the latest pin version.") + raise PinsError("Posit Connect cannot delete the latest pin version.") @ExtendMethodDoc def pin_versions_prune(self, *args, **kwargs): sig = inspect.signature(super().pin_versions_prune) if sig.bind(*args, **kwargs).arguments.get("days") is not None: raise NotImplementedError( - "RStudio Connect board cannot prune versions using days." + "Posit Connect board cannot prune versions using days." ) super().pin_versions_prune(*args, **kwargs) @@ -1090,7 +1167,7 @@ def validate_pin_name(self, name) -> None: if not get_allow_rsc_short_name() and name.count("/") != 1: raise ValueError( f"Invalid pin name: {name}" - "\nRStudio Connect pin names must include user name. E.g. " + "\nPosit Connect pin names must include user name. E.g. " "\nsome_user/mtcars, for the user some_user." 
) diff --git a/pins/cache.py b/pins/cache.py index cadda23c..ebffb3b7 100644 --- a/pins/cache.py +++ b/pins/cache.py @@ -1,8 +1,11 @@ +from __future__ import annotations + import logging import os import shutil import time import urllib.parse +from collections.abc import Iterator from pathlib import Path import humanize @@ -20,7 +23,7 @@ PLACEHOLDER_FILE = "file" -def touch_access_time(path, access_time: "float | None" = None, strict=True): +def touch_access_time(path, access_time: float | None = None, strict=True): """Update access time of file. Returns the new access time. @@ -65,8 +68,8 @@ def __call__(self, path: str) -> str: if self.hash_prefix is not None: # optionally make the name relative to a parent path # using the hash of parent path as a prefix, to flatten a bit - hash = Path(path).relative_to(Path(self.hash_prefix)) - return hash + _hash = Path(path).relative_to(Path(self.hash_prefix)) + return _hash else: raise NotImplementedError() @@ -99,8 +102,8 @@ def __call__(self, path): # the main change in this function is that, for same_name, it returns # the full path # change pin path of form / to + - hash = path.replace("/", "+", 1) - return hash + _hash = path.replace("/", "+", 1) + return _hash class PinsCache(SimpleCacheFileSystem): @@ -220,15 +223,15 @@ class CachePruner: meta_path = "data.txt" - def __init__(self, cache_dir: "str | Path"): + def __init__(self, cache_dir: str | Path): self.cache_dir = Path(cache_dir) - def versions(self) -> "iter[Path]": + def versions(self) -> Iterator[Path]: for p_version in self.cache_dir.glob("*/*"): if p_version.is_dir() and (p_version / self.meta_path).exists(): yield p_version - def should_prune_version(self, days, path: "str | Path"): + def should_prune_version(self, days, path: str | Path): path = Path(path) expiry_time_sec = days * 60 * 60 * 24 @@ -258,7 +261,7 @@ def prune(self, days=30): _log.info("Skipping cache deletion") -def delete_version(path: "str | Path"): +def delete_version(path: str | Path): path = Path(path) shutil.rmtree(str(path.absolute())) diff --git a/pins/constructors.py b/pins/constructors.py index 5db48450..6cb5d117 100644 --- a/pins/constructors.py +++ b/pins/constructors.py @@ -1,6 +1,9 @@ +from __future__ import annotations + import os import tempfile import warnings +from typing import Callable import fsspec @@ -8,8 +11,9 @@ from .cache import PinsAccessTimeCache, PinsCache, PinsRscCacheMapper, prefix_cache from .config import get_cache_dir, get_data_dir -# Kept here for backward-compatibility reasons. -board_deparse # Note that this is not a constructor, but a function to represent them. +# Kept here for backward-compatibility reasons +# Note that this is not a constructor, but a function to represent them. +board_deparse # pyright: ignore[reportUnusedExpression] class DEFAULT: @@ -25,10 +29,10 @@ def board( protocol: str, path: str = "", versioned: bool = True, - cache: "type[DEFAULT] | None" = DEFAULT, + cache: type[DEFAULT] | None = DEFAULT, allow_pickle_read=None, - storage_options: "dict | None" = None, - board_factory: "callable | type[BaseBoard] | None" = None, + storage_options: dict | None = None, + board_factory: Callable | type[BaseBoard] | None = None, ): """General function for constructing a pins board. 
diff --git a/pins/drivers.py b/pins/drivers.py index 49fb4388..aa961f53 100644 --- a/pins/drivers.py +++ b/pins/drivers.py @@ -1,5 +1,6 @@ +from collections.abc import Sequence from pathlib import Path -from typing import Any, Sequence +from typing import Any from pins._adaptors import _create_adaptor @@ -12,35 +13,27 @@ UNSAFE_TYPES = frozenset(["joblib"]) -REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib", "file"]) +REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib"]) -def load_path(meta, path_to_version): - # Check that only a single file name was given - fnames = [meta.file] if isinstance(meta.file, str) else meta.file - if len(fnames) > 1 and type in REQUIRES_SINGLE_FILE: - raise ValueError("Cannot load data when more than 1 file") - +def load_path(filename: str, path_to_version, pin_type=None): # file path creation ------------------------------------------------------ - - if type == "table": # noqa: E721 False Positive due to bug: https://github.com/rstudio/pins-python/issues/266 + if pin_type == "table": # this type contains an rds and csv files named data.{ext}, so we match # R pins behavior and hardcode the name - target_fname = "data.csv" - else: - target_fname = fnames[0] + filename = "data.csv" if path_to_version is not None: - path_to_file = f"{path_to_version}/{target_fname}" + path_to_file = f"{path_to_version}/{filename}" else: # BoardUrl doesn't have versions, and the file is the full url - path_to_file = target_fname + path_to_file = filename return path_to_file -def load_file(meta: Meta, fs, path_to_version): - return fs.open(load_path(meta, path_to_version)) +def load_file(filename: str, fs, path_to_version, pin_type): + return fs.open(load_path(filename, path_to_version, pin_type)) def load_data( @@ -71,7 +64,7 @@ def load_data( " * https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations" ) - with load_file(meta, fs, path_to_version) as f: + with load_file(meta.file, fs, path_to_version, meta.type) as f: if meta.type == "csv": import pandas as pd @@ -115,7 +108,7 @@ def load_data( elif meta.type == "rds": try: - import rdata + import rdata # pyright: ignore[reportMissingImports] return rdata.read_rds(f) except ModuleNotFoundError: @@ -126,7 +119,9 @@ def load_data( raise NotImplementedError(f"No driver for type {meta.type}") -def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequence[str]": +def save_data( + obj, fname, pin_type=None, apply_suffix: bool = True +) -> "str | Sequence[str]": # TODO: extensible saving with deferred importing # TODO: how to encode arguments to saving / loading drivers? # e.g. pandas index options @@ -137,41 +132,51 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen adaptor = _create_adaptor(obj) if apply_suffix: - if type == "file": + if pin_type == "file": suffix = "".join(Path(obj).suffixes) else: - suffix = f".{type}" + suffix = f".{pin_type}" else: suffix = "" - final_name = f"{fname}{suffix}" + if isinstance(fname, list): + final_name = fname + else: + final_name = f"{fname}{suffix}" - if type == "csv": + if pin_type == "csv": adaptor.write_csv(final_name) - elif type == "arrow": + elif pin_type == "arrow": # NOTE: R pins accepts the type arrow, and saves it as feather. # we allow reading this type, but raise an error for writing. adaptor.write_feather(final_name) - elif type == "feather": + elif pin_type == "feather": msg = ( 'Saving data as type "feather" no longer supported. Use type "arrow" instead.' 
         )
         raise NotImplementedError(msg)

-    elif type == "parquet":
+    elif pin_type == "parquet":
         adaptor.write_parquet(final_name)

-    elif type == "joblib":
+    elif pin_type == "joblib":
         adaptor.write_joblib(final_name)

-    elif type == "json":
+    elif pin_type == "json":
         adaptor.write_json(final_name)

-    elif type == "file":
+    elif pin_type == "file":
         import contextlib
         import shutil

+        if isinstance(obj, list):
+            for file, final in zip(obj, final_name):
+                with contextlib.suppress(shutil.SameFileError):
+                    shutil.copyfile(str(file), final)
+            return obj
         # ignore the case where the source is the same as the target
-        with contextlib.suppress(shutil.SameFileError):
-            shutil.copyfile(str(obj), final_name)
+        else:
+            with contextlib.suppress(shutil.SameFileError):
+                shutil.copyfile(str(obj), final_name)
+
     else:
-        raise NotImplementedError(f"Cannot save type: {type}")
+        raise NotImplementedError(f"Cannot save type: {pin_type}")

     return final_name
diff --git a/pins/meta.py b/pins/meta.py
index 0fab8444..e58555bd 100644
--- a/pins/meta.py
+++ b/pins/meta.py
@@ -1,8 +1,9 @@
 from __future__ import annotations

+from collections.abc import Mapping, Sequence
 from dataclasses import InitVar, asdict, dataclass, field, fields
 from pathlib import Path
-from typing import ClassVar, Mapping, Sequence
+from typing import Any, ClassVar

 import yaml

@@ -61,7 +62,7 @@ class Meta:
         A dictionary of additional metadata that may be specified by the user.
     local:
         A dictionary of additional metadata that may be added by the board, depending
-        on the backend used. E.g. RStudio Connect content id, url, etc..
+        on the backend used. E.g. Posit Connect content id, url, etc..

     """

@@ -104,7 +105,7 @@ def __getattr__(self, k):
         except KeyError:
             raise AttributeError(f"No metadata field not found: {k}")

-    def to_dict(self) -> Mapping:
+    def to_dict(self) -> dict[str, Any]:
         data = asdict(self)
         return data

@@ -245,7 +246,10 @@ def create(
             raise NotImplementedError("Cannot create from file object.")

         else:
-            raise NotImplementedError("TODO: creating meta from multiple files")
+            if isinstance(files, (list, tuple)):
+                file_name = [Path(f).name for f in files]
+                file_size = [Path(f).stat().st_size for f in files]
+                version = Version.from_files(files, created)

         return Meta(
             title=title,
diff --git a/pins/rsconnect/api.py b/pins/rsconnect/api.py
index 8b4b888c..23af7c70 100644
--- a/pins/rsconnect/api.py
+++ b/pins/rsconnect/api.py
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 import logging
 import os
 import tempfile
-from collections.abc import Mapping
+from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
 from functools import partial
 from io import IOBase
 from pathlib import Path
-from typing import Generic, Sequence, TypeVar
+from typing import Generic, Literal, TypeVar, overload
 from urllib.parse import urlencode

 import requests
@@ -130,14 +132,14 @@ def __init__(self, results: T, cursor: Mapping):

 class RsConnectApi:
-    api_key: "str | None"
-    server_url: "str"
+    api_key: str | None
+    server_url: str

     def __init__(
         self,
-        server_url: "str | None",
-        api_key: "str | None" = None,
-        session: "requests.Session | None" = None,
+        server_url: str | None,
+        api_key: str | None = None,
+        session: requests.Session | None = None,
     ):
         self.server_url = server_url
         self.api_key = api_key
@@ -182,7 +184,7 @@ def _get_headers(self):

         return {**d_key, **d_rsc}

-    def _validate_json_response(self, data: "dict | list"):
+    def _validate_json_response(self, data: dict | list):
         if isinstance(data, list):
             return

@@ -213,7 +215,15 @@ def query(self, route, method="GET", return_request=False, **kwargs):

         return self._raw_query(endpoint, method, return_request, **kwargs)

-    def _raw_query(self, url, method="GET", return_request=False, **kwargs):
+    @overload
+    def _raw_query(
+        self, url, method="GET", return_request: Literal[False] = False, **kwargs
+    ) -> dict | list: ...
+    @overload
+    def _raw_query(
+        self, url, method, return_request: Literal[True], **kwargs
+    ) -> requests.Response: ...
+    def _raw_query(self, url, method="GET", return_request: bool = False, **kwargs):
         if "headers" in kwargs:
             raise KeyError("cannot specify headers param in kwargs")

@@ -232,8 +242,9 @@ def _raw_query(self, url, method="GET", return_request=False, **kwargs):
             data = r.json()
             self._validate_json_response(data)
             return data
-        except requests.JSONDecodeError:
+        except requests.JSONDecodeError as err:
             r.raise_for_status()
+            raise err  # Fallback if somehow there was no HTTPError

     def walk_paginated_offsets(self, f_query, endpoint, method, params=None, **kwargs):
         if params is None:
@@ -257,23 +268,23 @@ def walk_paginated_offsets(self, f_query, endpoint, method, params=None, **kwargs):

     # users ----

-    def get_user(self, guid: str = None) -> User:
+    def get_user(self, guid: str | None = None) -> User:
         if guid is None:
             return User(self.query_v1("user"))

-        result = self.query_v1(f"user/{guid}")
+        result = self.query_v1(f"users/{guid}")
         return User(result)

     def get_users(
         self,
-        prefix: "str | None" = None,
-        user_role: "str | None" = None,
-        account_status: "str | None" = None,
-        page_number: "int | None" = None,
-        page_size: "int | None" = None,
-        asc_order: "bool | None" = None,
+        prefix: str | None = None,
+        user_role: str | None = None,
+        account_status: str | None = None,
+        page_number: int | None = None,
+        page_size: int | None = None,
+        asc_order: bool | None = None,
         walk_pages=True,
-    ) -> "Sequence[User] | Sequence[dict]":
+    ) -> Sequence[User] | Sequence[dict]:
         params = {k: v for k, v in locals().items() if k != "self" if v is not None}

         if walk_pages:
@@ -303,7 +314,7 @@ def post_content_item(

         return Content(result)

-    def post_content_item_deploy(self, guid: str, bundle_id: "str | None" = None):
+    def post_content_item_deploy(self, guid: str, bundle_id: str | None = None):
         json = {"bundle_id": bundle_id} if bundle_id is not None else {}
         return self.query_v1(f"content/{guid}/deploy", "POST", json=json)

@@ -345,9 +356,7 @@ def get_content_bundle(self, guid: str, id: int) -> Bundle:
         result = self.query_v1(f"content/{guid}/bundles/{id}")
         return Bundle(result)

-    def get_content_bundle_archive(
-        self, guid: str, id: str, f_obj: "str | IOBase"
-    ) -> None:
+    def get_content_bundle_archive(self, guid: str, id: str, f_obj: str | IOBase) -> None:
         r = self.query_v1(
             f"content/{guid}/bundles/{id}/download", stream=True, return_request=True
         )
@@ -399,7 +408,7 @@ def misc_ping(self):
         return self._raw_query(f"{self.server_url}/__ping__")

     def misc_get_content_bundle_file(
-        self, guid: str, id: str, fname: str, f_obj: "str | IOBase | None" = None
+        self, guid: str, id: str, fname: str, f_obj: str | IOBase | None = None
     ):
         if f_obj is None:
             f_obj = fname
@@ -431,7 +440,7 @@ class _HackyConnect(RsConnectApi):
     """Handles logging in to connect, rather than using an API key.

     This class allows you to create users and generate API keys on a fresh
-    RStudio Connect service.
+    Posit Connect service.
""" def login(self, user, password): diff --git a/pins/rsconnect/fs.py b/pins/rsconnect/fs.py index 853443a2..4855892c 100644 --- a/pins/rsconnect/fs.py +++ b/pins/rsconnect/fs.py @@ -1,6 +1,9 @@ +from __future__ import annotations + +from collections.abc import Sequence from dataclasses import asdict, dataclass, field, fields from pathlib import Path -from typing import Sequence +from typing import ClassVar from fsspec import AbstractFileSystem @@ -33,7 +36,7 @@ def _not_impl_args_kwargs(args, kwargs): @dataclass class PinBundleManifestMetadata: appmode: str = "static" - primary_rmd: "str|None" = None + primary_rmd: str | None = None primary_html: str = "index.html" content_category: str = "pin" has_parameters: bool = False @@ -59,7 +62,7 @@ def from_directory(cls, dir_name, recursive: bool = True): return cls(files=flat_rel_files) @classmethod - def add_manifest_to_directory(cls, dir_name: "str | Path", **kwargs) -> None: + def add_manifest_to_directory(cls, dir_name: str | Path, **kwargs) -> None: import json # TODO(question): R lib uses RSCONNECT_TAR env variable @@ -105,7 +108,7 @@ class BundleFilePath(BundlePath): class RsConnectFs(AbstractFileSystem): - protocol: str = "rsc" + protocol: ClassVar[str | tuple[str, ...]] = "rsc" def __init__(self, server_url, **kwargs): if isinstance(server_url, RsConnectApi): @@ -116,8 +119,8 @@ def __init__(self, server_url, **kwargs): self._user_name_cache = {} self._content_name_cache = {} - def ls(self, path, details=False, **kwargs) -> "Sequence[BaseEntity] | Sequence[str]": - """List contents of Rstudio Connect Server. + def ls(self, path, details=False, **kwargs) -> Sequence[BaseEntity] | Sequence[str]: + """List contents of Posit Connect Server. Parameters ---------- @@ -163,7 +166,7 @@ def put( cls_manifest=PinBundleManifest, **kwargs, ) -> None: - """Put a bundle onto Rstudio Connect. + """Put a bundle onto Posit Connect. Parameters ---------- @@ -224,7 +227,7 @@ def put( return f"{rpath}/{bundle['id']}" def open(self, path: str, mode: str = "rb", *args, **kwargs): - """Open a file inside an RStudio Connect bundle.""" + """Open a file inside an Posit Connect bundle.""" if mode != "rb": raise NotImplementedError() @@ -253,7 +256,7 @@ def open(self, path: str, mode: str = "rb", *args, **kwargs): return f def get(self, rpath, lpath, recursive=False, *args, **kwargs) -> None: - """Fetch a bundle or file from RStudio Connect.""" + """Fetch a bundle or file from Posit Connect.""" parsed = self.parse_path(rpath) if recursive: @@ -303,7 +306,7 @@ def mkdir( # TODO: could implement and call makedirs, but seems overkill self.api.post_content_item(parsed.content, access_type, **kwargs) - def info(self, path, **kwargs) -> "User | Content | Bundle": + def info(self, path, **kwargs) -> User | Content | Bundle: # TODO: source of fsspec info uses self._parent to check cache? 
# S3 encodes refresh (for local cache) and version_id arguments diff --git a/pins/tests/test_boards.py b/pins/tests/test_boards.py index a693a827..4afeecc0 100644 --- a/pins/tests/test_boards.py +++ b/pins/tests/test_boards.py @@ -136,6 +136,40 @@ def test_board_pin_write_file_raises_error(board, tmp_path): board.pin_write(path, "cool_pin", type="file") +@pytest.mark.parametrize("force_identical_write", [True, False]) +def test_board_pin_write_force_identical_write_pincount(board, force_identical_write): + df = pd.DataFrame({"x": [1, 2, 3]}) + + # 1min ago to avoid name collision + one_min_ago = datetime.now() - timedelta(minutes=1) + board.pin_write(df, "cool_pin", type="csv", created=one_min_ago) + board.pin_write( + df, "cool_pin", type="csv", force_identical_write=force_identical_write + ) + versions = board.pin_versions("cool_pin") + if force_identical_write: + assert len(versions) == 2 + else: + assert len(versions) == 1 + + +def test_board_pin_write_force_identical_write_msg( + board, capfd: pytest.CaptureFixture[str] +): + df = pd.DataFrame({"x": [1, 2, 3]}) + + # 1min ago to avoid name collision + one_min_ago = datetime.now() - timedelta(minutes=1) + board.pin_write(df, "cool_pin", type="csv", created=one_min_ago) + board.pin_write(df, "cool_pin", type="csv") + versions = board.pin_versions("cool_pin") + + _, err = capfd.readouterr() + msg = 'The hash of pin "cool_pin" has not changed. Your pin will not be stored.' + assert msg in err + assert len(versions) == 1 + + def test_board_pin_download(board_with_cache, tmp_path): # create and save data df = pd.DataFrame({"x": [1, 2, 3]}) @@ -232,6 +266,26 @@ def test_board_pin_upload_path_list(board_with_cache, tmp_path): (pin_path,) = board_with_cache.pin_download("cool_pin") +def test_board_pin_download_filename_multifile(board_with_cache, tmp_path): + # create and save data + df = pd.DataFrame({"x": [1, 2, 3]}) + + path1, path2 = tmp_path / "data1.csv", tmp_path / "data2.csv" + df.to_csv(path1, index=False) + df.to_csv(path2, index=False) + + meta = board_with_cache.pin_upload([path1, path2], "cool_pin") + + assert meta.type == "file" + assert meta.file == ["data1.csv", "data2.csv"] + + pin_path = board_with_cache.pin_download("cool_pin") + + assert len(pin_path) == 2 + assert Path(pin_path[0]).name == "data1.csv" + assert Path(pin_path[1]).name == "data2.csv" + + def test_board_pin_write_rsc_index_html(board, tmp_path: Path, snapshot): if board.fs.protocol != "rsc": pytest.skip() @@ -309,6 +363,32 @@ def test_board_pin_read_insecure_succeed_board_flag(board): # pin_write with unversioned boards =========================================== +@pytest.mark.parametrize("versioned", [None, False]) +def test_board_unversioned_pin_write_unversioned_force_identical_write( + versioned, board_unversioned +): + # 1min ago to avoid name collision + one_min_ago = datetime.now() - timedelta(minutes=1) + board_unversioned.pin_write( + {"a": 1}, + "test_pin", + type="json", + versioned=versioned, + created=one_min_ago, + force_identical_write=True, + ) + board_unversioned.pin_write( + {"a": 2}, + "test_pin", + type="json", + versioned=versioned, + force_identical_write=True, + ) + + assert len(board_unversioned.pin_versions("test_pin")) == 1 + assert board_unversioned.pin_read("test_pin") == {"a": 2} + + @pytest.mark.parametrize("versioned", [None, False]) def test_board_unversioned_pin_write_unversioned(versioned, board_unversioned): board_unversioned.pin_write({"a": 1}, "test_pin", type="json", versioned=versioned) @@ -346,9 +426,14 @@ def 
pin_name(): @pytest.fixture def pin_del(board, df, pin_name): - meta_old = board.pin_write(df, pin_name, type="csv", title="some title") - sleep(1) - meta_new = board.pin_write(df, pin_name, type="csv", title="some title") + # 1min ago to avoid name collision + one_min_ago = datetime.now() - timedelta(minutes=1) + meta_old = board.pin_write( + df, pin_name, type="csv", title="some title", created=one_min_ago + ) + meta_new = board.pin_write( + df, pin_name, type="csv", title="some title", force_identical_write=True + ) assert len(board.pin_versions(pin_name)) == 2 assert meta_old.version.version != meta_new.version.version @@ -363,8 +448,22 @@ def pin_prune(board, df, pin_name): two_days_ago = today - timedelta(days=2, minutes=1) board.pin_write(df, pin_name, type="csv", title="some title", created=today) - board.pin_write(df, pin_name, type="csv", title="some title", created=day_ago) - board.pin_write(df, pin_name, type="csv", title="some title", created=two_days_ago) + board.pin_write( + df, + pin_name, + type="csv", + title="some title", + created=day_ago, + force_identical_write=True, + ) + board.pin_write( + df, + pin_name, + type="csv", + title="some title", + created=two_days_ago, + force_identical_write=True, + ) versions = board.pin_versions(pin_name, as_df=False) assert len(versions) == 3 @@ -423,7 +522,7 @@ def test_board_pin_versions_prune_n(board, pin_prune, pin_name, n): @pytest.mark.parametrize("days", [1, 2]) def test_board_pin_versions_prune_days(board, pin_prune, pin_name, days): - # RStudio cannot handle days, since it involves pulling metadata + # Posit cannot handle days, since it involves pulling metadata if board.fs.protocol == "rsc": with pytest.raises(NotImplementedError): board.pin_versions_prune(pin_name, days=days) @@ -437,6 +536,33 @@ def test_board_pin_versions_prune_days(board, pin_prune, pin_name, days): assert len(new_versions) == days +def test_board_pin_versions_prune_days_protect_most_recent(board, pin_name): + """To address https://github.com/rstudio/pins-python/issues/297""" + # Posit cannot handle days, since it involves pulling metadata + if board.fs.protocol == "rsc": + with pytest.raises(NotImplementedError): + board.pin_versions_prune(pin_name, days=5) + return + + today = datetime.now() + two_days_ago = today - timedelta(days=2, minutes=1) + three_days_ago = today - timedelta(days=3, minutes=1) + + # Note, we are _not_ going to write a pin for today, otherwise we wouldn't be + # properly testing the protection of the most recent version - it would be trivially + # protected because it would always lie in the last day / fraction of a day. 
+ board.pin_write({"a": 1}, pin_name, type="json", created=two_days_ago) + assert len(board.pin_versions(pin_name, as_df=False)) == 1 + board.pin_write({"a": 2}, pin_name, type="json", created=three_days_ago) + + # prune the versions, keeping only the most recent + board.pin_versions_prune(pin_name, days=1) + + # check that only the most recent version remains + versions = board.pin_versions(pin_name, as_df=False) + assert len(versions) == 1 + + # pin_search ================================================================== @@ -502,7 +628,7 @@ def test_board_base_pin_meta_cache_touch(tmp_path: Path, df): assert orig_access < new_access -# RStudio Connect specific ==================================================== +# Posit Connect specific ==================================================== # import fixture that builds / tearsdown user "susan" from pins.tests.test_rsconnect_api import ( # noqa @@ -546,7 +672,9 @@ def test_board_pin_search_admin_user(df, board_short, fs_admin): # noqa @pytest.mark.fs_rsc def test_board_rsc_pin_write_title_update(df, board_short): board_short.pin_write(df, "susan/some_df", type="csv", title="title a") - board_short.pin_write(df, "susan/some_df", type="csv", title="title b") + board_short.pin_write( + df, "susan/some_df", type="csv", title="title b", force_identical_write=True + ) content = board_short.fs.info("susan/some_df") assert content["title"] == "title b" diff --git a/pins/tests/test_drivers.py b/pins/tests/test_drivers.py index 230f0e80..351550af 100644 --- a/pins/tests/test_drivers.py +++ b/pins/tests/test_drivers.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from pathlib import Path import fsspec @@ -5,7 +7,7 @@ import pytest from pins.config import PINS_ENV_INSECURE_READ -from pins.drivers import default_title, load_data, save_data +from pins.drivers import default_title, load_data, load_path, save_data from pins.errors import PinsInsecureReadError from pins.meta import MetaRaw from pins.tests.helpers import rm_env @@ -159,3 +161,26 @@ def test_driver_apply_suffix_false(tmp_path: Path): res_fname = save_data(df, p_obj, type_, apply_suffix=False) assert Path(res_fname).name == "some_df" + + +class TestLoadFile: + def test_str_file(self): + class _MockMetaStrFile: + file: str = "a" + type: str = "csv" + + assert load_path(_MockMetaStrFile().file, None, _MockMetaStrFile().type) == "a" + + def test_table(self): + class _MockMetaTable: + file: str = "a" + type: str = "table" + + assert load_path(_MockMetaTable().file, None, _MockMetaTable().type) == "data.csv" + + def test_version(self): + class _MockMetaTable: + file: str = "a" + type: str = "csv" + + assert load_path(_MockMetaTable().file, "v1", _MockMetaTable().type) == "v1/a" diff --git a/pins/tests/test_rsconnect_api.py b/pins/tests/test_rsconnect_api.py index aa754e69..ce3e7c2c 100644 --- a/pins/tests/test_rsconnect_api.py +++ b/pins/tests/test_rsconnect_api.py @@ -5,6 +5,7 @@ from requests.exceptions import HTTPError from pins.rsconnect.api import ( + RsConnectApi, RsConnectApiMissingContentError, RsConnectApiRequestError, ) @@ -69,6 +70,13 @@ def test_rsconnect_api_get_user(rsc_admin): assert me["username"] == "admin" +def test_rsconnect_api_get_user_guid(rsc_admin: RsConnectApi): + guid = rsc_admin.get_user().get_id() + me = rsc_admin.get_user(guid=guid) + assert me.get_id() == guid + assert me["username"] == "admin" + + def test_rsconnect_get_content_empty(rsc_short): me = rsc_short.get_user() content = rsc_short.get_content(owner_guid=me["guid"]) @@ -220,7 +228,10 @@ def 
test_rsconnect_api_get_content_bundle_archive(rsc_short):
     bundle = create_content_bundle(rsc_short, content["guid"])

     # create temporary directories for content source and dest to download to ----
-    with tempfile.TemporaryDirectory() as tmp_src, tempfile.TemporaryDirectory() as tmp_dst:
+    with (
+        tempfile.TemporaryDirectory() as tmp_src,
+        tempfile.TemporaryDirectory() as tmp_dst,
+    ):
         create_content_bundle(rsc_short, content["guid"], tmp_src)

         # download .tar.gz archive to a temporary file and unzip ----
diff --git a/pins/utils.py b/pins/utils.py
index a229dc25..7525c148 100644
--- a/pins/utils.py
+++ b/pins/utils.py
@@ -22,10 +22,10 @@ def warn_deprecated(msg):

 def hash_name(path, same_name):
     if same_name:
-        hash = os.path.basename(path)
+        _hash = os.path.basename(path)
     else:
-        hash = hashlib.sha256(path.encode()).hexdigest()
-    return hash
+        _hash = hashlib.sha256(path.encode()).hexdigest()
+    return _hash

 class ExtendMethodDoc:
diff --git a/pins/versions.py b/pins/versions.py
index 8b55fdb6..c05babcf 100644
--- a/pins/versions.py
+++ b/pins/versions.py
@@ -1,9 +1,10 @@
 from __future__ import annotations

 import logging
+from collections.abc import Mapping, Sequence
 from dataclasses import asdict, dataclass
 from datetime import datetime
-from typing import Mapping, Sequence
+from pathlib import Path

 from xxhash import xxh64

@@ -56,9 +57,8 @@ def render_created(self):
 def hash_file(f: IOBase, block_size: int = -1) -> str:
     # TODO: what kind of things implement the "buffer API"?
     hasher = xxh64()
     buf = f.read(block_size)
-
     while len(buf) > 0:
         hasher.update(buf)
         buf = f.read(block_size)
@@ -99,14 +98,18 @@ def from_files(
     ) -> Version:
         hashes = []
         for f in files:
-            hash_ = cls.hash_file(open(f, "rb") if isinstance(f, str) else f)
+            hash_ = cls.hash_file(open(f, "rb") if isinstance(f, (str, Path)) else f)
             hashes.append(hash_)

         if created is None:
             created = datetime.now()

         if len(hashes) > 1:
-            raise NotImplementedError("Only 1 file may be currently be hashed")
+            # Combine the hashes into a single string
+            combined_hashes = "".join(hashes)
+
+            # Create an xxh64 hash of the combined string
+            hashes = [xxh64(combined_hashes).hexdigest()]

         return cls(created, hashes[0])
diff --git a/pyproject.toml b/pyproject.toml
index 79b25f14..4f1fabcd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,14 +10,14 @@ maintainers = [{ name = "Isabel Zimmerman", email = "isabel.zimmerman@posit.co" }]
 keywords = ["data", "tidyverse"]
 classifiers = [
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "License :: OSI Approved :: MIT License",
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dynamic = ["version"]
 dependencies = [
     "appdirs<2", # Using appdirs rather than platformdirs is deliberate, see https://github.com/rstudio/pins-python/pull/239
@@ -61,6 +61,7 @@ test = [
     "pytest-dotenv",
     "pytest-parallel",
     "s3fs",
+    "rdata",
 ]

 [build-system]
@@ -100,21 +101,14 @@ pythonPlatform = "Linux"

 # Tracking compliance with these rules at https://github.com/rstudio/pins-python/issues/272
 reportArgumentType = false
-reportAssignmentType = false
 reportAttributeAccessIssue = false
 reportCallIssue = false
-reportGeneralTypeIssues = false
 reportIncompatibleMethodOverride = false
-reportIncompatibleVariableOverride = false
-reportIndexIssue = false
-reportMissingImports = false
reportMissingTypeStubs = false -reportOptionalIterable = false reportOptionalMemberAccess = false reportOptionalSubscript = false reportPossiblyUnboundVariable = false reportReturnType = false -reportUnusedExpression = false [tool.ruff] line-length = 90 @@ -128,7 +122,9 @@ select = [ "I", # Import sorting "UP", # Upgrade to latest supported Python syntax "W", # Style + "A", # Don't shadow built-ins ] ignore = [ "E501", # Line too long + "A002", # The pins interface includes builtin names in args, e.g. hash, id, etc. ] diff --git a/requirements/dev.txt b/requirements/dev.txt index 54a9bcb9..effdf5a3 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -238,6 +238,8 @@ numpy==2.0.0 # fastparquet # pandas # pyarrow + # rdata + # xarray oauthlib==3.2.2 # via requests-oauthlib packaging==24.1 @@ -247,10 +249,13 @@ packaging==24.1 # ipykernel # pytest # pytest-cases + # xarray pandas==2.2.2 # via # pins (pyproject.toml) # fastparquet + # rdata + # xarray parso==0.8.4 # via jedi pexpect==4.9.0 @@ -345,6 +350,8 @@ pyzmq==26.0.3 # jupyter-client quartodoc==0.7.5 # via pins (pyproject.toml) +rdata==0.11.2 + # via pins (pyproject.toml) referencing==0.35.1 # via # jsonschema @@ -412,6 +419,7 @@ typing-extensions==4.12.2 # pydantic # pydantic-core # quartodoc + # rdata tzdata==2024.1 # via pandas urllib3==2.2.2 @@ -428,6 +436,8 @@ wheel==0.43.0 # via pip-tools wrapt==1.16.0 # via aiobotocore +xarray==2024.11.0 + # via rdata xxhash==3.4.1 # via pins (pyproject.toml) yarl==1.9.4
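
Taken together, the multifile changes above (in boards.py, drivers.py, meta.py, and versions.py) add up to the following user-facing flow. A minimal sketch, illustrative only and assuming a throwaway local board:

```python
# Illustrative sketch of the new multi-file pin_upload / pin_download flow.
import tempfile
from pathlib import Path

import pins

tmp = Path(tempfile.mkdtemp())
for name in ["data1.csv", "data2.csv"]:
    (tmp / name).write_text("x\n1\n")

board = pins.board_temp()
meta = board.pin_upload([tmp / "data1.csv", tmp / "data2.csv"], name="my_files")

# Meta.file is now a list, and the version hash is an xxh64 digest of the
# concatenated per-file hashes (see the Version.from_files change above).
assert meta.file == ["data1.csv", "data2.csv"]

# pin_download returns one local path per uploaded file.
downloaded = board.pin_download("my_files")
assert [Path(f).name for f in downloaded] == ["data1.csv", "data2.csv"]
```
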