From aa255a110204d3e31681b3054c38dccf19e5256e Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Sat, 3 Jul 2021 21:53:55 +0300 Subject: [PATCH] output: optimize obj collection (#6277) * output: str only once Calling `__str__` is slow, and when we are dealing with a large dataset the repeated calls add up to a very substantial amount of time. For example, locally, for 700K objects, it took 20 seconds longer. Related to #6276 * output: don't use os.path.join `os.path.join` takes around 2 seconds for 700K objects. * output: use fs.sep * output: remove unused sep --- dvc/fs/base.py | 2 ++ dvc/fs/dvc.py | 2 ++ dvc/fs/git.py | 2 ++ dvc/fs/local.py | 2 ++ dvc/fs/repo.py | 2 ++ dvc/output.py | 4 +--- 6 files changed, 11 insertions(+), 3 deletions(-) diff --git a/dvc/fs/base.py b/dvc/fs/base.py index d046178364..e7c757bc53 100644 --- a/dvc/fs/base.py +++ b/dvc/fs/base.py @@ -33,6 +33,8 @@ class RemoteMissingDepsError(DvcException): class BaseFileSystem: + sep = "/" + scheme = "base" REQUIRES: ClassVar[Dict[str, str]] = {} PATH_CLS = URLInfo # type: Any diff --git a/dvc/fs/dvc.py b/dvc/fs/dvc.py index b6a651edcf..64a864f338 100644 --- a/dvc/fs/dvc.py +++ b/dvc/fs/dvc.py @@ -23,6 +23,8 @@ class DvcFileSystem(BaseFileSystem): # pylint:disable=abstract-method repo: DVC repo. 
""" + sep = os.sep + scheme = "local" PARAM_CHECKSUM = "md5" diff --git a/dvc/fs/git.py b/dvc/fs/git.py index cdb5d6d5d7..c020281366 100644 --- a/dvc/fs/git.py +++ b/dvc/fs/git.py @@ -9,6 +9,8 @@ class GitFileSystem(BaseFileSystem): # pylint:disable=abstract-method """Proxies the repo file access methods to Git objects""" + sep = os.sep + scheme = "local" def __init__(self, root_dir, trie): diff --git a/dvc/fs/local.py b/dvc/fs/local.py index d614646e18..ed0bc958f7 100644 --- a/dvc/fs/local.py +++ b/dvc/fs/local.py @@ -14,6 +14,8 @@ class LocalFileSystem(BaseFileSystem): + sep = os.sep + scheme = Schemes.LOCAL PATH_CLS = PathInfo PARAM_CHECKSUM = "md5" diff --git a/dvc/fs/repo.py b/dvc/fs/repo.py index 69e3f456da..1f9a198649 100644 --- a/dvc/fs/repo.py +++ b/dvc/fs/repo.py @@ -31,6 +31,8 @@ class RepoFileSystem(BaseFileSystem): # pylint:disable=abstract-method kwargs: Additional keyword arguments passed to the `DvcFileSystem()`. """ + sep = os.sep + scheme = "local" PARAM_CHECKSUM = "md5" diff --git a/dvc/output.py b/dvc/output.py index ff0c50a550..5b812faa59 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -271,8 +271,6 @@ class Output: IsStageFileError = OutputIsStageFileError # type: Type[DvcException] IsIgnoredError = OutputIsIgnoredError # type: Type[DvcException] - sep = "/" - def __init__( self, stage, @@ -887,7 +885,7 @@ def _set_obj_names(self, obj): obj.name = str(self) if isinstance(obj, Tree): for key, entry_obj in obj: - entry_obj.name = os.path.join(str(self), *key) + entry_obj.name = self.fs.sep.join([obj.name, *key]) def get_used_external( self, **kwargs