From 7291c29b5bd6aee8b63721579f3e89bbcb95a360 Mon Sep 17 00:00:00 2001 From: Min RK Date: Fri, 9 Feb 2018 17:56:48 +0100 Subject: [PATCH 1/3] add assemble_subset - get_assemble_files() returns list of files needed for assembly - if buildpack.assemble_with_subset is set, only load assemble_files prior to running assembly scripts. Load the rest of the repo afterward conda opts in to this, but currently I think this works for everything *except* requirements.txt --- repo2docker/buildpacks/base.py | 56 +++++++++++++++++++----- repo2docker/buildpacks/conda/__init__.py | 16 +++++++ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/repo2docker/buildpacks/base.py b/repo2docker/buildpacks/base.py index 8d5991af5..60e0bd1c4 100644 --- a/repo2docker/buildpacks/base.py +++ b/repo2docker/buildpacks/base.py @@ -93,10 +93,15 @@ {{sd}} {% endfor %} -# Copy and chown stuff. This doubles the size of the repo, because -# you can't actually copy as USER, only as root! Thanks, Docker! +# FIXME: use COPY --chown with docker 17.09 to avoid copy+chown in two steps +{% if assemble_from_subset -%} +{% for f in assemble_files %} +COPY src/{{ f }} ${HOME}/{{ f }} +{% endfor %} +{% else -%} USER root COPY src/ ${HOME} +{% endif -%} RUN chown -R ${NB_USER}:${NB_USER} ${HOME} {% if env -%} @@ -113,6 +118,13 @@ {{ sd }} {% endfor %} +{% if assemble_from_subset -%} +# Load the rest of the repo after assembling the environment +USER root +COPY src/ ${HOME} +RUN chown -R ${NB_USER}:${NB_USER} ${HOME} +{% endif -%} + # Container image Labels! # Put these at the end, since we don't want to rebuild everything # when these change! Did I mention I hate Dockerfile cache semantics? @@ -301,6 +313,27 @@ def get_build_scripts(self): """ return [] + def get_assemble_files(self): + """ + Ordered list of files required to run assemble scripts + + This should be the subset of files in the repository + that are needed to run the assembly scripts. + + If the scripts can be run with a subset of files, + then only these files will be present when the scripts run + and the rest of the repository will be loaded after + running the scripts (for better caching). + Otherwise, the entire repository will be present. + + Only used if assemble_from_subset=True, + which is not the default. + """ + + # whether I can be assembled with a subset of files + # change in subclasses that are sure that they can do this + assemble_from_subset = False + def get_assemble_scripts(self): """ Ordered list of shell script snippets to build the repo into the image. @@ -314,14 +347,6 @@ def get_assemble_scripts(self): the scripts that actually build the repository into the container image. - If this needs to be dynamically determined (based on the presence - or absence of certain files, for example), you can create any - method and decorate it with `traitlets.default('assemble_scripts)` - and the return value of this method is used as the value of - assemble_scripts. You can expect that the script is running in - the current directory of the repository being built when doing - dynamic detection. - You can use environment variable substitutions in both the username and the execution script. """ @@ -396,8 +421,10 @@ def render(self): build_env=self.get_build_env(), env=self.get_env(), labels=self.get_labels(), - build_script_directives=build_script_directives, assemble_script_directives=assemble_script_directives, + assemble_files=self.get_assemble_files(), + assemble_from_subset=self.assemble_from_subset, + build_script_directives=build_script_directives, build_script_files=self.get_build_script_files(), base_packages=sorted(self.get_base_packages()), post_build_scripts=self.get_post_build_scripts(), @@ -489,6 +516,13 @@ def get_env(self): def detect(self): return True + def get_assemble_files(self): + apt_txt = self.binder_path('apt.txt') + files = [] + if os.path.exists(apt_txt): + files.append(apt_txt) + return files + def get_assemble_scripts(self): assemble_scripts = [] try: diff --git a/repo2docker/buildpacks/conda/__init__.py b/repo2docker/buildpacks/conda/__init__.py index 38ac92ff6..aebad078e 100644 --- a/repo2docker/buildpacks/conda/__init__.py +++ b/repo2docker/buildpacks/conda/__init__.py @@ -19,6 +19,10 @@ class CondaBuildPack(BaseImage): Uses miniconda since it is more lightweight than Anaconda. """ + + # conda envs can be installed with a subset of files + assemble_with_subset = True + def get_build_env(self): """Return environment variables to be set. @@ -169,6 +173,18 @@ def py2(self): """Am I building a Python 2 kernel environment?""" return self.python_version and self.python_version.split('.')[0] == '2' + def get_assemble_files(self): + """Specify that assembly only requires environment.yml + + enables caching assembly result even when + repo contents change + """ + assemble_files = super().get_assemble_files() + environment_yml = self.binder_path('environment.yml') + if os.path.exists(environment_yml): + assemble_files.append(environment_yml) + return assemble_files + def get_assemble_scripts(self): """Return series of build-steps specific to this source repository. """ From 25e3cb46eee0f9b59413e58cfb6e26c991b95e5c Mon Sep 17 00:00:00 2001 From: Min RK Date: Tue, 18 Sep 2018 14:34:27 +0200 Subject: [PATCH 2/3] set assemble_from_subset on R, Julia, Docker R and Docker can't assemble from the subset because they can refer to local files. Julia can, though. --- repo2docker/buildpacks/docker.py | 2 ++ repo2docker/buildpacks/julia.py | 12 ++++++++++++ repo2docker/buildpacks/r.py | 5 +++++ 3 files changed, 19 insertions(+) diff --git a/repo2docker/buildpacks/docker.py b/repo2docker/buildpacks/docker.py index 8f3aefa06..c32e2f53f 100644 --- a/repo2docker/buildpacks/docker.py +++ b/repo2docker/buildpacks/docker.py @@ -9,6 +9,8 @@ class DockerBuildPack(BuildPack): """Docker BuildPack""" dockerfile = "Dockerfile" + assemble_from_subset = False + def detect(self): """Check if current repo should be built with the Docker BuildPack""" return os.path.exists(self.binder_path('Dockerfile')) diff --git a/repo2docker/buildpacks/julia.py b/repo2docker/buildpacks/julia.py index 477d50c76..06cbaa4cf 100644 --- a/repo2docker/buildpacks/julia.py +++ b/repo2docker/buildpacks/julia.py @@ -12,6 +12,10 @@ class JuliaBuildPack(CondaBuildPack): See https://github.com/JuliaPy/PyCall.jl/issues/410 """ + + # REQUIRE can't refer to local files, can it? + assemble_from_subset = True + def get_build_env(self): """Get additional environment settings for Julia and Jupyter @@ -86,6 +90,14 @@ def get_build_scripts(self): ) ] + def get_assemble_files(self): + """Add environment files required for assembly step""" + assemble_files = super().get_assemble_files() + require = self.binder_path('REQUIRE') + if os.path.exists(require): + assemble_files.append(require) + return assemble_files + def get_assemble_scripts(self): """ Return series of build-steps specific to "this" Julia repository diff --git a/repo2docker/buildpacks/r.py b/repo2docker/buildpacks/r.py index 57939bbd2..e01a01235 100644 --- a/repo2docker/buildpacks/r.py +++ b/repo2docker/buildpacks/r.py @@ -25,6 +25,11 @@ class RBuildPack(PythonBuildPack): The `r-base` package from Ubuntu apt repositories is used to install R itself, rather than any of the methods from https://cran.r-project.org/. """ + + # cannot assemble from subset because INSTALL.r could + # refer to local files (right?) + assemble_from_subset = False + @property def runtime(self): """ From 11e009394930f779ba5163fd94bf425603e027e8 Mon Sep 17 00:00:00 2001 From: Min RK Date: Tue, 18 Sep 2018 14:42:58 +0200 Subject: [PATCH 3/3] check requirements.txt for local references allows assemble_from_subset when requirements.txt doesn't contain any local references --- repo2docker/buildpacks/python/__init__.py | 56 +++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/repo2docker/buildpacks/python/__init__.py b/repo2docker/buildpacks/python/__init__.py index fd81560cd..22c4d5170 100644 --- a/repo2docker/buildpacks/python/__init__.py +++ b/repo2docker/buildpacks/python/__init__.py @@ -34,6 +34,62 @@ def python_version(self): self._python_version = py_version return self._python_version + def get_assemble_files(self): + assemble_files = super().get_assemble_files() + for name in ('requirements.txt', 'requirements3.txt'): + requirements_txt = self.binder_path(name) + if os.path.exists(requirements_txt): + assemble_files.append(requirements_txt) + return assemble_files + + def _is_local_requirement(self, line): + """Return whether a line in a requirements.txt file references a local file""" + # trim comments and skip empty lines + line = line.split('#', 1)[0].strip() + if not line: + return False + if line.startswith(('-r', '-c')): + # local -r or -c references break isolation + return True + # strip off `-e, etc.` + if line.startswith('-'): + line = line.split(None, 1)[1] + if 'file://' in line: + # file references break isolation + return True + if '://' in line: + # handle git://../local/file + path = line.split('://', 1)[1] + else: + path = line + if path.startswith('.'): + # references a local file + return True + return False + + @property + def assemble_from_subset(self): + """Peek in requirements.txt to determine if we can assemble from only env files + + If there are any local references, e.g. `-e .`, + stage the whole repo prior to installation. + """ + if not os.path.exists('binder') and os.path.exists('setup.py'): + # can't install from subset if we're using setup.py + return False + for name in ('requirements.txt', 'requirements3.txt'): + requirements_txt = self.binder_path(name) + if not os.path.exists(requirements_txt): + continue + with open(requirements_txt) as f: + for line in f: + if self._is_local_requirement(line): + return False + + # didn't find any local references, + # allow assembly from subset + return True + def get_assemble_scripts(self): """Return series of build-steps specific to this repository. """