Avoid capturing Snapshots for previously digested codegen outputs (pantsbuild#7241)

### Problem

As described in pantsbuild#7229, re-capturing `Snapshots` on noop runs in `SimpleCodegenTask` caused a performance regression for larger codegen use cases.

### Solution

Remove features from the Python-exposed `Snapshot` API that would prevent a `Snapshot` from being roundtripped via a `Digest` (namely, preservation of canonical paths and of literal ordering; i.e. pantsbuild#5802), add support for optimistically loading a `Snapshot` from a `Digest`, and then reuse code to dump/load a `Digest` for the codegen directories so that `Snapshot` capture is skipped whenever the `Digest` has already been stored.

### Result

Runtimes for a very large codegen noop use case were reduced from `~15.2` seconds to `~3.05` seconds. Fixes pantsbuild#7229, and fixes pantsbuild#5802.
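
A rough sketch of the mechanism follows; it is illustrative only (the function name `capture_codegen_snapshot`, the `scheduler`/`output_dir` wiring, and the `('**',)` glob are assumptions, not the exact `SimpleCodegenTask` code). The idea is that a task dumps the `Digest` of each codegen output directory next to that directory, and later runs load it back and pass it as a `digest_hint` so the engine can avoid re-walking and re-fingerprinting unchanged outputs.

```python
# Hedged sketch of the new flow; `scheduler` is a SchedulerSession and
# `output_dir` is a codegen output directory (both illustrative).
from pants.engine.fs import Digest, PathGlobs, PathGlobsAndRoot

def capture_codegen_snapshot(scheduler, output_dir):
  # Optimistically reuse a Digest dumped by a previous run, if one exists.
  digest_hint = Digest.load(output_dir)
  globs_and_root = PathGlobsAndRoot(
    PathGlobs(('**',)),   # capture everything under the output directory
    output_dir,
    digest_hint=digest_hint,
  )
  snapshot, = scheduler.capture_snapshots((globs_and_root,))
  # Persist the Digest adjacent to the directory so the next noop run can
  # skip the filesystem capture entirely.
  snapshot.directory_digest.dump(output_dir)
  return snapshot
```

When the hint is present and its content is already stored, the engine can build the `Snapshot` directly from the `Digest` rather than re-scanning the directory; the first (non-noop) run still performs the capture and dumps the `Digest` for subsequent runs.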
Stu Hood authored Feb 15, 2019
1 parent 594f91f commit fedc91c

Showing 32 changed files with 461 additions and 357 deletions.
4 changes: 3 additions & 1 deletion examples/src/wire/org/pantsbuild/example/element/BUILD
@@ -3,10 +3,12 @@

java_wire_library(
sources=[
'elements.proto', # Order matters here.
# NB: Order matters for these two paths, so we set `ordered_sources=True` below.
'elements.proto',
'compound.proto',
],
dependencies=[
'examples/src/wire/org/pantsbuild/example/temperature',
],
ordered_sources=True,
)
@@ -32,6 +32,7 @@ def __init__(self,
registry_class=None,
enum_options=None,
no_options=None,
ordered_sources=None,
**kwargs):
"""
:param string service_writer: the name of the class to pass as the --service_writer option to
@@ -43,6 +44,9 @@
doubt, specify com.squareup.wire.SimpleServiceWriter
:param list enum_options: list of enums to pass to as the --enum-enum_options option, # optional
:param boolean no_options: boolean that determines if --no_options flag is passed
:param boolean ordered_sources: boolean that declares whether the sources argument represents
literal ordered sources to be passed directly to the compiler. If false, no ordering is
guaranteed for the sources passed to an individual compiler invoke.
"""

if not service_writer and service_writer_options:
@@ -59,6 +63,7 @@ def __init__(self,
'registry_class': PrimitiveField(registry_class or None),
'enum_options': PrimitiveField(enum_options or []),
'no_options': PrimitiveField(no_options or False),
'ordered_sources': PrimitiveField(ordered_sources or False),
})

super(JavaWireLibrary, self).__init__(payload=payload, **kwargs)
43 changes: 34 additions & 9 deletions src/python/pants/backend/codegen/wire/java/wire_gen.py
@@ -13,10 +13,12 @@
from pants.backend.jvm.targets.java_library import JavaLibrary
from pants.backend.jvm.tasks.nailgun_task import NailgunTaskBase
from pants.base.build_environment import get_buildroot
from pants.base.exceptions import TaskError
from pants.base.exceptions import TargetDefinitionException, TaskError
from pants.base.workunit import WorkUnitLabel
from pants.java.jar.jar_dependency import JarDependency
from pants.source.filespec import globs_matches
from pants.task.simple_codegen_task import SimpleCodegenTask
from pants.util.dirutil import fast_relpath


logger = logging.getLogger(__name__)
@@ -61,24 +63,47 @@ def synthetic_target_extra_dependencies(self, target, target_workdir):
wire_runtime_deps_spec = self.get_options().javadeps
return self.resolve_deps([wire_runtime_deps_spec])

def format_args_for_target(self, target, target_workdir):
"""Calculate the arguments to pass to the command line for a single target."""
sources = OrderedSet(target.sources_relative_to_buildroot())

def _compute_sources(self, target):
relative_sources = OrderedSet()
source_roots = set()
for source in sources:
source_roots = OrderedSet()

def capture_and_relativize_to_source_root(source):
source_root = self.context.source_roots.find_by_path(source)
if not source_root:
source_root = self.context.source_roots.find(target)
source_roots.add(source_root.path)
relative_source = os.path.relpath(source, source_root.path)
relative_sources.add(relative_source)
return fast_relpath(source, source_root.path)

if target.payload.get_field_value('ordered_sources'):
# Re-match the filespecs against the sources in order to apply them in the literal order
# they were specified in.
filespec = target.globs_relative_to_buildroot()
excludes = filespec.get('excludes', [])
for filespec in filespec.get('globs', []):
sources = [s for s in target.sources_relative_to_buildroot()
if globs_matches([s], [filespec], excludes)]
if len(sources) != 1:
raise TargetDefinitionException(
target,
'With `ordered_sources=True`, expected one match for each file literal, '
'but got: {} for literal `{}`.'.format(sources, filespec)
)
relative_sources.add(capture_and_relativize_to_source_root(sources[0]))
else:
# Otherwise, use the default (unspecified) snapshot ordering.
for source in target.sources_relative_to_buildroot():
relative_sources.add(capture_and_relativize_to_source_root(source))
return relative_sources, source_roots

def format_args_for_target(self, target, target_workdir):
"""Calculate the arguments to pass to the command line for a single target."""

args = ['--java_out={0}'.format(target_workdir)]

# Add all params in payload to args

relative_sources, source_roots = self._compute_sources(target)

if target.payload.get_field_value('no_options'):
args.append('--no_options')

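To make the `ordered_sources` re-matching in `_compute_sources` above easier to follow, here is the same idea as a standalone sketch; the `sources` list and the BUILD-file filespecs are made up for illustration. Each literal filespec must match exactly one source, and results accumulate in BUILD-file order.

```python
# Standalone sketch of the ordered-sources re-matching; paths are made up.
from pants.source.filespec import globs_matches

sources = ['src/proto/compound.proto', 'src/proto/elements.proto']
build_file_order = ['src/proto/elements.proto', 'src/proto/compound.proto']
excludes = []

ordered = []
for spec in build_file_order:
  matches = [s for s in sources if globs_matches([s], [spec], excludes)]
  if len(matches) != 1:
    raise ValueError('Expected exactly one match for {!r}, got {!r}'.format(spec, matches))
  ordered.append(matches[0])

# `ordered` is now ['src/proto/elements.proto', 'src/proto/compound.proto'],
# preserving the order declared in the BUILD file.
```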
2 changes: 1 addition & 1 deletion src/python/pants/backend/graph_info/tasks/cloc.py
@@ -40,7 +40,7 @@ def console_output(self, targets):
input_snapshots = tuple(
target.sources_snapshot(scheduler=self.context._scheduler) for target in targets
)
input_files = {f.path for snapshot in input_snapshots for f in snapshot.files}
input_files = {f for snapshot in input_snapshots for f in snapshot.files}

# TODO: Work out a nice library-like utility for writing an argfile, as this will be common.
with temporary_dir() as tmpdir:
@@ -209,8 +209,8 @@ def _execute_hermetic_compile(self, cmd, ctx):
# Assume no extra .class files to grab. We'll fix up that case soon.
# Drop the source_root from the file path.
# Assumes `-d .` has been put in the command.
os.path.relpath(f.path.replace('.java', '.class'), ctx.target.target_base)
for f in input_snapshot.files if f.path.endswith('.java')
os.path.relpath(f.replace('.java', '.class'), ctx.target.target_base)
for f in input_snapshot.files if f.endswith('.java')
)
exec_process_request = ExecuteProcessRequest(
argv=tuple(cmd),
25 changes: 4 additions & 21 deletions src/python/pants/backend/jvm/tasks/jvm_compile/rsc/rsc_compile.py
@@ -33,8 +33,7 @@
from pants.java.jar.jar_dependency import JarDependency
from pants.reporting.reporting_utils import items_to_report_element
from pants.util.contextutil import Timer
from pants.util.dirutil import (fast_relpath, fast_relpath_optional, maybe_read_file,
safe_file_dump, safe_mkdir)
from pants.util.dirutil import fast_relpath, fast_relpath_optional, safe_mkdir
from pants.util.memo import memoized_property


@@ -60,22 +59,6 @@ def stdout_contents(wu):
return f.read().rstrip()


def write_digest(output_dir, digest):
safe_file_dump(
'{}.digest'.format(output_dir),
mode='w',
payload='{}:{}'.format(digest.fingerprint, digest.serialized_bytes_length))


def load_digest(output_dir):
read_file = maybe_read_file('{}.digest'.format(output_dir), binary_mode=False)
if read_file:
fingerprint, length = read_file.split(':')
return Digest(fingerprint, int(length))
else:
return None


def _create_desandboxify_fn(possible_path_patterns):
# Takes a collection of possible canonical prefixes, and returns a function that
# if it finds a matching prefix, strips the path prior to the prefix and returns it
@@ -132,7 +115,7 @@ def __init__(self, *args, **kwargs):

@classmethod
def implementation_version(cls):
return super(RscCompile, cls).implementation_version() + [('RscCompile', 171)]
return super(RscCompile, cls).implementation_version() + [('RscCompile', 172)]

@classmethod
def register_options(cls, register):
@@ -218,7 +201,7 @@ def pathglob_for(filename):
def to_classpath_entries(paths, scheduler):
# list of path ->
# list of (path, optional<digest>) ->
path_and_digests = [(p, load_digest(os.path.dirname(p))) for p in paths]
path_and_digests = [(p, Digest.load(os.path.dirname(p))) for p in paths]
# partition: list of path, list of tuples
paths_without_digests = [p for (p, d) in path_and_digests if not d]
if paths_without_digests:
@@ -825,7 +808,7 @@ def _runtool_hermetic(self, main, tool_name, args, distribution, tgt=None, input
raise TaskError(res.stderr)

if output_dir:
write_digest(output_dir, res.output_directory_digest)
res.output_directory_digest.dump(output_dir)
self.context._scheduler.materialize_directories((
DirectoryToMaterialize(
# NB the first element here is the root to materialize into, not the dir to snapshot
@@ -64,7 +64,7 @@ def run_python_test(transitive_hydrated_target, pytest):
# pex27, where it should be hermetically provided in some way.
output_pytest_requirements_pex_filename = 'pytest-with-requirements.pex'
requirements_pex_argv = [
'./{}'.format(pex_snapshot.files[0].path),
'./{}'.format(pex_snapshot.files[0]),
'--python', python_binary,
'-e', 'pytest:main',
'-o', output_pytest_requirements_pex_filename,
2 changes: 1 addition & 1 deletion src/python/pants/engine/build_files.py
@@ -219,7 +219,7 @@ def addresses_from_address_families(address_mapper, specs):
"""
# Capture a Snapshot covering all paths for these Specs, then group by directory.
snapshot = yield Get(Snapshot, PathGlobs, _spec_to_globs(address_mapper, specs))
dirnames = {dirname(f.stat.path) for f in snapshot.files}
dirnames = {dirname(f) for f in snapshot.files}
address_families = yield [Get(AddressFamily, Dir(d)) for d in dirnames]
address_family_by_directory = {af.namespace: af for af in address_families}

77 changes: 52 additions & 25 deletions src/python/pants/engine/fs.py
@@ -4,14 +4,16 @@

from __future__ import absolute_import, division, print_function, unicode_literals

import os

from future.utils import binary_type, text_type

from pants.base.project_tree import Dir, File
from pants.engine.objects import Collection
from pants.engine.rules import RootRule
from pants.option.custom_types import GlobExpansionConjunction
from pants.option.global_options import GlobMatchErrorBehavior
from pants.util.objects import datatype
from pants.util.dirutil import maybe_read_file, safe_delete, safe_file_dump
from pants.util.objects import Exactly, datatype


class FileContent(datatype([('path', text_type), ('content', binary_type)])):
@@ -62,10 +64,6 @@ def __new__(cls, include, exclude=(), glob_match_error_behavior=None, conjunctio
conjunction=GlobExpansionConjunction.create(conjunction, none_is_default=True))


class PathGlobsAndRoot(datatype([('path_globs', PathGlobs), ('root', text_type)])):
pass


class Digest(datatype([('fingerprint', text_type), ('serialized_bytes_length', int)])):
"""A Digest is a content-digest fingerprint, and a length of underlying content.
@@ -84,6 +82,33 @@ class Digest(datatype([('fingerprint', text_type), ('serialized_bytes_length', i
https://github.com/pantsbuild/pants/issues/5802
"""

@classmethod
def _path(cls, directory):
return '{}.digest'.format(directory.rstrip(os.sep))

@classmethod
def clear(cls, directory):
"""Clear any existing Digest file adjacent to the given directory."""
safe_delete(cls._path(directory))

@classmethod
def load(cls, directory):
"""Load a Digest from a `.digest` file adjacent to the given directory.
:return: A Digest, or None if the Digest did not exist.
"""
read_file = maybe_read_file(cls._path(directory), binary_mode=False)
if read_file:
fingerprint, length = read_file.split(':')
return Digest(fingerprint, int(length))
else:
return None

def dump(self, directory):
"""Dump this Digest object adjacent to the given directory."""
payload = '{}:{}'.format(self.fingerprint, self.serialized_bytes_length)
safe_file_dump(self._path(directory), payload=payload, mode='w')

def __repr__(self):
return '''Digest(fingerprint={}, serialized_bytes_length={})'''.format(
self.fingerprint,
Expand All @@ -94,8 +119,25 @@ def __str__(self):
return repr(self)


class Snapshot(datatype([('directory_digest', Digest), ('path_stats', tuple)])):
"""A Snapshot is a collection of Files and Dirs fingerprinted by their names/content.
class PathGlobsAndRoot(datatype([
('path_globs', PathGlobs),
('root', text_type),
('digest_hint', Exactly(Digest, type(None))),
])):
"""A set of PathGlobs to capture relative to some root (which may exist outside of the buildroot).
If the `digest_hint` is set, it must be the Digest that we would expect to get if we were to
expand and Digest the globs. The hint is an optimization that allows for bypassing filesystem
operations in cases where the expected Digest is known, and the content for the Digest is already
stored.
"""

def __new__(cls, path_globs, root, digest_hint=None):
return super(PathGlobsAndRoot, cls).__new__(cls, path_globs, root, digest_hint)


class Snapshot(datatype([('directory_digest', Digest), ('files', tuple), ('dirs', tuple)])):
"""A Snapshot is a collection of file paths and dir paths fingerprinted by their names/content.
Snapshots are used to make it easier to isolate process execution by fixing the contents
of the files being operated on and easing their movement to and from isolated execution
@@ -106,22 +148,6 @@ class Snapshot(datatype([('directory_digest', Digest), ('path_stats', tuple)])):
def is_empty(self):
return self == EMPTY_SNAPSHOT

@property
def dirs(self):
return [p for p in self.path_stats if type(p.stat) == Dir]

@property
def dir_stats(self):
return [p.stat for p in self.dirs]

@property
def files(self):
return [p for p in self.path_stats if type(p.stat) == File]

@property
def file_stats(self):
return [p.stat for p in self.files]


class MergedDirectories(datatype([('directories', tuple)])):
pass
@@ -150,7 +176,8 @@ class UrlToFetch(datatype([('url', text_type), ('digest', Digest)])):

EMPTY_SNAPSHOT = Snapshot(
directory_digest=EMPTY_DIRECTORY_DIGEST,
path_stats=(),
files=(),
dirs=()
)


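A quick round-trip of the new `Digest` persistence helpers added to `fs.py` above, as a hedged sketch: the fingerprint value and directory are made up, and Python 3 string semantics are assumed.

```python
import os
import tempfile

from pants.engine.fs import Digest

output_dir = os.path.join(tempfile.mkdtemp(), 'gen-out')
digest = Digest('ab' * 32, 42)            # made-up 64-hex-char fingerprint and byte length

digest.dump(output_dir)                   # writes `<output_dir>.digest` as "fingerprint:length"
assert Digest.load(output_dir) == digest  # parses the adjacent file back into a Digest
Digest.clear(output_dir)                  # deletes the adjacent `.digest` file
assert Digest.load(output_dir) is None    # no adjacent file -> None
```

This is the same pairing that `rsc_compile.py` now uses via `res.output_directory_digest.dump(output_dir)` and `Digest.load(os.path.dirname(p))`, replacing the module-local `write_digest`/`load_digest` helpers it previously defined.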
8 changes: 0 additions & 8 deletions src/python/pants/engine/native.py
@@ -690,10 +690,6 @@ def new_scheduler(self,
construct_snapshot,
construct_file_content,
construct_files_content,
construct_path_stat,
construct_dir,
construct_file,
construct_link,
construct_process_result,
constraint_address,
constraint_path_globs,
@@ -722,10 +718,6 @@ def tc(constraint):
func(construct_snapshot),
func(construct_file_content),
func(construct_files_content),
func(construct_path_stat),
func(construct_dir),
func(construct_file),
func(construct_link),
func(construct_process_result),
# TypeConstraints.
tc(constraint_address),
7 changes: 1 addition & 6 deletions src/python/pants/engine/scheduler.py
@@ -14,8 +14,7 @@
from pants.base.project_tree import Dir, File, Link
from pants.build_graph.address import Address
from pants.engine.fs import (Digest, DirectoryToMaterialize, FileContent, FilesContent,
MergedDirectories, Path, PathGlobs, PathGlobsAndRoot, Snapshot,
UrlToFetch)
MergedDirectories, PathGlobs, PathGlobsAndRoot, Snapshot, UrlToFetch)
from pants.engine.isolated_process import ExecuteProcessRequest, FallibleExecuteProcessResult
from pants.engine.native import Function, TypeConstraint, TypeId
from pants.engine.nodes import Return, Throw
@@ -101,10 +100,6 @@ def __init__(
construct_snapshot=Snapshot,
construct_file_content=FileContent,
construct_files_content=FilesContent,
construct_path_stat=Path,
construct_dir=Dir,
construct_file=File,
construct_link=Link,
construct_process_result=FallibleExecuteProcessResult,
constraint_address=constraint_for(Address),
constraint_path_globs=constraint_for(PathGlobs),
3 changes: 3 additions & 0 deletions src/python/pants/source/filespec.py
@@ -9,6 +9,9 @@

def glob_to_regex(pattern):
"""Given a glob pattern, return an equivalent regex expression.
TODO: Replace with implementation in `fs.rs`. See https://github.com/pantsbuild/pants/issues/6795.
:param string glob: The glob pattern. "**" matches 0 or more dirs recursively.
"*" only matches patterns in a single dir.
:returns: A regex string that matches same paths as the input glob does.
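A small illustration of the glob semantics documented in the `glob_to_regex` docstring above; only the match behavior described there is asserted, and the exact regex text is treated as an implementation detail.

```python
import re

from pants.source.filespec import glob_to_regex

# '*' matches only within a single directory level.
assert re.match(glob_to_regex('foo/*.proto'), 'foo/a.proto')
assert not re.match(glob_to_regex('foo/*.proto'), 'foo/bar/a.proto')

# '**' matches directories recursively.
assert re.match(glob_to_regex('foo/**/*.proto'), 'foo/bar/baz/a.proto')
```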
6 changes: 4 additions & 2 deletions src/python/pants/source/wrapped_globs.py
@@ -99,8 +99,10 @@ def files(self):

@memoized_property
def files_relative_to_buildroot(self):
fds = self._snapshot.path_stats if self._include_dirs else self._snapshot.files
return tuple(fd.path for fd in fds)
res = self._snapshot.files
if self._include_dirs:
res += self._snapshot.dirs
return res

@property
def files_hash(self):