Skip to content

Commit

Permalink
Overhaul file-system operations to use FileSystemCache (#4623)
Browse files Browse the repository at this point in the history
This overhauls our file-system accesses to (almost) always go through
a FileSystemCache when stat()ing or reading source files. (We don't
use it for reading cache JSON files, since we seem to do only a single
read of each anyway, so caching doesn't help us any.) This
eliminates a number of potential bugs having to do with seeing
inconsistent file-system state and reduces the number of file
reads+decodes in fine-grained incremental mode.

This also allows us to eliminate a number of caches maintained by
find_module that duplicated functionality of FileSystemCache. Since I
was reluctant to make the FileSystemCache global, I took the
opportunity to make find_module's caches non-global as well.
  • Loading branch information
msullivan authored Mar 14, 2018
1 parent f456718 commit 5ac30ea
Show file tree
Hide file tree
Showing 13 changed files with 273 additions and 275 deletions.
285 changes: 103 additions & 182 deletions mypy/build.py

Large diffs are not rendered by default.

29 changes: 11 additions & 18 deletions mypy/dmypy_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,12 +283,16 @@ def check_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str,
return self.fine_grained_increment(sources)

def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
self.fscache = FileSystemCache(self.options.python_version)
self.fswatcher = FileSystemWatcher(self.fscache)
# The file system cache we create gets passed off to
# BuildManager, and thence to FineGrainedBuildManager, which
# assumes responsibility for clearing it after updates.
fscache = FileSystemCache(self.options.python_version)
self.fswatcher = FileSystemWatcher(fscache)
self.update_sources(sources)
try:
result = mypy.build.build(sources=sources,
options=self.options,
fscache=fscache,
alt_lib_path=self.alt_lib_path)
except mypy.errors.CompileError as e:
output = ''.join(s + '\n' for s in e.messages)
Expand All @@ -298,9 +302,7 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
out, err = '', output
return {'out': out, 'err': err, 'status': 2}
messages = result.errors
manager = result.manager
graph = result.graph
self.fine_grained_manager = FineGrainedBuildManager(manager, graph)
self.fine_grained_manager = FineGrainedBuildManager(result)
self.previous_sources = sources

# If we are using the fine-grained cache, build hasn't actually done
Expand All @@ -317,7 +319,6 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
state.path,
FileData(st_mtime=float(meta.mtime), st_size=meta.size, md5=meta.hash))

# Run an update
changed, removed = self.find_changed(sources)

# Find anything that has had its dependency list change
Expand All @@ -326,16 +327,14 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
assert state.path is not None
changed.append((state.id, state.path))

if changed or removed:
messages = self.fine_grained_manager.update(changed, removed)
# Run an update
messages = self.fine_grained_manager.update(changed, removed)
else:
# Stores the initial state of sources as a side effect.
self.fswatcher.find_changed()

self.fscache.flush()

fscache.flush()
status = 1 if messages else 0
self.previous_messages = messages[:]
return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}

def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
Expand All @@ -345,19 +344,13 @@ def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[
self.update_sources(sources)
changed, removed = self.find_changed(sources)
t1 = time.time()
if not (changed or removed):
# Nothing changed -- just produce the same result as before.
messages = self.previous_messages
else:
messages = self.fine_grained_manager.update(changed, removed)
messages = self.fine_grained_manager.update(changed, removed)
t2 = time.time()
self.fine_grained_manager.manager.log(
"fine-grained increment: find_changed: {:.3f}s, update: {:.3f}s".format(
t1 - t0, t2 - t1))
status = 1 if messages else 0
self.previous_messages = messages[:]
self.previous_sources = sources
self.fscache.flush()
return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}

def update_sources(self, sources: List[mypy.build.BuildSource]) -> None:
Expand Down
7 changes: 0 additions & 7 deletions mypy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,13 +616,6 @@ def __init__(self,
self.module_with_blocker = module_with_blocker


class DecodeError(Exception):
"""Exception raised when a file cannot be decoded due to an unknown encoding type.
Essentially a wrapper for the LookupError raised by `bytearray.decode`
"""


def remove_path_prefix(path: str, prefix: str) -> str:
"""If path starts with prefix, return copy of path with the prefix removed.
Otherwise, return path. If path is None, return None.
Expand Down
99 changes: 68 additions & 31 deletions mypy/fscache.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,45 +30,21 @@

import os
import stat
from typing import Tuple, Dict, List
from typing import Tuple, Dict, List, Optional
from mypy.util import read_with_python_encoding

from mypy.build import read_with_python_encoding
from mypy.errors import DecodeError


class FileSystemCache:
def __init__(self, pyversion: Tuple[int, int]) -> None:
self.pyversion = pyversion
class FileSystemMetaCache:
def __init__(self) -> None:
self.flush()

def flush(self) -> None:
"""Start another transaction and empty all caches."""
self.stat_cache = {} # type: Dict[str, os.stat_result]
self.stat_error_cache = {} # type: Dict[str, Exception]
self.read_cache = {} # type: Dict[str, str]
self.read_error_cache = {} # type: Dict[str, Exception]
self.hash_cache = {} # type: Dict[str, str]
self.listdir_cache = {} # type: Dict[str, List[str]]
self.listdir_error_cache = {} # type: Dict[str, Exception]

def read_with_python_encoding(self, path: str) -> str:
if path in self.read_cache:
return self.read_cache[path]
if path in self.read_error_cache:
raise self.read_error_cache[path]

# Need to stat first so that the contents of file are from no
# earlier instant than the mtime reported by self.stat().
self.stat(path)

try:
data, md5hash = read_with_python_encoding(path, self.pyversion)
except Exception as err:
self.read_error_cache[path] = err
raise
self.read_cache[path] = data
self.hash_cache[path] = md5hash
return data
self.isfile_case_cache = {} # type: Dict[str, bool]

def stat(self, path: str) -> os.stat_result:
if path in self.stat_cache:
Expand Down Expand Up @@ -97,11 +73,40 @@ def listdir(self, path: str) -> List[str]:
return results

def isfile(self, path: str) -> bool:
st = self.stat(path)
try:
st = self.stat(path)
except OSError:
return False
return stat.S_ISREG(st.st_mode)

    def isfile_case(self, path: str) -> bool:
        """Return whether path exists and is a file.

        On case-insensitive filesystems (like Mac or Windows) this returns
        False if the case of the path's last component does not exactly
        match the case found in the filesystem.

        TODO: We should maybe check the case for some directory components also,
        to avoid permitting wrongly-cased *packages*.
        """
        if path in self.isfile_case_cache:
            return self.isfile_case_cache[path]
        head, tail = os.path.split(path)
        if not tail:
            # A path ending in a separator has no final component to check.
            res = False
        else:
            try:
                # listdir() reports the exact on-disk names, so membership of
                # `tail` verifies the case; isfile() verifies it is a file.
                names = self.listdir(head)
                res = tail in names and self.isfile(path)
            except OSError:
                res = False
        # Cache the answer (including negative results) for this transaction.
        self.isfile_case_cache[path] = res
        return res

def isdir(self, path: str) -> bool:
st = self.stat(path)
try:
st = self.stat(path)
except OSError:
return False
return stat.S_ISDIR(st.st_mode)

def exists(self, path: str) -> bool:
Expand All @@ -111,6 +116,38 @@ def exists(self, path: str) -> bool:
return False
return True


class FileSystemCache(FileSystemMetaCache):
    """Metadata cache plus caching of file contents, hashes and read errors.

    Extends FileSystemMetaCache (stat/listdir caching) with caching of
    decoded source-file contents so each file is read at most once per
    transaction (see flush()).
    """

    def __init__(self, pyversion: Tuple[int, int]) -> None:
        # Target Python version; selects the source-encoding rules used when
        # decoding files in read_with_python_encoding().
        self.pyversion = pyversion
        # flush() (re)creates all cache dicts, including the stat/listdir
        # caches of the FileSystemMetaCache base class.
        self.flush()

    def flush(self) -> None:
        """Start another transaction and empty all caches."""
        super().flush()
        self.read_cache = {}  # type: Dict[str, str]
        self.read_error_cache = {}  # type: Dict[str, Exception]
        self.hash_cache = {}  # type: Dict[str, str]

    def read_with_python_encoding(self, path: str) -> str:
        """Return the decoded contents of path, caching both data and errors.

        Raises the same exception as the original failed read if the path
        previously failed to read/decode within this transaction.
        """
        if path in self.read_cache:
            return self.read_cache[path]
        if path in self.read_error_cache:
            # Replay the earlier failure so callers see consistent results
            # within a single transaction.
            raise self.read_error_cache[path]

        # Need to stat first so that the contents of file are from no
        # earlier instant than the mtime reported by self.stat().
        self.stat(path)

        try:
            data, md5hash = read_with_python_encoding(path, self.pyversion)
        except Exception as err:
            self.read_error_cache[path] = err
            raise
        self.read_cache[path] = data
        # Remember the hash as a side effect so md5() can reuse it.
        self.hash_cache[path] = md5hash
        return data

def md5(self, path: str) -> str:
if path not in self.hash_cache:
self.read_with_python_encoding(path)
Expand Down
4 changes: 3 additions & 1 deletion mypy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,8 @@ def add_invertible_flag(flag: str,
.format(special_opts.package))
options.build_type = BuildType.MODULE
lib_path = [os.getcwd()] + build.mypy_path()
targets = build.find_modules_recursive(special_opts.package, lib_path)
# TODO: use the same cache as the BuildManager will
targets = build.FindModuleCache().find_modules_recursive(special_opts.package, lib_path)
if not targets:
fail("Can't find package '{}'".format(special_opts.package))
return targets, options
Expand All @@ -548,6 +549,7 @@ def add_invertible_flag(flag: str,
return targets, options


# TODO: use a FileSystemCache for this
def create_source_list(files: Sequence[str], options: Options) -> List[BuildSource]:
targets = []
for f in files:
Expand Down
42 changes: 24 additions & 18 deletions mypy/server/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,17 @@
Major todo items:
- Fully support multiple type checking passes
- Use mypy.fscache to access file system
"""

import os
import time
import os.path
from typing import (
Dict, List, Set, Tuple, Iterable, Union, Optional, Mapping, NamedTuple, Callable
)

from mypy.build import (
BuildManager, State, BuildSource, Graph, load_graph, find_module_clear_caches,
BuildManager, State, BuildSource, BuildResult, Graph, load_graph,
PRI_INDIRECT, DEBUG_FINE_GRAINED,
)
from mypy.checker import DeferredNode
Expand All @@ -135,6 +135,7 @@
)
from mypy.options import Options
from mypy.types import Type
from mypy.fscache import FileSystemCache
from mypy.semanal import apply_semantic_analyzer_patches
from mypy.server.astdiff import (
snapshot_symbol_table, compare_symbol_table_snapshots, SnapshotItem
Expand All @@ -150,21 +151,23 @@


class FineGrainedBuildManager:
def __init__(self,
manager: BuildManager,
graph: Graph) -> None:
def __init__(self, result: BuildResult) -> None:
"""Initialize fine-grained build based on a batch build.
Args:
result: Result from the initialized build.
The manager and graph will be taken over by this class.
manager: State of the build (mutated by this class)
graph: Additional state of the build (only read to initialize state)
"""
manager = result.manager
self.manager = manager
self.graph = result.graph
self.options = manager.options
self.previous_modules = get_module_to_path_map(manager)
self.deps = get_all_dependencies(manager, graph, self.options)
self.deps = get_all_dependencies(manager, self.graph, self.options)
self.previous_targets_with_errors = manager.errors.targets()
self.graph = graph
self.previous_messages = result.errors[:]
# Module, if any, that had blocking errors in the last run as (id, path) tuple.
# TODO: Handle blocking errors in the initial build
self.blocking_error = None # type: Optional[Tuple[str, str]]
Expand Down Expand Up @@ -205,13 +208,15 @@ def update(self,
A list of errors.
"""
changed_modules = changed_modules + removed_modules
assert changed_modules or removed_modules, 'No changed modules'

removed_set = {module for module, _ in removed_modules}
self.changed_modules = changed_modules

# Reset global caches for the new build.
find_module_clear_caches()
if not changed_modules:
self.manager.fscache.flush()
return self.previous_messages

# Reset find_module's caches for the new build.
self.manager.find_module_cache.clear()

self.triggered = []
self.updated_modules = []
Expand Down Expand Up @@ -249,8 +254,10 @@ def update(self,
if blocker:
self.blocking_error = (next_id, next_path)
self.stale = changed_modules
return messages
break

self.manager.fscache.flush()
self.previous_messages = messages[:]
return messages

def update_single(self,
Expand Down Expand Up @@ -383,7 +390,7 @@ def update_single_isolated(module: str,
manager.log_fine_grained('new module %r' % module)

old_modules = dict(manager.modules)
sources = get_sources(previous_modules, [(module, path)])
sources = get_sources(manager.fscache, previous_modules, [(module, path)])

if module in manager.missing_modules:
manager.missing_modules.remove(module)
Expand All @@ -407,7 +414,7 @@ def update_single_isolated(module: str,
remaining_modules = []
return BlockedUpdate(err.module_with_blocker, path, remaining_modules, err.messages)

if not os.path.isfile(path) or force_removed:
if not manager.fscache.isfile(path) or force_removed:
delete_module(module, graph, manager)
return NormalUpdate(module, path, [], None)

Expand Down Expand Up @@ -537,13 +544,12 @@ def get_module_to_path_map(manager: BuildManager) -> Dict[str, str]:
for module, node in manager.modules.items()}


def get_sources(fscache: FileSystemCache,
                modules: Dict[str, str],
                changed_modules: List[Tuple[str, str]]) -> List[BuildSource]:
    """Return a BuildSource for each changed module that still exists.

    Args:
        fscache: File system cache used for existence checks, so the whole
            update sees one consistent snapshot of the file system.
        modules: Map of module id to path for the previous build.
        changed_modules: (module id, path) pairs for changed modules.
    """
    sources = []
    for id, path in changed_modules:
        # Modules whose file vanished are handled as removals by the caller.
        if fscache.isfile(path):
            sources.append(BuildSource(path, id, None))
    return sources

Expand Down
4 changes: 2 additions & 2 deletions mypy/stubgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def find_module_path_and_all(module: str, pyversion: Tuple[int, int],
module_all = getattr(mod, '__all__', None)
else:
# Find module by going through search path.
module_path = mypy.build.find_module(module, ['.'] + search_path)
module_path = mypy.build.FindModuleCache().find_module(module, ['.'] + search_path)
if not module_path:
raise SystemExit(
"Can't find module '{}' (consider using --search-path)".format(module))
Expand Down Expand Up @@ -201,7 +201,7 @@ def generate_stub(path: str,
include_private: bool = False
) -> None:

source, _ = mypy.build.read_with_python_encoding(path, pyversion)
source, _ = mypy.util.read_with_python_encoding(path, pyversion)
options = MypyOptions()
options.python_version = pyversion
try:
Expand Down
Loading

0 comments on commit 5ac30ea

Please sign in to comment.