diff --git a/mypy/build.py b/mypy/build.py
index f0e92f088d98..acf4041ad2fe 100644
--- a/mypy/build.py
+++ b/mypy/build.py
@@ -39,10 +39,10 @@
 from mypy.semanal_pass3 import SemanticAnalyzerPass3
 from mypy.checker import TypeChecker
 from mypy.indirection import TypeIndirectionVisitor
-from mypy.errors import Errors, CompileError, DecodeError, report_internal_error
+from mypy.errors import Errors, CompileError, report_internal_error
+from mypy.util import DecodeError
 from mypy.report import Reports
 from mypy import moduleinfo
-from mypy import util
 from mypy.fixup import fixup_module_pass_one, fixup_module_pass_two
 from mypy.nodes import Expression
 from mypy.options import Options
@@ -53,6 +53,7 @@
 from mypy.plugin import Plugin, DefaultPlugin, ChainedPlugin
 from mypy.defaults import PYTHON3_VERSION_MIN
 from mypy.server.deps import get_dependencies
+from mypy.fscache import FileSystemCache, FileSystemMetaCache
 
 
 # Switch to True to produce debug output related to fine-grained incremental
@@ -144,6 +145,7 @@ def build(sources: List[BuildSource],
           bin_dir: Optional[str] = None,
           saved_cache: Optional[SavedCache] = None,
           flush_errors: Optional[Callable[[List[str], bool], None]] = None,
+          fscache: Optional[FileSystemCache] = None,
           ) -> BuildResult:
     """Analyze a program.
 
@@ -167,6 +169,7 @@
       directories; if omitted, use '.' as the data directory
     saved_cache: optional dict with saved cache state for dmypy (read-write!)
     flush_errors: optional function to flush errors after a file is processed
+    fscache: optionally a file-system cacher
     """
 
     # If we were not given a flush_errors, we use one that will populate those
@@ -179,7 +182,8 @@ def default_flush_errors(new_messages: List[str], is_serious: bool) -> None:
     flush_errors = flush_errors or default_flush_errors
 
     try:
-        result = _build(sources, options, alt_lib_path, bin_dir, saved_cache, flush_errors)
+        result = _build(sources, options, alt_lib_path, bin_dir,
+                        saved_cache, flush_errors, fscache)
         result.errors = messages
         return result
     except CompileError as e:
@@ -199,13 +203,13 @@ def _build(sources: List[BuildSource],
            bin_dir: Optional[str],
            saved_cache: Optional[SavedCache],
            flush_errors: Callable[[List[str], bool], None],
+           fscache: Optional[FileSystemCache],
            ) -> BuildResult:
     # This seems the most reasonable place to tune garbage collection.
     gc.set_threshold(50000)
 
     data_dir = default_data_dir(bin_dir)
-
-    find_module_clear_caches()
+    fscache = fscache or FileSystemCache(options.python_version)
 
     # Determine the default module search path.
     lib_path = default_lib_path(data_dir,
@@ -222,7 +226,7 @@ def _build(sources: List[BuildSource],
     for source in sources:
         if source.path:
             # Include directory of the program file in the module search path.
-            dir = remove_cwd_prefix_from_path(dirname(source.path))
+            dir = remove_cwd_prefix_from_path(fscache, dirname(source.path))
             if dir not in lib_path:
                 lib_path.insert(0, dir)
@@ -262,7 +266,8 @@ def _build(sources: List[BuildSource],
                            plugin=plugin,
                            errors=errors,
                            saved_cache=saved_cache,
-                           flush_errors=flush_errors)
+                           flush_errors=flush_errors,
+                           fscache=fscache)
 
     try:
         graph = dispatch(sources, manager)
@@ -575,6 +580,7 @@ class BuildManager:
       but is disabled if fine-grained cache loading fails
       and after an initial fine-grained load.
       stats: Dict with various instrumentation numbers
+      fscache: A file system cacher
     """
 
     def __init__(self, data_dir: str,
@@ -587,6 +593,7 @@ def __init__(self, data_dir: str,
                  plugin: Plugin,
                  errors: Errors,
                  flush_errors: Callable[[List[str], bool], None],
+                 fscache: FileSystemCache,
                  saved_cache: Optional[SavedCache] = None,
                  ) -> None:
         self.start_time = time.time()
@@ -615,6 +622,8 @@ def __init__(self, data_dir: str,
             not options.fine_grained_incremental or options.use_fine_grained_cache)
         self.saved_cache = saved_cache if saved_cache is not None else {}  # type: SavedCache
         self.stats = {}  # type: Dict[str, Any]  # Values are ints or floats
+        self.fscache = fscache
+        self.find_module_cache = FindModuleCache(self.fscache)
 
     def use_fine_grained_cache(self) -> bool:
         return self.cache_enabled and self.options.use_fine_grained_cache
@@ -626,7 +635,7 @@ def maybe_swap_for_shadow_path(self, path: str) -> str:
         return path
 
     def get_stat(self, path: str) -> os.stat_result:
-        return os.stat(self.maybe_swap_for_shadow_path(path))
+        return self.fscache.stat(self.maybe_swap_for_shadow_path(path))
 
     def all_imported_modules_in_file(self,
                                      file: MypyFile) -> List[Tuple[int, str, int]]:
@@ -699,7 +708,7 @@ def correct_rel_imp(imp: Union[ImportFrom, ImportAll]) -> str:
 
     def is_module(self, id: str) -> bool:
         """Is there a file in the file system corresponding to module id?"""
-        return find_module(id, self.lib_path) is not None
+        return self.find_module_cache.find_module(id, self.lib_path) is not None
 
     def parse_file(self, id: str, path: str, source: str, ignore_errors: bool) -> MypyFile:
         """Parse the source of a file with the given name.
@@ -784,7 +793,7 @@ def stats_summary(self) -> Mapping[str, object]:
         return self.stats
 
 
-def remove_cwd_prefix_from_path(p: str) -> str:
+def remove_cwd_prefix_from_path(fscache: FileSystemCache, p: str) -> str:
     """Remove current working directory prefix from p, if present.
 
     Also crawl up until a directory without __init__.py is found.
@@ -797,8 +806,8 @@ def remove_cwd_prefix_from_path(p: str) -> str:
         cur += os.sep
     # Compute root path.
     while (p and
-           (os.path.isfile(os.path.join(p, '__init__.py')) or
-            os.path.isfile(os.path.join(p, '__init__.pyi')))):
+           (fscache.isfile(os.path.join(p, '__init__.py')) or
+            fscache.isfile(os.path.join(p, '__init__.pyi')))):
         dir, base = os.path.split(p)
         if not base:
             break
@@ -812,95 +821,50 @@ def remove_cwd_prefix_from_path(p: str) -> str:
     return p
 
 
-# Cache find_module: (id, lib_path) -> result.
-find_module_cache = {}  # type: Dict[Tuple[str, Tuple[str, ...]], Optional[str]]
-
-# Cache some repeated work within distinct find_module calls: finding which
-# elements of lib_path have even the subdirectory they'd need for the module
-# to exist. This is shared among different module ids when they differ only
-# in the last component.
-find_module_dir_cache = {}  # type: Dict[Tuple[str, Tuple[str, ...]], List[str]]
-
-# Cache directory listings. We assume that while one os.listdir()
-# call may be more expensive than one os.stat() call, a small number
-# of os.stat() calls is quickly more expensive than caching the
-# os.listdir() outcome, and the advantage of the latter is that it
-# gives us the case-correct filename on Windows and Mac.
-find_module_listdir_cache = {}  # type: Dict[str, Optional[List[str]]]
-
-# Cache for is_file()
-find_module_is_file_cache = {}  # type: Dict[str, bool]
-
-# Cache for isdir(join(head, tail))
-find_module_isdir_cache = {}  # type: Dict[Tuple[str, str], bool]
-
-
-def find_module_clear_caches() -> None:
-    find_module_cache.clear()
-    find_module_dir_cache.clear()
-    find_module_listdir_cache.clear()
-    find_module_is_file_cache.clear()
-    find_module_isdir_cache.clear()
+class FindModuleCache:
+    """Module finder with integrated cache.
 
+    Module locations and some intermediate results are cached internally
+    and can be cleared with the clear() method.
 
-def list_dir(path: str) -> Optional[List[str]]:
-    """Return a cached directory listing.
-
-    Returns None if the path doesn't exist or isn't a directory.
+    All file system accesses are performed through a FileSystemCache,
+    which is not ever cleared by this class. If necessary it must be
+    cleared by client code.
     """
-    res = find_module_listdir_cache.get(path)
-    if res is None:
-        try:
-            res = os.listdir(path)
-        except OSError:
-            res = None
-        find_module_listdir_cache[path] = res
-    return res
 
+    def __init__(self, fscache: Optional[FileSystemMetaCache] = None) -> None:
+        self.fscache = fscache or FileSystemMetaCache()
+        # Cache find_module: (id, lib_path) -> result.
+        self.results = {}  # type: Dict[Tuple[str, Tuple[str, ...]], Optional[str]]
 
-def is_file(path: str) -> bool:
-    """Return whether path exists and is a file.
+        # Cache some repeated work within distinct find_module calls: finding which
+        # elements of lib_path have even the subdirectory they'd need for the module
+        # to exist. This is shared among different module ids when they differ only
+        # in the last component.
+        self.dirs = {}  # type: Dict[Tuple[str, Tuple[str, ...]], List[str]]
 
-    On case-insensitive filesystems (like Mac or Windows) this returns
-    False if the case of the path's last component does not exactly
-    match the case found in the filesystem.
-    """
-    res = find_module_is_file_cache.get(path)
-    if res is None:
-        head, tail = os.path.split(path)
-        if not tail:
-            res = False
-        else:
-            names = list_dir(head)
-            res = names is not None and tail in names and os.path.isfile(path)
-        find_module_is_file_cache[path] = res
-    return res
+    def clear(self) -> None:
+        self.results.clear()
+        self.dirs.clear()
 
+    def _find_module(self, id: str, lib_path: Tuple[str, ...]) -> Optional[str]:
+        fscache = self.fscache
 
-def find_module(id: str, lib_path_arg: Iterable[str]) -> Optional[str]:
-    """Return the path of the module source file, or None if not found."""
-    lib_path = tuple(lib_path_arg)
-
-    def find() -> Optional[str]:
         # If we're looking for a module like 'foo.bar.baz', it's likely that most of the
         # many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
         # that only once and cache it for when we look for modules like 'foo.bar.blah'
         # that will require the same subdirectory.
         components = id.split('.')
         dir_chain = os.sep.join(components[:-1])  # e.g., 'foo/bar'
-        if (dir_chain, lib_path) not in find_module_dir_cache:
+        if (dir_chain, lib_path) not in self.dirs:
             dirs = []
             for pathitem in lib_path:
                 # e.g., '/usr/lib/python3.4/foo/bar'
-                isdir = find_module_isdir_cache.get((pathitem, dir_chain))
-                if isdir is None:
-                    dir = os.path.normpath(os.path.join(pathitem, dir_chain))
-                    isdir = os.path.isdir(dir)
-                    find_module_isdir_cache[pathitem, dir_chain] = isdir
-                if isdir:
+                dir = os.path.normpath(os.path.join(pathitem, dir_chain))
+                if fscache.isdir(dir):
                     dirs.append(dir)
-            find_module_dir_cache[dir_chain, lib_path] = dirs
-        candidate_base_dirs = find_module_dir_cache[dir_chain, lib_path]
+            self.dirs[dir_chain, lib_path] = dirs
+        candidate_base_dirs = self.dirs[dir_chain, lib_path]
 
         # If we're looking for a module like 'foo.bar.baz', then candidate_base_dirs now
         # contains just the subdirectories 'foo/bar' that actually exist under the
@@ -913,101 +877,67 @@ def find() -> Optional[str]:
             # Prefer package over module, i.e. baz/__init__.py* over baz.py*.
             for extension in PYTHON_EXTENSIONS:
                 path = base_path + sepinit + extension
-                if is_file(path) and verify_module(id, path):
+                if fscache.isfile_case(path) and verify_module(fscache, id, path):
                     return path
             # No package, look for module.
             for extension in PYTHON_EXTENSIONS:
                 path = base_path + extension
-                if is_file(path) and verify_module(id, path):
+                if fscache.isfile_case(path) and verify_module(fscache, id, path):
                     return path
         return None
 
-    key = (id, lib_path)
-    if key not in find_module_cache:
-        find_module_cache[key] = find()
-    return find_module_cache[key]
+    def find_module(self, id: str, lib_path_arg: Iterable[str]) -> Optional[str]:
+        """Return the path of the module source file, or None if not found."""
+        lib_path = tuple(lib_path_arg)
+
+        key = (id, lib_path)
+        if key not in self.results:
+            self.results[key] = self._find_module(id, lib_path)
+        return self.results[key]
+
+    def find_modules_recursive(self, module: str, lib_path: List[str]) -> List[BuildSource]:
+        module_path = self.find_module(module, lib_path)
+        if not module_path:
+            return []
+        result = [BuildSource(module_path, module, None)]
+        if module_path.endswith(('__init__.py', '__init__.pyi')):
+            # Subtle: this code prefers the .pyi over the .py if both
+            # exists, and also prefers packages over modules if both x/
+            # and x.py* exist. How? We sort the directory items, so x
+            # comes before x.py and x.pyi. But the preference for .pyi
+            # over .py is encoded in find_module(); even though we see
+            # x.py before x.pyi, find_module() will find x.pyi first. We
+            # use hits to avoid adding it a second time when we see x.pyi.
+            # This also avoids both x.py and x.pyi when x/ was seen first.
+            hits = set()  # type: Set[str]
+            for item in sorted(self.fscache.listdir(os.path.dirname(module_path))):
+                abs_path = os.path.join(os.path.dirname(module_path), item)
+                if os.path.isdir(abs_path) and \
+                        (os.path.isfile(os.path.join(abs_path, '__init__.py')) or
+                         os.path.isfile(os.path.join(abs_path, '__init__.pyi'))):
+                    hits.add(item)
+                    result += self.find_modules_recursive(module + '.' + item, lib_path)
+                elif item != '__init__.py' and item != '__init__.pyi' and \
+                        item.endswith(('.py', '.pyi')):
+                    mod = item.split('.')[0]
+                    if mod not in hits:
+                        hits.add(mod)
+                        result += self.find_modules_recursive(module + '.'
+                                                              + mod, lib_path)
+        return result
 
 
-def find_modules_recursive(module: str, lib_path: List[str]) -> List[BuildSource]:
-    module_path = find_module(module, lib_path)
-    if not module_path:
-        return []
-    result = [BuildSource(module_path, module, None)]
-    if module_path.endswith(('__init__.py', '__init__.pyi')):
-        # Subtle: this code prefers the .pyi over the .py if both
-        # exists, and also prefers packages over modules if both x/
-        # and x.py* exist. How? We sort the directory items, so x
-        # comes before x.py and x.pyi. But the preference for .pyi
-        # over .py is encoded in find_module(); even though we see
-        # x.py before x.pyi, find_module() will find x.pyi first. We
-        # use hits to avoid adding it a second time when we see x.pyi.
-        # This also avoids both x.py and x.pyi when x/ was seen first.
-        hits = set()  # type: Set[str]
-        for item in sorted(os.listdir(os.path.dirname(module_path))):
-            abs_path = os.path.join(os.path.dirname(module_path), item)
-            if os.path.isdir(abs_path) and \
-                    (os.path.isfile(os.path.join(abs_path, '__init__.py')) or
-                     os.path.isfile(os.path.join(abs_path, '__init__.pyi'))):
-                hits.add(item)
-                result += find_modules_recursive(module + '.' + item, lib_path)
-            elif item != '__init__.py' and item != '__init__.pyi' and \
-                    item.endswith(('.py', '.pyi')):
-                mod = item.split('.')[0]
-                if mod not in hits:
-                    hits.add(mod)
-                    result += find_modules_recursive(
-                        module + '.' + mod, lib_path)
-    return result
-
-
-def verify_module(id: str, path: str) -> bool:
+def verify_module(fscache: FileSystemMetaCache, id: str, path: str) -> bool:
     """Check that all packages containing id have a __init__ file."""
     if path.endswith(('__init__.py', '__init__.pyi')):
         path = dirname(path)
     for i in range(id.count('.')):
         path = dirname(path)
-        if not any(is_file(os.path.join(path, '__init__{}'.format(extension)))
+        if not any(fscache.isfile_case(os.path.join(path, '__init__{}'.format(extension)))
                    for extension in PYTHON_EXTENSIONS):
             return False
     return True
 
 
-def read_with_python_encoding(path: str, pyversion: Tuple[int, int]) -> Tuple[str, str]:
-    """Read the Python file with while obeying PEP-263 encoding detection.
-
-    Returns:
-      A tuple: the source as a string, and the hash calculated from the binary representation.
-    """
-    source_bytearray = bytearray()
-    encoding = 'utf8' if pyversion[0] >= 3 else 'ascii'
-
-    with open(path, 'rb') as f:
-        # read first two lines and check if PEP-263 coding is present
-        source_bytearray.extend(f.readline())
-        source_bytearray.extend(f.readline())
-        m = hashlib.md5(source_bytearray)
-
-        # check for BOM UTF-8 encoding and strip it out if present
-        if source_bytearray.startswith(b'\xef\xbb\xbf'):
-            encoding = 'utf8'
-            source_bytearray = source_bytearray[3:]
-        else:
-            _encoding, _ = util.find_python_encoding(source_bytearray, pyversion)
-            # check that the coding isn't mypy. We skip it since
-            # registering may not have happened yet
-            if _encoding != 'mypy':
-                encoding = _encoding
-
-        remainder = f.read()
-        m.update(remainder)
-        source_bytearray.extend(remainder)
-        try:
-            source_text = source_bytearray.decode(encoding)
-        except LookupError as lookuperr:
-            raise DecodeError(str(lookuperr))
-        return source_text, m.hexdigest()
-
-
 def get_cache_names(id: str, path: str, manager: BuildManager) -> Tuple[str, str]:
     """Return the file names for the cache files.
@@ -1153,7 +1083,6 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
             manager.log('Metadata abandoned for {}: data cache is modified'.format(id))
         return None
 
-    # TODO: Share stat() outcome with find_module()
     path = os.path.abspath(path)
     st = manager.get_stat(path)  # TODO: Errors
     if not stat.S_ISREG(st.st_mode):
@@ -1177,8 +1106,7 @@ def validate_meta(meta: Optional[CacheMeta], id: str, path: Optional[str],
 
     mtime = int(st.st_mtime)
     if mtime != meta.mtime or path != meta.path:
-        with open(path, 'rb') as f:
-            source_hash = hashlib.md5(f.read()).hexdigest()
+        source_hash = manager.fscache.md5(path)
         if source_hash != meta.hash:
             manager.log('Metadata abandoned for {}: file {} has different hash'.format(id, path))
             return None
@@ -1400,12 +1328,8 @@ def delete_cache(id: str, path: str, manager: BuildManager) -> None:
   filesystem.
 
 e. Race conditions, where somebody modifies a file while we're
-   processing. I propose not to modify the algorithm to handle this,
-   but to detect when this could lead to inconsistencies. (For
-   example, when we decide on the dependencies based on cache
-   metadata, and then we decide to re-parse a file because of a stale
-   dependency, if the re-parsing leads to a different list of
-   dependencies we should warn the user or start over.)
+   processing. Solved by using a FileSystemCache.
+
 
 Steps
 -----
@@ -1616,7 +1540,7 @@ def __init__(self,
                 # difference and just assume 'builtins' everywhere,
                 # which simplifies code.
                 file_id = '__builtin__'
-            path = find_module(file_id, manager.lib_path)
+            path = manager.find_module_cache.find_module(file_id, manager.lib_path)
             if path:
                 # For non-stubs, look at options.follow_imports:
                 # - normal (default) -> fully analyze
@@ -1891,11 +1815,16 @@ def parse_file(self) -> None:
         if self.path and source is None:
             try:
                 path = manager.maybe_swap_for_shadow_path(self.path)
-                source, self.source_hash = read_with_python_encoding(
-                    path, self.options.python_version)
+                source = manager.fscache.read_with_python_encoding(path)
+                self.source_hash = manager.fscache.md5(path)
             except IOError as ioerr:
+                # ioerr.strerror differs for os.stat failures between Windows and
+                # other systems, but os.strerror(ioerr.errno) does not, so we use that.
+                # (We want the error messages to be platform-independent so that the
+                # tests have predictable output.)
                 raise CompileError([
-                    "mypy: can't read file '{}': {}".format(self.path, ioerr.strerror)])
+                    "mypy: can't read file '{}': {}".format(
+                        self.path, os.strerror(ioerr.errno))])
             except (UnicodeDecodeError, DecodeError) as decodeerr:
                 raise CompileError([
                     "mypy: can't decode file '{}': {}".format(self.path, str(decodeerr))])
@@ -1947,11 +1876,6 @@ def compute_dependencies(self) -> None:
         if self.id != 'builtins' and 'builtins' not in dep_line_map:
             dependencies.append('builtins')
 
-        # NOTE: What to do about race conditions (like editing the
-        # file while mypy runs)? A previous version of this code
-        # explicitly checked for this, but ran afoul of other reasons
-        # for differences (e.g. silent mode).
-
         # Missing dependencies will be moved from dependencies to
         # suppressed when they fail to be loaded in load_graph.
         self.dependencies = dependencies
 
@@ -2129,11 +2053,8 @@ def dispatch(sources: List[BuildSource], manager: BuildManager) -> Graph:
           stubs_found=sum(g.path is not None and g.path.endswith('.pyi')
                           for g in graph.values()),
           graph_load_time=(t1 - t0),
-          fm_cache_size=len(find_module_cache),
-          fm_dir_cache_size=len(find_module_dir_cache),
-          fm_listdir_cache_size=len(find_module_listdir_cache),
-          fm_is_file_cache_size=len(find_module_is_file_cache),
-          fm_isdir_cache_size=len(find_module_isdir_cache),
+          fm_cache_size=len(manager.find_module_cache.results),
+          fm_dir_cache_size=len(manager.find_module_cache.dirs),
           )
     if not graph:
         print("Nothing to do?!")
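For orientation before the remaining files, here is a minimal usage sketch of the `FindModuleCache` API introduced above. It is not part of the patch, and the search path is purely illustrative:

```python
from mypy.build import FindModuleCache
from mypy.fscache import FileSystemMetaCache

# One metadata cache can back several finders; stat()/listdir() results
# are memoized in it until flush() is called.
fscache = FileSystemMetaCache()
finder = FindModuleCache(fscache)

lib_path = ['/usr/lib/python3.6']  # illustrative search path
print(finder.find_module('os.path', lib_path))  # looked up on disk
print(finder.find_module('os.path', lib_path))  # served from finder.results

finder.clear()   # forget cached module locations only
fscache.flush()  # separately forget the underlying stat()/listdir() data
```

The split matches the class docstring: clearing the finder never clears the `FileSystemMetaCache`; that remains the client's responsibility.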
diff --git a/mypy/dmypy_server.py b/mypy/dmypy_server.py
index 4a1195d4af7d..61a3c6656ad8 100644
--- a/mypy/dmypy_server.py
+++ b/mypy/dmypy_server.py
@@ -283,12 +283,16 @@ def check_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str,
             return self.fine_grained_increment(sources)
 
     def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
-        self.fscache = FileSystemCache(self.options.python_version)
-        self.fswatcher = FileSystemWatcher(self.fscache)
+        # The file system cache we create gets passed off to
+        # BuildManager, and thence to FineGrainedBuildManager, which
+        # assumes responsibility for clearing it after updates.
+        fscache = FileSystemCache(self.options.python_version)
+        self.fswatcher = FileSystemWatcher(fscache)
         self.update_sources(sources)
         try:
             result = mypy.build.build(sources=sources,
                                       options=self.options,
+                                      fscache=fscache,
                                       alt_lib_path=self.alt_lib_path)
         except mypy.errors.CompileError as e:
             output = ''.join(s + '\n' for s in e.messages)
@@ -298,9 +302,7 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
                 out, err = '', output
             return {'out': out, 'err': err, 'status': 2}
         messages = result.errors
-        manager = result.manager
-        graph = result.graph
-        self.fine_grained_manager = FineGrainedBuildManager(manager, graph)
+        self.fine_grained_manager = FineGrainedBuildManager(result)
         self.previous_sources = sources
 
         # If we are using the fine-grained cache, build hasn't actually done
@@ -317,7 +319,6 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
                     state.path,
                     FileData(st_mtime=float(meta.mtime), st_size=meta.size, md5=meta.hash))
 
-            # Run an update
            changed, removed = self.find_changed(sources)
 
             # Find anything that has had its dependency list change
@@ -326,16 +327,14 @@ def initialize_fine_grained(self, sources: List[mypy.build.BuildSource]) -> Dict
                     assert state.path is not None
                     changed.append((state.id, state.path))
 
-            if changed or removed:
-                messages = self.fine_grained_manager.update(changed, removed)
+            # Run an update
+            messages = self.fine_grained_manager.update(changed, removed)
         else:
             # Stores the initial state of sources as a side effect.
             self.fswatcher.find_changed()
 
-        self.fscache.flush()
-
+        fscache.flush()
         status = 1 if messages else 0
-        self.previous_messages = messages[:]
         return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}
 
     def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[str, Any]:
@@ -345,19 +344,13 @@ def fine_grained_increment(self, sources: List[mypy.build.BuildSource]) -> Dict[
         self.update_sources(sources)
         changed, removed = self.find_changed(sources)
         t1 = time.time()
-        if not (changed or removed):
-            # Nothing changed -- just produce the same result as before.
-            messages = self.previous_messages
-        else:
-            messages = self.fine_grained_manager.update(changed, removed)
+        messages = self.fine_grained_manager.update(changed, removed)
         t2 = time.time()
         self.fine_grained_manager.manager.log(
             "fine-grained increment: find_changed: {:.3f}s, update: {:.3f}s".format(
                 t1 - t0, t2 - t1))
         status = 1 if messages else 0
-        self.previous_messages = messages[:]
         self.previous_sources = sources
-        self.fscache.flush()
         return {'out': ''.join(s + '\n' for s in messages), 'err': '', 'status': status}
 
     def update_sources(self, sources: List[mypy.build.BuildSource]) -> None:
diff --git a/mypy/errors.py b/mypy/errors.py
index 2a7db1da435b..5757b4a4122a 100644
--- a/mypy/errors.py
+++ b/mypy/errors.py
@@ -616,13 +616,6 @@ def __init__(self,
         self.module_with_blocker = module_with_blocker
 
 
-class DecodeError(Exception):
-    """Exception raised when a file cannot be decoded due to an unknown encoding type.
-
-    Essentially a wrapper for the LookupError raised by `bytearray.decode`
-    """
-
-
 def remove_path_prefix(path: str, prefix: str) -> str:
     """If path starts with prefix, return copy of path with the prefix
     removed. Otherwise, return path. If path is None, return None.
diff --git a/mypy/fscache.py b/mypy/fscache.py
index 947c697b38c3..75600dba2951 100644
--- a/mypy/fscache.py
+++ b/mypy/fscache.py
@@ -30,45 +30,21 @@
 
 import os
 import stat
-from typing import Tuple, Dict, List
+from typing import Tuple, Dict, List, Optional
+from mypy.util import read_with_python_encoding
 
-from mypy.build import read_with_python_encoding
-from mypy.errors import DecodeError
 
-
-class FileSystemCache:
-    def __init__(self, pyversion: Tuple[int, int]) -> None:
-        self.pyversion = pyversion
+class FileSystemMetaCache:
+    def __init__(self) -> None:
         self.flush()
 
     def flush(self) -> None:
         """Start another transaction and empty all caches."""
         self.stat_cache = {}  # type: Dict[str, os.stat_result]
         self.stat_error_cache = {}  # type: Dict[str, Exception]
-        self.read_cache = {}  # type: Dict[str, str]
-        self.read_error_cache = {}  # type: Dict[str, Exception]
-        self.hash_cache = {}  # type: Dict[str, str]
         self.listdir_cache = {}  # type: Dict[str, List[str]]
         self.listdir_error_cache = {}  # type: Dict[str, Exception]
-
-    def read_with_python_encoding(self, path: str) -> str:
-        if path in self.read_cache:
-            return self.read_cache[path]
-        if path in self.read_error_cache:
-            raise self.read_error_cache[path]
-
-        # Need to stat first so that the contents of file are from no
-        # earlier instant than the mtime reported by self.stat().
-        self.stat(path)
-
-        try:
-            data, md5hash = read_with_python_encoding(path, self.pyversion)
-        except Exception as err:
-            self.read_error_cache[path] = err
-            raise
-        self.read_cache[path] = data
-        self.hash_cache[path] = md5hash
-        return data
+        self.isfile_case_cache = {}  # type: Dict[str, bool]
 
     def stat(self, path: str) -> os.stat_result:
         if path in self.stat_cache:
@@ -97,11 +73,40 @@ def listdir(self, path: str) -> List[str]:
         return results
 
     def isfile(self, path: str) -> bool:
-        st = self.stat(path)
+        try:
+            st = self.stat(path)
+        except OSError:
+            return False
         return stat.S_ISREG(st.st_mode)
 
+    def isfile_case(self, path: str) -> bool:
+        """Return whether path exists and is a file.
+
+        On case-insensitive filesystems (like Mac or Windows) this returns
+        False if the case of the path's last component does not exactly
+        match the case found in the filesystem.
+        TODO: We should maybe check the case for some directory components also,
+        to avoid permitting wrongly-cased *packages*.
+ """ + if path in self.isfile_case_cache: + return self.isfile_case_cache[path] + head, tail = os.path.split(path) + if not tail: + res = False + else: + try: + names = self.listdir(head) + res = tail in names and self.isfile(path) + except OSError: + res = False + self.isfile_case_cache[path] = res + return res + def isdir(self, path: str) -> bool: - st = self.stat(path) + try: + st = self.stat(path) + except OSError: + return False return stat.S_ISDIR(st.st_mode) def exists(self, path: str) -> bool: @@ -111,6 +116,38 @@ def exists(self, path: str) -> bool: return False return True + +class FileSystemCache(FileSystemMetaCache): + def __init__(self, pyversion: Tuple[int, int]) -> None: + self.pyversion = pyversion + self.flush() + + def flush(self) -> None: + """Start another transaction and empty all caches.""" + super().flush() + self.read_cache = {} # type: Dict[str, str] + self.read_error_cache = {} # type: Dict[str, Exception] + self.hash_cache = {} # type: Dict[str, str] + + def read_with_python_encoding(self, path: str) -> str: + if path in self.read_cache: + return self.read_cache[path] + if path in self.read_error_cache: + raise self.read_error_cache[path] + + # Need to stat first so that the contents of file are from no + # earlier instant than the mtime reported by self.stat(). + self.stat(path) + + try: + data, md5hash = read_with_python_encoding(path, self.pyversion) + except Exception as err: + self.read_error_cache[path] = err + raise + self.read_cache[path] = data + self.hash_cache[path] = md5hash + return data + def md5(self, path: str) -> str: if path not in self.hash_cache: self.read_with_python_encoding(path) diff --git a/mypy/main.py b/mypy/main.py index e31abe086a0c..6a74f3c9be00 100644 --- a/mypy/main.py +++ b/mypy/main.py @@ -535,7 +535,8 @@ def add_invertible_flag(flag: str, .format(special_opts.package)) options.build_type = BuildType.MODULE lib_path = [os.getcwd()] + build.mypy_path() - targets = build.find_modules_recursive(special_opts.package, lib_path) + # TODO: use the same cache as the BuildManager will + targets = build.FindModuleCache().find_modules_recursive(special_opts.package, lib_path) if not targets: fail("Can't find package '{}'".format(special_opts.package)) return targets, options @@ -548,6 +549,7 @@ def add_invertible_flag(flag: str, return targets, options +# TODO: use a FileSystemCache for this def create_source_list(files: Sequence[str], options: Options) -> List[BuildSource]: targets = [] for f in files: diff --git a/mypy/server/update.py b/mypy/server/update.py index fde9d311b938..f3c0d48891ac 100644 --- a/mypy/server/update.py +++ b/mypy/server/update.py @@ -114,9 +114,9 @@ Major todo items: - Fully support multiple type checking passes -- Use mypy.fscache to access file system """ +import os import time import os.path from typing import ( @@ -124,7 +124,7 @@ ) from mypy.build import ( - BuildManager, State, BuildSource, Graph, load_graph, find_module_clear_caches, + BuildManager, State, BuildSource, BuildResult, Graph, load_graph, PRI_INDIRECT, DEBUG_FINE_GRAINED, ) from mypy.checker import DeferredNode @@ -135,6 +135,7 @@ ) from mypy.options import Options from mypy.types import Type +from mypy.fscache import FileSystemCache from mypy.semanal import apply_semantic_analyzer_patches from mypy.server.astdiff import ( snapshot_symbol_table, compare_symbol_table_snapshots, SnapshotItem @@ -150,21 +151,23 @@ class FineGrainedBuildManager: - def __init__(self, - manager: BuildManager, - graph: Graph) -> None: + def __init__(self, result: 
diff --git a/mypy/main.py b/mypy/main.py
index e31abe086a0c..6a74f3c9be00 100644
--- a/mypy/main.py
+++ b/mypy/main.py
@@ -535,7 +535,8 @@ def add_invertible_flag(flag: str,
                  .format(special_opts.package))
         options.build_type = BuildType.MODULE
         lib_path = [os.getcwd()] + build.mypy_path()
-        targets = build.find_modules_recursive(special_opts.package, lib_path)
+        # TODO: use the same cache as the BuildManager will
+        targets = build.FindModuleCache().find_modules_recursive(special_opts.package, lib_path)
         if not targets:
             fail("Can't find package '{}'".format(special_opts.package))
         return targets, options
@@ -548,6 +549,7 @@ def add_invertible_flag(flag: str,
     return targets, options
 
 
+# TODO: use a FileSystemCache for this
 def create_source_list(files: Sequence[str], options: Options) -> List[BuildSource]:
     targets = []
     for f in files:
diff --git a/mypy/server/update.py b/mypy/server/update.py
index fde9d311b938..f3c0d48891ac 100644
--- a/mypy/server/update.py
+++ b/mypy/server/update.py
@@ -114,9 +114,9 @@
 Major todo items:
 
 - Fully support multiple type checking passes
-- Use mypy.fscache to access file system
 """
 
+import os
 import time
 import os.path
 from typing import (
@@ -124,7 +124,7 @@
 )
 
 from mypy.build import (
-    BuildManager, State, BuildSource, Graph, load_graph, find_module_clear_caches,
+    BuildManager, State, BuildSource, BuildResult, Graph, load_graph,
     PRI_INDIRECT, DEBUG_FINE_GRAINED,
 )
 from mypy.checker import DeferredNode
@@ -135,6 +135,7 @@
 )
 from mypy.options import Options
 from mypy.types import Type
+from mypy.fscache import FileSystemCache
 from mypy.semanal import apply_semantic_analyzer_patches
 from mypy.server.astdiff import (
     snapshot_symbol_table, compare_symbol_table_snapshots, SnapshotItem
@@ -150,21 +151,23 @@
 
 
 class FineGrainedBuildManager:
-    def __init__(self,
-                 manager: BuildManager,
-                 graph: Graph) -> None:
+    def __init__(self, result: BuildResult) -> None:
         """Initialize fine-grained build based on a batch build.
 
         Args:
+            result: Result from the initialized build.
+                    The manager and graph will be taken over by this class.
             manager: State of the build (mutated by this class)
             graph: Additional state of the build (only read to initialize state)
         """
+        manager = result.manager
         self.manager = manager
+        self.graph = result.graph
         self.options = manager.options
         self.previous_modules = get_module_to_path_map(manager)
-        self.deps = get_all_dependencies(manager, graph, self.options)
+        self.deps = get_all_dependencies(manager, self.graph, self.options)
         self.previous_targets_with_errors = manager.errors.targets()
-        self.graph = graph
+        self.previous_messages = result.errors[:]
         # Module, if any, that had blocking errors in the last run as (id, path) tuple.
         # TODO: Handle blocking errors in the initial build
         self.blocking_error = None  # type: Optional[Tuple[str, str]]
@@ -205,13 +208,15 @@ def update(self,
             A list of errors.
         """
         changed_modules = changed_modules + removed_modules
-        assert changed_modules or removed_modules, 'No changed modules'
-
         removed_set = {module for module, _ in removed_modules}
         self.changed_modules = changed_modules
 
-        # Reset global caches for the new build.
-        find_module_clear_caches()
+        if not changed_modules:
+            self.manager.fscache.flush()
+            return self.previous_messages
+
+        # Reset find_module's caches for the new build.
+        self.manager.find_module_cache.clear()
 
         self.triggered = []
         self.updated_modules = []
@@ -249,8 +254,10 @@ def update(self,
             if blocker:
                 self.blocking_error = (next_id, next_path)
                 self.stale = changed_modules
-                return messages
+                break
 
+        self.manager.fscache.flush()
+        self.previous_messages = messages[:]
         return messages
 
     def update_single(self,
@@ -383,7 +390,7 @@ def update_single_isolated(module: str,
         manager.log_fine_grained('new module %r' % module)
 
     old_modules = dict(manager.modules)
-    sources = get_sources(previous_modules, [(module, path)])
+    sources = get_sources(manager.fscache, previous_modules, [(module, path)])
 
     if module in manager.missing_modules:
         manager.missing_modules.remove(module)
@@ -407,7 +414,7 @@ def update_single_isolated(module: str,
             remaining_modules = []
         return BlockedUpdate(err.module_with_blocker, path, remaining_modules, err.messages)
 
-    if not os.path.isfile(path) or force_removed:
+    if not manager.fscache.isfile(path) or force_removed:
         delete_module(module, graph, manager)
         return NormalUpdate(module, path, [], None)
@@ -537,13 +544,12 @@ def get_module_to_path_map(manager: BuildManager) -> Dict[str, str]:
             for module, node in manager.modules.items()}
 
 
-def get_sources(modules: Dict[str, str],
+def get_sources(fscache: FileSystemCache,
+                modules: Dict[str, str],
                 changed_modules: List[Tuple[str, str]]) -> List[BuildSource]:
-    # TODO: Race condition when reading from the file system; we should only read each
-    # bit of external state once during a build to have a consistent view of the world
     sources = []
     for id, path in changed_modules:
-        if os.path.isfile(path):
+        if fscache.isfile(path):
             sources.append(BuildSource(path, id, None))
     return sources
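Taken together with the dmypy_server.py changes earlier in this patch, the driver flow now looks roughly like the following sketch; the options and target file are placeholders, not code from the patch:

```python
import mypy.build
from mypy.build import BuildSource
from mypy.fscache import FileSystemCache
from mypy.options import Options
from mypy.server.update import FineGrainedBuildManager

options = Options()
options.fine_grained_incremental = True
sources = [BuildSource('prog.py', None, None)]  # hypothetical target file

# The same cache serves the batch build and later fine-grained updates.
fscache = FileSystemCache(options.python_version)
result = mypy.build.build(sources=sources, options=options, fscache=fscache)

# FineGrainedBuildManager now consumes the whole BuildResult, taking over
# its manager, graph, and initial error list (result.errors).
fgmanager = FineGrainedBuildManager(result)

# An empty change set is now legal: update() replays previous_messages and
# flushes manager.fscache itself, so callers no longer do either.
messages = fgmanager.update([], [])
```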
diff --git a/mypy/stubgen.py b/mypy/stubgen.py
index bcb704e0a168..bb9112d5dc67 100755
--- a/mypy/stubgen.py
+++ b/mypy/stubgen.py
@@ -156,7 +156,7 @@ def find_module_path_and_all(module: str, pyversion: Tuple[int, int],
             module_all = getattr(mod, '__all__', None)
     else:
         # Find module by going through search path.
-        module_path = mypy.build.find_module(module, ['.'] + search_path)
+        module_path = mypy.build.FindModuleCache().find_module(module, ['.'] + search_path)
         if not module_path:
             raise SystemExit(
                 "Can't find module '{}' (consider using --search-path)".format(module))
@@ -201,7 +201,7 @@ def generate_stub(path: str,
                   include_private: bool = False
                   ) -> None:
 
-    source, _ = mypy.build.read_with_python_encoding(path, pyversion)
+    source, _ = mypy.util.read_with_python_encoding(path, pyversion)
     options = MypyOptions()
     options.python_version = pyversion
     try:
diff --git a/mypy/test/testcheck.py b/mypy/test/testcheck.py
index bf811a035a03..d406ff6ade44 100644
--- a/mypy/test/testcheck.py
+++ b/mypy/test/testcheck.py
@@ -7,7 +7,7 @@
 from typing import Dict, List, Optional, Set, Tuple
 
 from mypy import build, defaults
-from mypy.build import BuildSource, find_module_clear_caches
+from mypy.build import BuildSource
 from mypy.test.config import test_temp_dir
 from mypy.test.data import DataDrivenTestCase, DataSuite
 from mypy.test.helpers import (
@@ -114,7 +114,6 @@ def clear_cache(self) -> None:
             shutil.rmtree(dn)
 
     def run_case_once(self, testcase: DataDrivenTestCase, incremental_step: int = 0) -> None:
-        find_module_clear_caches()
        original_program_text = '\n'.join(testcase.input)
         module_data = self.parse_module(original_program_text, incremental_step)
 
@@ -292,7 +291,7 @@ def parse_module(self,
             module_names = m.group(1)
             out = []
             for module_name in module_names.split(' '):
-                path = build.find_module(module_name, [test_temp_dir])
+                path = build.FindModuleCache().find_module(module_name, [test_temp_dir])
                 assert path is not None, "Can't find ad hoc case file"
                 with open(path) as f:
                     program_text = f.read()
diff --git a/mypy/test/testdmypy.py b/mypy/test/testdmypy.py
index a51085645186..e5bfdf231bc3 100644
--- a/mypy/test/testdmypy.py
+++ b/mypy/test/testdmypy.py
@@ -82,7 +82,6 @@ def clear_cache(self) -> None:
 
     def run_case_once(self, testcase: DataDrivenTestCase, incremental_step: int) -> None:
         assert incremental_step >= 1
-        build.find_module_clear_caches()
         original_program_text = '\n'.join(testcase.input)
 
         if incremental_step > 1:
@@ -261,7 +260,7 @@ def parse_module(self,
             module_names = m.group(1)
             out = []  # type: List[Tuple[str, str, Optional[str]]]
             for module_name in module_names.split(' '):
-                path = build.find_module(module_name, [test_temp_dir])
+                path = build.FindModuleCache().find_module(module_name, [test_temp_dir])
                 if path is None and module_name.startswith(NON_EXISTENT_PREFIX):
                     # This is a special name for a file that we don't want to exist.
                     assert '.' not in module_name  # TODO: Packages not supported here
diff --git a/mypy/test/testfinegrained.py b/mypy/test/testfinegrained.py
index 034404449828..69bef72eee2a 100644
--- a/mypy/test/testfinegrained.py
+++ b/mypy/test/testfinegrained.py
@@ -13,7 +13,7 @@
 from typing import List, Set, Tuple, Optional, cast
 
 from mypy import build
-from mypy.build import BuildManager, BuildSource, Graph
+from mypy.build import BuildManager, BuildSource
 from mypy.errors import CompileError
 from mypy.options import Options
 from mypy.server.update import FineGrainedBuildManager
diff --git a/mypy/test/testgraph.py b/mypy/test/testgraph.py
index 33d10c0ae1ee..d5b738aa3ff9 100644
--- a/mypy/test/testgraph.py
+++ b/mypy/test/testgraph.py
@@ -10,6 +10,7 @@
 from mypy.report import Reports
 from mypy.plugin import Plugin
 from mypy.errors import Errors
+from mypy.fscache import FileSystemCache
 
 
 class GraphSuite(Suite):
@@ -38,6 +39,7 @@ def test_scc(self) -> None:
     def _make_manager(self) -> BuildManager:
         errors = Errors()
         options = Options()
+        fscache = FileSystemCache(options.python_version)
         manager = BuildManager(
             data_dir='',
             lib_path=[],
@@ -49,6 +51,7 @@ def _make_manager(self) -> BuildManager:
             plugin=Plugin(options),
             errors=errors,
             flush_errors=lambda msgs, serious: None,
+            fscache=fscache,
         )
         return manager
diff --git a/mypy/test/testmerge.py b/mypy/test/testmerge.py
index 4f0f6c2e0fa3..a2d29845dc7f 100644
--- a/mypy/test/testmerge.py
+++ b/mypy/test/testmerge.py
@@ -5,7 +5,7 @@
 from typing import List, Tuple, Dict, Optional
 
 from mypy import build
-from mypy.build import BuildManager, BuildSource, State, Graph
+from mypy.build import BuildManager, BuildSource, BuildResult, State, Graph
 from mypy.defaults import PYTHON3_VERSION
 from mypy.errors import Errors, CompileError
 from mypy.nodes import (
@@ -69,19 +69,20 @@ def run_case(self, testcase: DataDrivenTestCase) -> None:
             kind = AST
 
         main_src = '\n'.join(testcase.input)
-        messages, manager, graph = self.build(main_src)
-        assert manager is not None, 'cases where CompileError occurred should not be run'
-        fine_grained_manager = FineGrainedBuildManager(manager, graph)
+        result = self.build(main_src)
+        assert result is not None, 'cases where CompileError occurred should not be run'
+        result.manager.fscache.flush()
+        fine_grained_manager = FineGrainedBuildManager(result)
 
         a = []
-        if messages:
-            a.extend(messages)
+        if result.errors:
+            a.extend(result.errors)
 
         target_path = os.path.join(test_temp_dir, 'target.py')
         shutil.copy(os.path.join(test_temp_dir, 'target.py.next'), target_path)
 
         a.extend(self.dump(fine_grained_manager, kind))
-        old_subexpr = get_subexpressions(manager.modules['target'])
+        old_subexpr = get_subexpressions(result.manager.modules['target'])
 
         a.append('==>')
 
@@ -102,7 +103,7 @@ def run_case(self, testcase: DataDrivenTestCase) -> None:
             'Invalid output ({}, line {})'.format(
                 testcase.file, testcase.line))
 
-    def build(self, source: str) -> Tuple[List[str], Optional[BuildManager], Dict[str, State]]:
+    def build(self, source: str) -> Optional[BuildResult]:
         options = Options()
         options.incremental = True
         options.fine_grained_incremental = True
@@ -118,8 +119,8 @@ def build(self, source: str) -> Tuple[List[str], Optional[BuildManager], Dict[st
                                alt_lib_path=test_temp_dir)
         except CompileError as e:
             # TODO: Is it okay to return None?
-            return e.messages, None, {}
-        return result.errors, result.manager, result.graph
+            return None
+        return result
 
     def build_increment(self, manager: FineGrainedBuildManager,
                         module_id: str, path: str) -> Tuple[MypyFile,
diff --git a/mypy/util.py b/mypy/util.py
index a516041ec06c..7a37b1a1b7e3 100644
--- a/mypy/util.py
+++ b/mypy/util.py
@@ -2,6 +2,7 @@
 
 import re
 import subprocess
+import hashlib
 from xml.sax.saxutils import escape
 from typing import TypeVar, List, Tuple, Optional, Sequence, Dict
 
@@ -60,6 +61,49 @@ def find_python_encoding(text: bytes, pyversion: Tuple[int, int]) -> Tuple[str,
     return default_encoding, -1
 
 
+class DecodeError(Exception):
+    """Exception raised when a file cannot be decoded due to an unknown encoding type.
+
+    Essentially a wrapper for the LookupError raised by `bytearray.decode`
+    """
+
+
+def read_with_python_encoding(path: str, pyversion: Tuple[int, int]) -> Tuple[str, str]:
+    """Read the Python file while obeying PEP-263 encoding detection.
+
+    Returns:
+      A tuple: the source as a string, and the hash calculated from the binary representation.
+    """
+    source_bytearray = bytearray()
+    encoding = 'utf8' if pyversion[0] >= 3 else 'ascii'
+
+    with open(path, 'rb') as f:
+        # read first two lines and check if PEP-263 coding is present
+        source_bytearray.extend(f.readline())
+        source_bytearray.extend(f.readline())
+        m = hashlib.md5(source_bytearray)
+
+        # check for BOM UTF-8 encoding and strip it out if present
+        if source_bytearray.startswith(b'\xef\xbb\xbf'):
+            encoding = 'utf8'
+            source_bytearray = source_bytearray[3:]
+        else:
+            _encoding, _ = find_python_encoding(source_bytearray, pyversion)
+            # check that the coding isn't mypy. We skip it since
+            # registering may not have happened yet
+            if _encoding != 'mypy':
+                encoding = _encoding
+
+        remainder = f.read()
+        m.update(remainder)
+        source_bytearray.extend(remainder)
+        try:
+            source_text = source_bytearray.decode(encoding)
+        except LookupError as lookuperr:
+            raise DecodeError(str(lookuperr))
+        return source_text, m.hexdigest()
+
+
 _python2_interpreter = None  # type: Optional[str]
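To close, a small sketch of the relocated helper in its new home; the file name is hypothetical, and `DecodeError` now travels with the function in `mypy.util`:

```python
from mypy.util import DecodeError, read_with_python_encoding

try:
    # Returns the decoded source plus the md5 of the raw bytes, honoring
    # PEP 263 coding cookies and a UTF-8 BOM.
    source, md5hash = read_with_python_encoding('example.py', (3, 6))
except DecodeError as err:
    print("unknown encoding: {}".format(err))
else:
    print("{} chars, md5 {}".format(len(source), md5hash))
```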