From 290f013dc11a955ff2d87e3610d873c0fef71e55 Mon Sep 17 00:00:00 2001 From: Hugues Date: Fri, 20 May 2022 08:56:30 -0700 Subject: [PATCH] FindModuleCache: optionally leverage BuildSourceSet (#12616) Given a large codebase with folder hierarchy of the form ``` foo/ company/ __init__.py foo/ bar/ company/ __init__.py bar/ baz/ company/ __init__.py baz/ ... ``` with >100 toplevel folders, the time spent in load_graph is dominated by find_module because this operation is itself O(n) where n is the number of input files, which ends up being O(n**2) because it is called for every import statement in the codebase and the way find_module work, it will always scan through each and every one of those toplevel directories for each and every import statement of company.* Introduce a fast path that leverages the fact that for imports within the code being typechecked, we already have a mapping of module import path to file path in BuildSourceSet Gated behind a command line flag (--fast-module-lookup) to assuage concerns about subtle issues in module lookup being introduced by this fast path. --- docs/source/command_line.rst | 23 +++++ docs/source/running_mypy.rst | 1 + mypy/build.py | 34 +------ mypy/main.py | 4 + mypy/modulefinder.py | 114 ++++++++++++++++++++- mypy/options.py | 2 + mypy/test/testcheck.py | 1 + test-data/unit/check-modules-fast.test | 136 +++++++++++++++++++++++++ 8 files changed, 283 insertions(+), 32 deletions(-) create mode 100644 test-data/unit/check-modules-fast.test diff --git a/docs/source/command_line.rst b/docs/source/command_line.rst index 1a35d81a7ee9..908fa799da46 100644 --- a/docs/source/command_line.rst +++ b/docs/source/command_line.rst @@ -212,6 +212,29 @@ imports. By default, mypy will suppress any error messages generated within :pep:`561` compliant packages. Adding this flag will disable this behavior. +.. option:: --fast-module-lookup + + The default logic used to scan through search paths to resolve imports has a + quadratic worse-case behavior in some cases, which is for instance triggered + by a large number of folders sharing a top-level namespace as in: + + foo/ + company/ + foo/ + a.py + bar/ + company/ + bar/ + b.py + baz/ + company/ + baz/ + c.py + ... + + If you are in this situation, you can enable an experimental fast path by + setting the :option:`--fast-module-lookup` option. + .. _platform-configuration: diff --git a/docs/source/running_mypy.rst b/docs/source/running_mypy.rst index 46ad2c65c386..caf05dcdf258 100644 --- a/docs/source/running_mypy.rst +++ b/docs/source/running_mypy.rst @@ -516,6 +516,7 @@ same directory on the search path, only the stub file is used. (However, if the files are in different directories, the one found in the earlier directory is used.) + Other advice and best practices ******************************* diff --git a/mypy/build.py b/mypy/build.py index c0b9aff5ab32..1196356d5bb8 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -43,8 +43,8 @@ from mypy.report import Reports # Avoid unconditional slow import from mypy.fixup import fixup_module from mypy.modulefinder import ( - BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult, - ModuleNotFoundReason + BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths, + ModuleSearchResult, ModuleNotFoundReason ) from mypy.nodes import Expression from mypy.options import Options @@ -107,33 +107,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None: self.errors: List[str] = [] # Filled in by build if desired -class BuildSourceSet: - """Efficiently test a file's membership in the set of build sources.""" - - def __init__(self, sources: List[BuildSource]) -> None: - self.source_text_present = False - self.source_modules: Set[str] = set() - self.source_paths: Set[str] = set() - - for source in sources: - if source.text is not None: - self.source_text_present = True - elif source.path: - self.source_paths.add(source.path) - else: - self.source_modules.add(source.module) - - def is_source(self, file: MypyFile) -> bool: - if file.path and file.path in self.source_paths: - return True - elif file._fullname in self.source_modules: - return True - elif self.source_text_present: - return True - else: - return False - - def build(sources: List[BuildSource], options: Options, alt_lib_path: Optional[str] = None, @@ -630,7 +603,8 @@ def __init__(self, data_dir: str, or options.use_fine_grained_cache) and not has_reporters) self.fscache = fscache - self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options) + self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options, + source_set=self.source_set) self.metastore = create_metastore(options) # a mapping from source files to their corresponding shadow files diff --git a/mypy/main.py b/mypy/main.py index c735bb389a35..57727821274e 100644 --- a/mypy/main.py +++ b/mypy/main.py @@ -881,6 +881,10 @@ def add_invertible_flag(flag: str, '--explicit-package-bases', default=False, help="Use current directory and MYPYPATH to determine module names of files passed", group=code_group) + add_invertible_flag( + '--fast-module-lookup', default=False, + help=argparse.SUPPRESS, + group=code_group) code_group.add_argument( "--exclude", action="append", diff --git a/mypy/modulefinder.py b/mypy/modulefinder.py index 4ab95dd6564f..43cc4fc0a6d3 100644 --- a/mypy/modulefinder.py +++ b/mypy/modulefinder.py @@ -23,6 +23,7 @@ from typing_extensions import Final, TypeAlias as _TypeAlias from mypy.fscache import FileSystemCache +from mypy.nodes import MypyFile from mypy.options import Options from mypy.stubinfo import is_legacy_bundled_package from mypy import pyinfo @@ -126,6 +127,33 @@ def __repr__(self) -> str: self.base_dir) +class BuildSourceSet: + """Helper to efficiently test a file's membership in a set of build sources.""" + + def __init__(self, sources: List[BuildSource]) -> None: + self.source_text_present = False + self.source_modules = {} # type: Dict[str, str] + self.source_paths = set() # type: Set[str] + + for source in sources: + if source.text is not None: + self.source_text_present = True + if source.path: + self.source_paths.add(source.path) + if source.module: + self.source_modules[source.module] = source.path or '' + + def is_source(self, file: MypyFile) -> bool: + if file.path and file.path in self.source_paths: + return True + elif file._fullname in self.source_modules: + return True + elif self.source_text_present: + return True + else: + return False + + class FindModuleCache: """Module finder with integrated cache. @@ -141,8 +169,10 @@ def __init__(self, search_paths: SearchPaths, fscache: Optional[FileSystemCache], options: Optional[Options], - stdlib_py_versions: Optional[StdlibVersions] = None) -> None: + stdlib_py_versions: Optional[StdlibVersions] = None, + source_set: Optional[BuildSourceSet] = None) -> None: self.search_paths = search_paths + self.source_set = source_set self.fscache = fscache or FileSystemCache() # Cache for get_toplevel_possibilities: # search_paths -> (toplevel_id -> list(package_dirs)) @@ -164,6 +194,53 @@ def clear(self) -> None: self.initial_components.clear() self.ns_ancestors.clear() + def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]: + """Fast path to find modules by looking through the input sources + + This is only used when --fast-module-lookup is passed on the command line.""" + if not self.source_set: + return None + + p = self.source_set.source_modules.get(id, None) + if p and self.fscache.isfile(p): + # We need to make sure we still have __init__.py all the way up + # otherwise we might have false positives compared to slow path + # in case of deletion of init files, which is covered by some tests. + # TODO: are there some combination of flags in which this check should be skipped? + d = os.path.dirname(p) + for _ in range(id.count('.')): + if not any(self.fscache.isfile(os.path.join(d, '__init__' + x)) + for x in PYTHON_EXTENSIONS): + return None + d = os.path.dirname(d) + return p + + idx = id.rfind('.') + if idx != -1: + # When we're looking for foo.bar.baz and can't find a matching module + # in the source set, look up for a foo.bar module. + parent = self.find_module_via_source_set(id[:idx]) + if parent is None or not isinstance(parent, str): + return None + + basename, ext = os.path.splitext(parent) + if (not any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS) + and (ext in PYTHON_EXTENSIONS and not self.fscache.isdir(basename))): + # If we do find such a *module* (and crucially, we don't want a package, + # hence the filtering out of __init__ files, and checking for the presence + # of a folder with a matching name), then we can be pretty confident that + # 'baz' will either be a top-level variable in foo.bar, or will not exist. + # + # Either way, spelunking in other search paths for another 'foo.bar.baz' + # module should be avoided because: + # 1. in the unlikely event that one were found, it's highly likely that + # it would be unrelated to the source being typechecked and therefore + # more likely to lead to erroneous results + # 2. as described in _find_module, in some cases the search itself could + # potentially waste significant amounts of time + return ModuleNotFoundReason.NOT_FOUND + return None + def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs: """Find which elements of a lib_path have the directory a module needs to exist. @@ -229,7 +306,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult elif top_level in self.stdlib_py_versions: use_typeshed = self._typeshed_has_version(top_level) self.results[id] = self._find_module(id, use_typeshed) - if (not fast_path + if (not (fast_path or (self.options is not None and self.options.fast_module_lookup)) and self.results[id] is ModuleNotFoundReason.NOT_FOUND and self._can_find_module_in_parent_dir(id)): self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY @@ -295,6 +372,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool: def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult: fscache = self.fscache + # Fast path for any modules in the current source set. + # This is particularly important when there are a large number of search + # paths which share the first (few) component(s) due to the use of namespace + # packages, for instance: + # foo/ + # company/ + # __init__.py + # foo/ + # bar/ + # company/ + # __init__.py + # bar/ + # baz/ + # company/ + # __init__.py + # baz/ + # + # mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input + # and computes [foo, bar, baz, ...] as the module search path. + # + # This would result in O(n) search for every import of company.*, leading to + # O(n**2) behavior in load_graph as such imports are unsurprisingly present + # at least once, and usually many more times than that, in each and every file + # being parsed. + # + # Thankfully, such cases are efficiently handled by looking up the module path + # via BuildSourceSet. + p = (self.find_module_via_source_set(id) + if (self.options is not None and self.options.fast_module_lookup) + else None) + if p: + return p + # If we're looking for a module like 'foo.bar.baz', it's likely that most of the # many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover # that only once and cache it for when we look for modules like 'foo.bar.blah' diff --git a/mypy/options.py b/mypy/options.py index b8bc53feb89c..254af61a0645 100644 --- a/mypy/options.py +++ b/mypy/options.py @@ -293,6 +293,8 @@ def __init__(self) -> None: self.cache_map: Dict[str, Tuple[str, str]] = {} # Don't properly free objects on exit, just kill the current process. self.fast_exit = True + # fast path for finding modules from source set + self.fast_module_lookup = False # Used to transform source code before parsing if not None # TODO: Make the type precise (AnyStr -> AnyStr) self.transform_source: Optional[Callable[[Any], Any]] = None diff --git a/mypy/test/testcheck.py b/mypy/test/testcheck.py index f0dc4bc6a671..7b995b9cc86b 100644 --- a/mypy/test/testcheck.py +++ b/mypy/test/testcheck.py @@ -40,6 +40,7 @@ 'check-multiple-inheritance.test', 'check-super.test', 'check-modules.test', + 'check-modules-fast.test', 'check-typevar-values.test', 'check-unsupported.test', 'check-unreachable-code.test', diff --git a/test-data/unit/check-modules-fast.test b/test-data/unit/check-modules-fast.test new file mode 100644 index 000000000000..875125c6532b --- /dev/null +++ b/test-data/unit/check-modules-fast.test @@ -0,0 +1,136 @@ +-- Type checker test cases dealing with module lookup edge cases +-- to ensure that --fast-module-lookup matches regular lookup behavior + +[case testModuleLookup] +# flags: --fast-module-lookup +import m +reveal_type(m.a) # N: Revealed type is "m.A" + +[file m.py] +class A: pass +a = A() + +[case testModuleLookupStub] +# flags: --fast-module-lookup +import m +reveal_type(m.a) # N: Revealed type is "m.A" + +[file m.pyi] +class A: pass +a = A() + +[case testModuleLookupFromImport] +# flags: --fast-module-lookup +from m import a +reveal_type(a) # N: Revealed type is "m.A" + +[file m.py] +class A: pass +a = A() + +[case testModuleLookupStubFromImport] +# flags: --fast-module-lookup +from m import a +reveal_type(a) # N: Revealed type is "m.A" + +[file m.pyi] +class A: pass +a = A() + + +[case testModuleLookupWeird] +# flags: --fast-module-lookup +from m import a +reveal_type(a) # N: Revealed type is "builtins.object" +reveal_type(a.b) # N: Revealed type is "m.a.B" + +[file m.py] +class A: pass +a = A() + +[file m/__init__.py] +[file m/a.py] +class B: pass +b = B() + + +[case testModuleLookupWeird2] +# flags: --fast-module-lookup +from m.a import b +reveal_type(b) # N: Revealed type is "m.a.B" + +[file m.py] +class A: pass +a = A() + +[file m/__init__.py] +[file m/a.py] +class B: pass +b = B() + + +[case testModuleLookupWeird3] +# flags: --fast-module-lookup +from m.a import b +reveal_type(b) # N: Revealed type is "m.a.B" + +[file m.py] +class A: pass +a = A() +[file m/__init__.py] +class B: pass +a = B() +[file m/a.py] +class B: pass +b = B() + + +[case testModuleLookupWeird4] +# flags: --fast-module-lookup +import m.a +m.a.b # E: "str" has no attribute "b" + +[file m.py] +class A: pass +a = A() +[file m/__init__.py] +class B: pass +a = 'foo' +b = B() +[file m/a.py] +class C: pass +b = C() + + +[case testModuleLookupWeird5] +# flags: --fast-module-lookup +import m.a as ma +reveal_type(ma.b) # N: Revealed type is "m.a.C" + +[file m.py] +class A: pass +a = A() +[file m/__init__.py] +class B: pass +a = 'foo' +b = B() +[file m/a.py] +class C: pass +b = C() + + +[case testModuleLookupWeird6] +# flags: --fast-module-lookup +from m.a import b +reveal_type(b) # N: Revealed type is "m.a.C" + +[file m.py] +class A: pass +a = A() +[file m/__init__.py] +class B: pass +a = 'foo' +b = B() +[file m/a.py] +class C: pass +b = C()