Skip to content

Commit

Permalink
FindModuleCache: optionally leverage BuildSourceSet (#12616)
Browse files Browse the repository at this point in the history
Given a large codebase with folder hierarchy of the form

```
foo/
    company/
        __init__.py
        foo/
bar/
    company/
        __init__.py
        bar/
baz/
    company/
        __init__.py
        baz/
    ...
```

with >100 toplevel folders, the time spent in load_graph
is dominated by find_module because this operation is
itself O(n) where n is the number of input files, which
ends up being O(n**2) because it is called for every import
statement in the codebase and the way find_module work,
it will always scan through each and every one of those
toplevel directories for each and every import statement
of company.*

Introduce a fast path that leverages the fact that for
imports within the code being typechecked, we already
have a mapping of module import path to file path in
BuildSourceSet

Gated behind a command line flag (--fast-module-lookup) to 
assuage concerns about subtle issues in module lookup being 
introduced by this fast path.
  • Loading branch information
huguesb committed May 20, 2022
1 parent 37fb21a commit a6166b2
Show file tree
Hide file tree
Showing 8 changed files with 283 additions and 32 deletions.
23 changes: 23 additions & 0 deletions docs/source/command_line.rst
Expand Up @@ -212,6 +212,29 @@ imports.
By default, mypy will suppress any error messages generated within :pep:`561`
compliant packages. Adding this flag will disable this behavior.

.. option:: --fast-module-lookup

The default logic used to scan through search paths to resolve imports has a
quadratic worse-case behavior in some cases, which is for instance triggered
by a large number of folders sharing a top-level namespace as in:

foo/
company/
foo/
a.py
bar/
company/
bar/
b.py
baz/
company/
baz/
c.py
...

If you are in this situation, you can enable an experimental fast path by
setting the :option:`--fast-module-lookup` option.


.. _platform-configuration:

Expand Down
1 change: 1 addition & 0 deletions docs/source/running_mypy.rst
Expand Up @@ -516,6 +516,7 @@ same directory on the search path, only the stub file is used.
(However, if the files are in different directories, the one found
in the earlier directory is used.)


Other advice and best practices
*******************************

Expand Down
34 changes: 4 additions & 30 deletions mypy/build.py
Expand Up @@ -43,8 +43,8 @@
from mypy.report import Reports # Avoid unconditional slow import
from mypy.fixup import fixup_module
from mypy.modulefinder import (
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
ModuleNotFoundReason
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths,
ModuleSearchResult, ModuleNotFoundReason
)
from mypy.nodes import Expression
from mypy.options import Options
Expand Down Expand Up @@ -107,33 +107,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
self.errors: List[str] = [] # Filled in by build if desired


class BuildSourceSet:
"""Efficiently test a file's membership in the set of build sources."""

def __init__(self, sources: List[BuildSource]) -> None:
self.source_text_present = False
self.source_modules: Set[str] = set()
self.source_paths: Set[str] = set()

for source in sources:
if source.text is not None:
self.source_text_present = True
elif source.path:
self.source_paths.add(source.path)
else:
self.source_modules.add(source.module)

def is_source(self, file: MypyFile) -> bool:
if file.path and file.path in self.source_paths:
return True
elif file._fullname in self.source_modules:
return True
elif self.source_text_present:
return True
else:
return False


def build(sources: List[BuildSource],
options: Options,
alt_lib_path: Optional[str] = None,
Expand Down Expand Up @@ -630,7 +603,8 @@ def __init__(self, data_dir: str,
or options.use_fine_grained_cache)
and not has_reporters)
self.fscache = fscache
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
source_set=self.source_set)
self.metastore = create_metastore(options)

# a mapping from source files to their corresponding shadow files
Expand Down
4 changes: 4 additions & 0 deletions mypy/main.py
Expand Up @@ -881,6 +881,10 @@ def add_invertible_flag(flag: str,
'--explicit-package-bases', default=False,
help="Use current directory and MYPYPATH to determine module names of files passed",
group=code_group)
add_invertible_flag(
'--fast-module-lookup', default=False,
help=argparse.SUPPRESS,
group=code_group)
code_group.add_argument(
"--exclude",
action="append",
Expand Down
114 changes: 112 additions & 2 deletions mypy/modulefinder.py
Expand Up @@ -23,6 +23,7 @@
from typing_extensions import Final, TypeAlias as _TypeAlias

from mypy.fscache import FileSystemCache
from mypy.nodes import MypyFile
from mypy.options import Options
from mypy.stubinfo import is_legacy_bundled_package
from mypy import pyinfo
Expand Down Expand Up @@ -126,6 +127,33 @@ def __repr__(self) -> str:
self.base_dir)


class BuildSourceSet:
"""Helper to efficiently test a file's membership in a set of build sources."""

def __init__(self, sources: List[BuildSource]) -> None:
self.source_text_present = False
self.source_modules = {} # type: Dict[str, str]
self.source_paths = set() # type: Set[str]

for source in sources:
if source.text is not None:
self.source_text_present = True
if source.path:
self.source_paths.add(source.path)
if source.module:
self.source_modules[source.module] = source.path or ''

def is_source(self, file: MypyFile) -> bool:
if file.path and file.path in self.source_paths:
return True
elif file._fullname in self.source_modules:
return True
elif self.source_text_present:
return True
else:
return False


class FindModuleCache:
"""Module finder with integrated cache.
Expand All @@ -141,8 +169,10 @@ def __init__(self,
search_paths: SearchPaths,
fscache: Optional[FileSystemCache],
options: Optional[Options],
stdlib_py_versions: Optional[StdlibVersions] = None) -> None:
stdlib_py_versions: Optional[StdlibVersions] = None,
source_set: Optional[BuildSourceSet] = None) -> None:
self.search_paths = search_paths
self.source_set = source_set
self.fscache = fscache or FileSystemCache()
# Cache for get_toplevel_possibilities:
# search_paths -> (toplevel_id -> list(package_dirs))
Expand All @@ -164,6 +194,53 @@ def clear(self) -> None:
self.initial_components.clear()
self.ns_ancestors.clear()

def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
"""Fast path to find modules by looking through the input sources
This is only used when --fast-module-lookup is passed on the command line."""
if not self.source_set:
return None

p = self.source_set.source_modules.get(id, None)
if p and self.fscache.isfile(p):
# We need to make sure we still have __init__.py all the way up
# otherwise we might have false positives compared to slow path
# in case of deletion of init files, which is covered by some tests.
# TODO: are there some combination of flags in which this check should be skipped?
d = os.path.dirname(p)
for _ in range(id.count('.')):
if not any(self.fscache.isfile(os.path.join(d, '__init__' + x))
for x in PYTHON_EXTENSIONS):
return None
d = os.path.dirname(d)
return p

idx = id.rfind('.')
if idx != -1:
# When we're looking for foo.bar.baz and can't find a matching module
# in the source set, look up for a foo.bar module.
parent = self.find_module_via_source_set(id[:idx])
if parent is None or not isinstance(parent, str):
return None

basename, ext = os.path.splitext(parent)
if (not any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS)
and (ext in PYTHON_EXTENSIONS and not self.fscache.isdir(basename))):
# If we do find such a *module* (and crucially, we don't want a package,
# hence the filtering out of __init__ files, and checking for the presence
# of a folder with a matching name), then we can be pretty confident that
# 'baz' will either be a top-level variable in foo.bar, or will not exist.
#
# Either way, spelunking in other search paths for another 'foo.bar.baz'
# module should be avoided because:
# 1. in the unlikely event that one were found, it's highly likely that
# it would be unrelated to the source being typechecked and therefore
# more likely to lead to erroneous results
# 2. as described in _find_module, in some cases the search itself could
# potentially waste significant amounts of time
return ModuleNotFoundReason.NOT_FOUND
return None

def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
"""Find which elements of a lib_path have the directory a module needs to exist.
Expand Down Expand Up @@ -229,7 +306,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
elif top_level in self.stdlib_py_versions:
use_typeshed = self._typeshed_has_version(top_level)
self.results[id] = self._find_module(id, use_typeshed)
if (not fast_path
if (not (fast_path or (self.options is not None and self.options.fast_module_lookup))
and self.results[id] is ModuleNotFoundReason.NOT_FOUND
and self._can_find_module_in_parent_dir(id)):
self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY
Expand Down Expand Up @@ -295,6 +372,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult:
fscache = self.fscache

# Fast path for any modules in the current source set.
# This is particularly important when there are a large number of search
# paths which share the first (few) component(s) due to the use of namespace
# packages, for instance:
# foo/
# company/
# __init__.py
# foo/
# bar/
# company/
# __init__.py
# bar/
# baz/
# company/
# __init__.py
# baz/
#
# mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
# and computes [foo, bar, baz, ...] as the module search path.
#
# This would result in O(n) search for every import of company.*, leading to
# O(n**2) behavior in load_graph as such imports are unsurprisingly present
# at least once, and usually many more times than that, in each and every file
# being parsed.
#
# Thankfully, such cases are efficiently handled by looking up the module path
# via BuildSourceSet.
p = (self.find_module_via_source_set(id)
if (self.options is not None and self.options.fast_module_lookup)
else None)
if p:
return p

# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
# that only once and cache it for when we look for modules like 'foo.bar.blah'
Expand Down
2 changes: 2 additions & 0 deletions mypy/options.py
Expand Up @@ -293,6 +293,8 @@ def __init__(self) -> None:
self.cache_map: Dict[str, Tuple[str, str]] = {}
# Don't properly free objects on exit, just kill the current process.
self.fast_exit = True
# fast path for finding modules from source set
self.fast_module_lookup = False
# Used to transform source code before parsing if not None
# TODO: Make the type precise (AnyStr -> AnyStr)
self.transform_source: Optional[Callable[[Any], Any]] = None
Expand Down
1 change: 1 addition & 0 deletions mypy/test/testcheck.py
Expand Up @@ -46,6 +46,7 @@
'check-multiple-inheritance.test',
'check-super.test',
'check-modules.test',
'check-modules-fast.test',
'check-typevar-values.test',
'check-unsupported.test',
'check-unreachable-code.test',
Expand Down

0 comments on commit a6166b2

Please sign in to comment.