fix: use glob matching instead of fnmatch. #1407

I didn't understand that fnmatch considers the entire string to be a filename, even if it has slashes in it. This led to incorrect matching. Now we use our own implementation of glob matching to get the correct behavior.
nedbat · Oct 30, 2022 · f93c620 · f93c620
1 parent b3a1d97
commit f93c620
Show file tree

Hide file tree

Showing 6 changed files with 231 additions and 94 deletions.
diff --git a/coverage/files.py b/coverage/files.py
@@ -3,7 +3,6 @@
 
 """File wrangling."""
 
-import fnmatch
 import hashlib
 import ntpath
 import os
@@ -172,7 +171,7 @@ def isabs_anywhere(filename):
 
 
 def prep_patterns(patterns):
-    """Prepare the file patterns for use in a `FnmatchMatcher`.
+    """Prepare the file patterns for use in a `GlobMatcher`.
 
     If a pattern starts with a wildcard, it is used as a pattern
     as-is.  If it does not start with a wildcard, then it is made
@@ -253,15 +252,15 @@ def match(self, module_name):
         return False
 
 
-class FnmatchMatcher:
+class GlobMatcher:
     """A matcher for files by file name pattern."""
     def __init__(self, pats, name="unknown"):
         self.pats = list(pats)
-        self.re = fnmatches_to_regex(self.pats, case_insensitive=env.WINDOWS)
+        self.re = globs_to_regex(self.pats, case_insensitive=env.WINDOWS)
         self.name = name
 
     def __repr__(self):
-        return f"<FnmatchMatcher {self.name} {self.pats!r}>"
+        return f"<GlobMatcher {self.name} {self.pats!r}>"
 
     def info(self):
         """A list of strings for displaying when dumping state."""
@@ -282,37 +281,69 @@ def sep(s):
     return the_sep
 
 
-def fnmatches_to_regex(patterns, case_insensitive=False, partial=False):
-    """Convert fnmatch patterns to a compiled regex that matches any of them.
+# Tokenizer for _glob_to_regex.
+# None as a sub means disallowed.
+G2RX_TOKENS = [(re.compile(rx), sub) for rx, sub in [
+    (r"\*\*\*+", None),             # Can't have ***
+    (r"[^/]+\*\*+", None),          # Can't have x**
+    (r"\*\*+[^/]+", None),          # Can't have **x
+    (r"\*\*/\*\*", None),           # Can't have **/**
+    (r"^\*+/", r"(.*[/\\\\])?"),    # ^*/ matches any prefix-slash, or nothing.
+    (r"/\*+$", r"[/\\\\].*"),       # /*$ matches any slash-suffix.
+    (r"\*\*/", r"(.*[/\\\\])?"),    # **/ matches any subdirs, including none
+    (r"/", r"[/\\\\]"),             # / matches either slash or backslash
+    (r"\*", r"[^/\\\\]*"),          # * matches any number of non slash-likes
+    (r"\?", r"[^/\\\\]"),           # ? matches one non slash-like
+    (r"\[.*?\]", r"\g<0>"),         # [a-f] matches [a-f]
+    (r"[a-zA-Z0-9_-]+", r"\g<0>"),  # word chars match themselves
+    (r"[\[\]+{}]", None),           # Can't have regex special chars
+    (r".", r"\\\g<0>"),             # Anything else is escaped to be safe
+]]
+
+def _glob_to_regex(pattern):
+    """Convert a file-path glob pattern into a regex."""
+    # Turn all backslashes into slashes to simplify the tokenizer.
+    pattern = pattern.replace("\\", "/")
+    if "/" not in pattern:
+        pattern = "**/" + pattern
+    path_rx = []
+    pos = 0
+    while pos < len(pattern):
+        for rx, sub in G2RX_TOKENS:
+            m = rx.match(pattern, pos=pos)
+            if m:
+                if sub is None:
+                    raise ConfigError(f"File pattern can't include {m[0]!r}")
+                path_rx.append(m.expand(sub))
+                pos = m.end()
+                break
+    return "".join(path_rx)
+
+
+def globs_to_regex(patterns, case_insensitive=False, partial=False):
+    """Convert glob patterns to a compiled regex that matches any of them.
 
     Slashes are always converted to match either slash or backslash, for
     Windows support, even when running elsewhere.
 
+    If the pattern has no slash or backslash, then it is interpreted as
+    matching a file name anywhere it appears in the tree.  Otherwise, the glob
+    pattern must match the whole file path.
+
     If `partial` is true, then the pattern will match if the target string
     starts with the pattern. Otherwise, it must match the entire string.
 
     Returns: a compiled regex object.  Use the .match method to compare target
     strings.
 
     """
-    regexes = (fnmatch.translate(pattern) for pattern in patterns)
-    # */ at the start should also match nothing.
-    regexes = (re.sub(r"^\(\?s:\.\*(\\\\|/)", r"(?s:^(.*\1)?", regex) for regex in regexes)
-    # Be agnostic: / can mean backslash or slash.
-    regexes = (re.sub(r"/", r"[\\\\/]", regex) for regex in regexes)
-
-    if partial:
-        # fnmatch always adds a \Z to match the whole string, which we don't
-        # want, so we remove the \Z.  While removing it, we only replace \Z if
-        # followed by paren (introducing flags), or at end, to keep from
-        # destroying a literal \Z in the pattern.
-        regexes = (re.sub(r'\\Z(\(\?|$)', r'\1', regex) for regex in regexes)
-
     flags = 0
     if case_insensitive:
         flags |= re.IGNORECASE
-    compiled = re.compile(join_regex(regexes), flags=flags)
-
+    rx = join_regex(map(_glob_to_regex, patterns))
+    if not partial:
+        rx = rf"(?:{rx})\Z"
+    compiled = re.compile(rx, flags=flags)
     return compiled
 
 
@@ -342,7 +373,7 @@ def pprint(self):
     def add(self, pattern, result):
         """Add the `pattern`/`result` pair to the list of aliases.
 
-        `pattern` is an `fnmatch`-style pattern.  `result` is a simple
+        `pattern` is an `glob`-style pattern.  `result` is a simple
         string.  When mapping paths, if a path starts with a match against
         `pattern`, then that match is replaced with `result`.  This models
         isomorphic source trees being rooted at different places on two
@@ -370,7 +401,7 @@ def add(self, pattern, result):
             pattern += pattern_sep
 
         # Make a regex from the pattern.
-        regex = fnmatches_to_regex([pattern], case_insensitive=True, partial=True)
+        regex = globs_to_regex([pattern], case_insensitive=True, partial=True)
 
         # Normalize the result: it must end with a path separator.
         result_sep = sep(result)

diff --git a/coverage/inorout.py b/coverage/inorout.py
@@ -16,7 +16,7 @@
 from coverage import env
 from coverage.disposition import FileDisposition, disposition_init
 from coverage.exceptions import CoverageException, PluginError
-from coverage.files import TreeMatcher, FnmatchMatcher, ModuleMatcher
+from coverage.files import TreeMatcher, GlobMatcher, ModuleMatcher
 from coverage.files import prep_patterns, find_python_files, canonical_filename
 from coverage.misc import sys_modules_saved
 from coverage.python import source_for_file, source_for_morf
@@ -260,10 +260,10 @@ def debug(msg):
                 self.pylib_match = TreeMatcher(self.pylib_paths, "pylib")
                 debug(f"Python stdlib matching: {self.pylib_match!r}")
         if self.include:
-            self.include_match = FnmatchMatcher(self.include, "include")
+            self.include_match = GlobMatcher(self.include, "include")
             debug(f"Include matching: {self.include_match!r}")
         if self.omit:
-            self.omit_match = FnmatchMatcher(self.omit, "omit")
+            self.omit_match = GlobMatcher(self.omit, "omit")
             debug(f"Omit matching: {self.omit_match!r}")
 
         self.cover_match = TreeMatcher(self.cover_paths, "coverage")

diff --git a/coverage/report.py b/coverage/report.py
@@ -6,7 +6,7 @@
 import sys
 
 from coverage.exceptions import CoverageException, NoDataError, NotPython
-from coverage.files import prep_patterns, FnmatchMatcher
+from coverage.files import prep_patterns, GlobMatcher
 from coverage.misc import ensure_dir_for_file, file_be_gone
 
 
@@ -57,11 +57,11 @@ def get_analysis_to_report(coverage, morfs):
     config = coverage.config
 
     if config.report_include:
-        matcher = FnmatchMatcher(prep_patterns(config.report_include), "report_include")
+        matcher = GlobMatcher(prep_patterns(config.report_include), "report_include")
         file_reporters = [fr for fr in file_reporters if matcher.match(fr.filename)]
 
     if config.report_omit:
-        matcher = FnmatchMatcher(prep_patterns(config.report_omit), "report_omit")
+        matcher = GlobMatcher(prep_patterns(config.report_omit), "report_omit")
         file_reporters = [fr for fr in file_reporters if not matcher.match(fr.filename)]
 
     if not file_reporters:

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -71,7 +71,6 @@ def test_unexecuted_file(self):
         assert missing == [1]
 
     def test_filenames(self):
-
         self.make_file("mymain.py", """\
             import mymod
             a = 1