fix: use glob matching instead of fnmatch. #1407

I didn't understand that fnmatch considers the entire string to be a filename, even if it has slashes in it. This led to incorrect matching. Now we use our own implementation of glob matching to get the correct behavior.
nedbat · Oct 30, 2022 · ec6205a · webknjaz · Jul 11, 2023 · ec6205a
1 parent b3a1d97
commit ec6205a
Show file tree

Hide file tree

Showing 10 changed files with 284 additions and 117 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -20,6 +20,10 @@ development at the same time, such as 4.5.x and 5.0.
 Unreleased
 ----------
 
+- Fixes to file pattern matching, fixing `issue 1407`_. Previously, ``*`` would
+  incorrectly match directory separators, making precise matching difficult.
+  This is now fixed.
+
 - Improvements to combining data files when using the
   :ref:`config_run_relative_files` setting:
 
@@ -39,6 +43,7 @@ Unreleased
   implementations other than CPython or PyPy (`issue 1474`_).
 
 .. _issue 991: https://github.com/nedbat/coveragepy/issues/991
+.. _issue 1407: https://github.com/nedbat/coveragepy/issues/1407
 .. _issue 1474: https://github.com/nedbat/coveragepy/issues/1474
 .. _issue 1481: https://github.com/nedbat/coveragepy/issues/1481
 

diff --git a/coverage/files.py b/coverage/files.py
@@ -3,7 +3,6 @@
 
 """File wrangling."""
 
-import fnmatch
 import hashlib
 import ntpath
 import os
@@ -172,7 +171,7 @@ def isabs_anywhere(filename):
 
 
 def prep_patterns(patterns):
-    """Prepare the file patterns for use in a `FnmatchMatcher`.
+    """Prepare the file patterns for use in a `GlobMatcher`.
 
     If a pattern starts with a wildcard, it is used as a pattern
     as-is.  If it does not start with a wildcard, then it is made
@@ -253,15 +252,15 @@ def match(self, module_name):
         return False
 
 
-class FnmatchMatcher:
+class GlobMatcher:
     """A matcher for files by file name pattern."""
     def __init__(self, pats, name="unknown"):
         self.pats = list(pats)
-        self.re = fnmatches_to_regex(self.pats, case_insensitive=env.WINDOWS)
+        self.re = globs_to_regex(self.pats, case_insensitive=env.WINDOWS)
         self.name = name
 
     def __repr__(self):
-        return f"<FnmatchMatcher {self.name} {self.pats!r}>"
+        return f"<GlobMatcher {self.name} {self.pats!r}>"
 
     def info(self):
         """A list of strings for displaying when dumping state."""
@@ -282,37 +281,69 @@ def sep(s):
     return the_sep
 
 
-def fnmatches_to_regex(patterns, case_insensitive=False, partial=False):
-    """Convert fnmatch patterns to a compiled regex that matches any of them.
+# Tokenizer for _glob_to_regex.
+# None as a sub means disallowed.
+G2RX_TOKENS = [(re.compile(rx), sub) for rx, sub in [
+    (r"\*\*\*+", None),             # Can't have ***
+    (r"[^/]+\*\*+", None),          # Can't have x**
+    (r"\*\*+[^/]+", None),          # Can't have **x
+    (r"\*\*/\*\*", None),           # Can't have **/**
+    (r"^\*+/", r"(.*[/\\\\])?"),    # ^*/ matches any prefix-slash, or nothing.
+    (r"/\*+$", r"[/\\\\].*"),       # /*$ matches any slash-suffix.
+    (r"\*\*/", r"(.*[/\\\\])?"),    # **/ matches any subdirs, including none
+    (r"/", r"[/\\\\]"),             # / matches either slash or backslash
+    (r"\*", r"[^/\\\\]*"),          # * matches any number of non slash-likes
+    (r"\?", r"[^/\\\\]"),           # ? matches one non slash-like
+    (r"\[.*?\]", r"\g<0>"),         # [a-f] matches [a-f]
+    (r"[a-zA-Z0-9_-]+", r"\g<0>"),  # word chars match themselves
+    (r"[\[\]+{}]", None),           # Can't have regex special chars
+    (r".", r"\\\g<0>"),             # Anything else is escaped to be safe
+]]
+
+def _glob_to_regex(pattern):
+    """Convert a file-path glob pattern into a regex."""
+    # Turn all backslashes into slashes to simplify the tokenizer.
+    pattern = pattern.replace("\\", "/")
+    if "/" not in pattern:
+        pattern = "**/" + pattern
+    path_rx = []
+    pos = 0
+    while pos < len(pattern):
+        for rx, sub in G2RX_TOKENS:
+            m = rx.match(pattern, pos=pos)
+            if m:
+                if sub is None:
+                    raise ConfigError(f"File pattern can't include {m[0]!r}")
+                path_rx.append(m.expand(sub))
+                pos = m.end()
+                break
+    return "".join(path_rx)
+
+
+def globs_to_regex(patterns, case_insensitive=False, partial=False):
+    """Convert glob patterns to a compiled regex that matches any of them.
 
     Slashes are always converted to match either slash or backslash, for
     Windows support, even when running elsewhere.
 
+    If the pattern has no slash or backslash, then it is interpreted as
+    matching a file name anywhere it appears in the tree.  Otherwise, the glob
+    pattern must match the whole file path.
+
     If `partial` is true, then the pattern will match if the target string
     starts with the pattern. Otherwise, it must match the entire string.
 
     Returns: a compiled regex object.  Use the .match method to compare target
     strings.
 
     """
-    regexes = (fnmatch.translate(pattern) for pattern in patterns)
-    # */ at the start should also match nothing.
-    regexes = (re.sub(r"^\(\?s:\.\*(\\\\|/)", r"(?s:^(.*\1)?", regex) for regex in regexes)
-    # Be agnostic: / can mean backslash or slash.
-    regexes = (re.sub(r"/", r"[\\\\/]", regex) for regex in regexes)
-
-    if partial:
-        # fnmatch always adds a \Z to match the whole string, which we don't
-        # want, so we remove the \Z.  While removing it, we only replace \Z if
-        # followed by paren (introducing flags), or at end, to keep from
-        # destroying a literal \Z in the pattern.
-        regexes = (re.sub(r'\\Z(\(\?|$)', r'\1', regex) for regex in regexes)
-
     flags = 0
     if case_insensitive:
         flags |= re.IGNORECASE
-    compiled = re.compile(join_regex(regexes), flags=flags)
-
+    rx = join_regex(map(_glob_to_regex, patterns))
+    if not partial:
+        rx = rf"(?:{rx})\Z"
+    compiled = re.compile(rx, flags=flags)
     return compiled
 
 
@@ -342,7 +373,7 @@ def pprint(self):
     def add(self, pattern, result):
         """Add the `pattern`/`result` pair to the list of aliases.
 
-        `pattern` is an `fnmatch`-style pattern.  `result` is a simple
+        `pattern` is a `glob`-style pattern.  `result` is a simple
         string.  When mapping paths, if a path starts with a match against
         `pattern`, then that match is replaced with `result`.  This models
         isomorphic source trees being rooted at different places on two
@@ -370,7 +401,7 @@ def add(self, pattern, result):
             pattern += pattern_sep
 
         # Make a regex from the pattern.
-        regex = fnmatches_to_regex([pattern], case_insensitive=True, partial=True)
+        regex = globs_to_regex([pattern], case_insensitive=True, partial=True)
 
         # Normalize the result: it must end with a path separator.
         result_sep = sep(result)

diff --git a/coverage/inorout.py b/coverage/inorout.py
@@ -16,7 +16,7 @@
 from coverage import env
 from coverage.disposition import FileDisposition, disposition_init
 from coverage.exceptions import CoverageException, PluginError
-from coverage.files import TreeMatcher, FnmatchMatcher, ModuleMatcher
+from coverage.files import TreeMatcher, GlobMatcher, ModuleMatcher
 from coverage.files import prep_patterns, find_python_files, canonical_filename
 from coverage.misc import sys_modules_saved
 from coverage.python import source_for_file, source_for_morf
@@ -260,10 +260,10 @@ def debug(msg):
                 self.pylib_match = TreeMatcher(self.pylib_paths, "pylib")
                 debug(f"Python stdlib matching: {self.pylib_match!r}")
         if self.include:
-            self.include_match = FnmatchMatcher(self.include, "include")
+            self.include_match = GlobMatcher(self.include, "include")
             debug(f"Include matching: {self.include_match!r}")
         if self.omit:
-            self.omit_match = FnmatchMatcher(self.omit, "omit")
+            self.omit_match = GlobMatcher(self.omit, "omit")
             debug(f"Omit matching: {self.omit_match!r}")
 
         self.cover_match = TreeMatcher(self.cover_paths, "coverage")

diff --git a/coverage/report.py b/coverage/report.py
@@ -6,7 +6,7 @@
 import sys
 
 from coverage.exceptions import CoverageException, NoDataError, NotPython
-from coverage.files import prep_patterns, FnmatchMatcher
+from coverage.files import prep_patterns, GlobMatcher
 from coverage.misc import ensure_dir_for_file, file_be_gone
 
 
@@ -57,11 +57,11 @@ def get_analysis_to_report(coverage, morfs):
     config = coverage.config
 
     if config.report_include:
-        matcher = FnmatchMatcher(prep_patterns(config.report_include), "report_include")
+        matcher = GlobMatcher(prep_patterns(config.report_include), "report_include")
         file_reporters = [fr for fr in file_reporters if matcher.match(fr.filename)]
 
     if config.report_omit:
-        matcher = FnmatchMatcher(prep_patterns(config.report_omit), "report_omit")
+        matcher = GlobMatcher(prep_patterns(config.report_omit), "report_omit")
         file_reporters = [fr for fr in file_reporters if not matcher.match(fr.filename)]
 
     if not file_reporters:

diff --git a/doc/cmd.rst b/doc/cmd.rst
@@ -342,7 +342,7 @@ single directory, and use the **combine** command to combine them into one
 
     $ coverage combine
 
-You can also name directories or files on the command line::
+You can also name directories or files to be combined on the command line::
 
     $ coverage combine data1.dat windows_data_files/
 
@@ -364,22 +364,6 @@ An existing combined data file is ignored and re-written. If you want to use
 runs, use the ``--append`` switch on the **combine** command.  This behavior
 was the default before version 4.2.
 
-To combine data for a source file, coverage has to find its data in each of the
-data files.  Different test runs may run the same source file from different
-locations. For example, different operating systems will use different paths
-for the same file, or perhaps each Python version is run from a different
-subdirectory.  Coverage needs to know that different file paths are actually
-the same source file for reporting purposes.
-
-You can tell coverage.py how different source locations relate with a
-``[paths]`` section in your configuration file (see :ref:`config_paths`).
-It might be more convenient to use the ``[run] relative_files``
-setting to store relative file paths (see :ref:`relative_files
-<config_run_relative_files>`).
-
-If data isn't combining properly, you can see details about the inner workings
-with ``--debug=pathmap``.
-
 If any of the data files can't be read, coverage.py will print a warning
 indicating the file and the problem.
 
@@ -414,6 +398,28 @@ want to keep those files, use the ``--keep`` command-line option.
 .. [[[end]]] (checksum: 0bdd83f647ee76363c955bedd9ddf749)
 
 
+.. _cmd_combine_remapping:
+
+Re-mapping paths
+................
+
+To combine data for a source file, coverage has to find its data in each of the
+data files.  Different test runs may run the same source file from different
+locations. For example, different operating systems will use different paths
+for the same file, or perhaps each Python version is run from a different
+subdirectory.  Coverage needs to know that different file paths are actually
+the same source file for reporting purposes.
+
+You can tell coverage.py how different source locations relate with a
+``[paths]`` section in your configuration file (see :ref:`config_paths`).
+It might be more convenient to use the ``[run] relative_files``
+setting to store relative file paths (see :ref:`relative_files
+<config_run_relative_files>`).
+
+If data isn't combining properly, you can see details about the inner workings
+with ``--debug=pathmap``.
+
+
 .. _cmd_erase:
 
 Erase data: ``coverage erase``

diff --git a/doc/config.rst b/doc/config.rst
@@ -357,7 +357,7 @@ The first list that has a match will be used.
 The ``--debug=pathmap`` option can be used to log details of the re-mapping of
 paths.  See :ref:`the --debug option <cmd_run_debug>`.
 
-See :ref:`cmd_combine` for more information.
+See :ref:`cmd_combine_remapping` and :ref:`source_glob` for more information.
 
 
 .. _config_report:

diff --git a/doc/source.rst b/doc/source.rst
@@ -59,10 +59,10 @@ removed from the set.
 
 .. highlight:: ini
 
-The ``include`` and ``omit`` file name patterns follow typical shell syntax:
-``*`` matches any number of characters and ``?`` matches a single character.
-Patterns that start with a wildcard character are used as-is, other patterns
-are interpreted relative to the current directory::
+The ``include`` and ``omit`` file name patterns follow common shell syntax,
+described below in :ref:`source_glob`.  Patterns that start with a wildcard
+character are used as-is; other patterns are interpreted relative to the
+current directory::
 
     [run]
     omit =
@@ -77,7 +77,7 @@ The ``source``, ``include``, and ``omit`` values all work together to determine
 the source that will be measured.
 
 If both ``source`` and ``include`` are set, the ``include`` value is ignored
-and a warning is printed on the standard output.
+and a warning is issued.
 
 
 .. _source_reporting:
@@ -103,3 +103,22 @@ reporting.
 
 Note that these are ways of specifying files to measure.  You can also exclude
 individual source lines.  See :ref:`excluding` for details.
+
+
+.. _source_glob:
+
+File patterns
+-------------
+
+File path patterns are used for include and omit, and for remapping paths when
+combining data.  They follow common shell syntax:
+
+- ``*`` matches any number of file name characters, not including the directory
+  separator.
+
+- ``?`` matches a single file name character.
+
+- ``**`` matches any number of nested directory names, including none.
+
+- Both ``/`` and ``\`` will match either a slash or a backslash, to make
+  cross-platform matching easier.
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -71,7 +71,6 @@ def test_unexecuted_file(self):
         assert missing == [1]
 
     def test_filenames(self):
-
         self.make_file("mymain.py", """\
             import mymod
             a = 1