Skip to content

Commit

Permalink
fix: use glob matching instead of fnmatch. #1407
Browse files Browse the repository at this point in the history
I didn't understand that fnmatch considers the entire string to be a
filename, even if it has slashes in it. This led to incorrect matching.
Now we use our own implementation of glob matching to get the correct
behavior.
  • Loading branch information
nedbat committed Oct 30, 2022
1 parent b3a1d97 commit ec6205a
Show file tree
Hide file tree
Showing 10 changed files with 284 additions and 117 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
Expand Up @@ -20,6 +20,10 @@ development at the same time, such as 4.5.x and 5.0.
Unreleased
----------

- Fixes to file pattern matching, fixing `issue 1407`_. Previously, `*` would
incorrectly match directory separators, making precise matching difficult.
This is now fixed.

- Improvements to combining data files when using the
:ref:`config_run_relative_files` setting:

Expand All @@ -39,6 +43,7 @@ Unreleased
implementations other than CPython or PyPy (`issue 1474`_).

.. _issue 991: https://github.com/nedbat/coveragepy/issues/991
.. _issue 1407: https://github.com/nedbat/coveragepy/issues/1407
.. _issue 1474: https://github.com/nedbat/coveragepy/issues/1474
.. _issue 1481: https://github.com/nedbat/coveragepy/issues/1481

Expand Down
79 changes: 55 additions & 24 deletions coverage/files.py
Expand Up @@ -3,7 +3,6 @@

"""File wrangling."""

import fnmatch
import hashlib
import ntpath
import os
Expand Down Expand Up @@ -172,7 +171,7 @@ def isabs_anywhere(filename):


def prep_patterns(patterns):
"""Prepare the file patterns for use in a `FnmatchMatcher`.
"""Prepare the file patterns for use in a `GlobMatcher`.
If a pattern starts with a wildcard, it is used as a pattern
as-is. If it does not start with a wildcard, then it is made
Expand Down Expand Up @@ -253,15 +252,15 @@ def match(self, module_name):
return False


class FnmatchMatcher:
class GlobMatcher:
"""A matcher for files by file name pattern."""
def __init__(self, pats, name="unknown"):
self.pats = list(pats)
self.re = fnmatches_to_regex(self.pats, case_insensitive=env.WINDOWS)
self.re = globs_to_regex(self.pats, case_insensitive=env.WINDOWS)
self.name = name

def __repr__(self):
return f"<FnmatchMatcher {self.name} {self.pats!r}>"
return f"<GlobMatcher {self.name} {self.pats!r}>"

def info(self):
"""A list of strings for displaying when dumping state."""
Expand All @@ -282,37 +281,69 @@ def sep(s):
return the_sep


def fnmatches_to_regex(patterns, case_insensitive=False, partial=False):
"""Convert fnmatch patterns to a compiled regex that matches any of them.
# Tokenizer for _glob_to_regex.
# None as a sub means disallowed.
G2RX_TOKENS = [(re.compile(rx), sub) for rx, sub in [
(r"\*\*\*+", None), # Can't have ***
(r"[^/]+\*\*+", None), # Can't have x**
(r"\*\*+[^/]+", None), # Can't have **x
(r"\*\*/\*\*", None), # Can't have **/**
(r"^\*+/", r"(.*[/\\\\])?"), # ^*/ matches any prefix-slash, or nothing.
(r"/\*+$", r"[/\\\\].*"), # /*$ matches any slash-suffix.

This comment has been minimized.

Copy link
@webknjaz

webknjaz Jul 11, 2023

Contributor

@nedbat is this one really needed? Any specific reason not to want to require /** at the end of a pattern? It seems counterintuitive that /thing/* is actually /thing/**, with the matcher change.

(r"\*\*/", r"(.*[/\\\\])?"), # **/ matches any subdirs, including none
(r"/", r"[/\\\\]"), # / matches either slash or backslash
(r"\*", r"[^/\\\\]*"), # * matches any number of non slash-likes
(r"\?", r"[^/\\\\]"), # ? matches one non slash-like
(r"\[.*?\]", r"\g<0>"), # [a-f] matches [a-f]
(r"[a-zA-Z0-9_-]+", r"\g<0>"), # word chars match themselves
(r"[\[\]+{}]", None), # Can't have regex special chars
(r".", r"\\\g<0>"), # Anything else is escaped to be safe
]]

def _glob_to_regex(pattern):
"""Convert a file-path glob pattern into a regex."""
# Turn all backslashes into slashes to simplify the tokenizer.
pattern = pattern.replace("\\", "/")
if "/" not in pattern:
pattern = "**/" + pattern
path_rx = []
pos = 0
while pos < len(pattern):
for rx, sub in G2RX_TOKENS:
m = rx.match(pattern, pos=pos)
if m:
if sub is None:
raise ConfigError(f"File pattern can't include {m[0]!r}")
path_rx.append(m.expand(sub))
pos = m.end()
break
return "".join(path_rx)


def globs_to_regex(patterns, case_insensitive=False, partial=False):
"""Convert glob patterns to a compiled regex that matches any of them.
Slashes are always converted to match either slash or backslash, for
Windows support, even when running elsewhere.
If the pattern has no slash or backslash, then it is interpreted as
matching a file name anywhere it appears in the tree. Otherwise, the glob
pattern must match the whole file path.
If `partial` is true, then the pattern will match if the target string
starts with the pattern. Otherwise, it must match the entire string.
Returns: a compiled regex object. Use the .match method to compare target
strings.
"""
regexes = (fnmatch.translate(pattern) for pattern in patterns)
# */ at the start should also match nothing.
regexes = (re.sub(r"^\(\?s:\.\*(\\\\|/)", r"(?s:^(.*\1)?", regex) for regex in regexes)
# Be agnostic: / can mean backslash or slash.
regexes = (re.sub(r"/", r"[\\\\/]", regex) for regex in regexes)

if partial:
# fnmatch always adds a \Z to match the whole string, which we don't
# want, so we remove the \Z. While removing it, we only replace \Z if
# followed by paren (introducing flags), or at end, to keep from
# destroying a literal \Z in the pattern.
regexes = (re.sub(r'\\Z(\(\?|$)', r'\1', regex) for regex in regexes)

flags = 0
if case_insensitive:
flags |= re.IGNORECASE
compiled = re.compile(join_regex(regexes), flags=flags)

rx = join_regex(map(_glob_to_regex, patterns))
if not partial:
rx = rf"(?:{rx})\Z"
compiled = re.compile(rx, flags=flags)
return compiled


Expand Down Expand Up @@ -342,7 +373,7 @@ def pprint(self):
def add(self, pattern, result):
"""Add the `pattern`/`result` pair to the list of aliases.
`pattern` is an `fnmatch`-style pattern. `result` is a simple
`pattern` is an `glob`-style pattern. `result` is a simple
string. When mapping paths, if a path starts with a match against
`pattern`, then that match is replaced with `result`. This models
isomorphic source trees being rooted at different places on two
Expand Down Expand Up @@ -370,7 +401,7 @@ def add(self, pattern, result):
pattern += pattern_sep

# Make a regex from the pattern.
regex = fnmatches_to_regex([pattern], case_insensitive=True, partial=True)
regex = globs_to_regex([pattern], case_insensitive=True, partial=True)

# Normalize the result: it must end with a path separator.
result_sep = sep(result)
Expand Down
6 changes: 3 additions & 3 deletions coverage/inorout.py
Expand Up @@ -16,7 +16,7 @@
from coverage import env
from coverage.disposition import FileDisposition, disposition_init
from coverage.exceptions import CoverageException, PluginError
from coverage.files import TreeMatcher, FnmatchMatcher, ModuleMatcher
from coverage.files import TreeMatcher, GlobMatcher, ModuleMatcher
from coverage.files import prep_patterns, find_python_files, canonical_filename
from coverage.misc import sys_modules_saved
from coverage.python import source_for_file, source_for_morf
Expand Down Expand Up @@ -260,10 +260,10 @@ def debug(msg):
self.pylib_match = TreeMatcher(self.pylib_paths, "pylib")
debug(f"Python stdlib matching: {self.pylib_match!r}")
if self.include:
self.include_match = FnmatchMatcher(self.include, "include")
self.include_match = GlobMatcher(self.include, "include")
debug(f"Include matching: {self.include_match!r}")
if self.omit:
self.omit_match = FnmatchMatcher(self.omit, "omit")
self.omit_match = GlobMatcher(self.omit, "omit")
debug(f"Omit matching: {self.omit_match!r}")

self.cover_match = TreeMatcher(self.cover_paths, "coverage")
Expand Down
6 changes: 3 additions & 3 deletions coverage/report.py
Expand Up @@ -6,7 +6,7 @@
import sys

from coverage.exceptions import CoverageException, NoDataError, NotPython
from coverage.files import prep_patterns, FnmatchMatcher
from coverage.files import prep_patterns, GlobMatcher
from coverage.misc import ensure_dir_for_file, file_be_gone


Expand Down Expand Up @@ -57,11 +57,11 @@ def get_analysis_to_report(coverage, morfs):
config = coverage.config

if config.report_include:
matcher = FnmatchMatcher(prep_patterns(config.report_include), "report_include")
matcher = GlobMatcher(prep_patterns(config.report_include), "report_include")
file_reporters = [fr for fr in file_reporters if matcher.match(fr.filename)]

if config.report_omit:
matcher = FnmatchMatcher(prep_patterns(config.report_omit), "report_omit")
matcher = GlobMatcher(prep_patterns(config.report_omit), "report_omit")
file_reporters = [fr for fr in file_reporters if not matcher.match(fr.filename)]

if not file_reporters:
Expand Down
40 changes: 23 additions & 17 deletions doc/cmd.rst
Expand Up @@ -342,7 +342,7 @@ single directory, and use the **combine** command to combine them into one

$ coverage combine

You can also name directories or files on the command line::
You can also name directories or files to be combined on the command line::

$ coverage combine data1.dat windows_data_files/

Expand All @@ -364,22 +364,6 @@ An existing combined data file is ignored and re-written. If you want to use
runs, use the ``--append`` switch on the **combine** command. This behavior
was the default before version 4.2.

To combine data for a source file, coverage has to find its data in each of the
data files. Different test runs may run the same source file from different
locations. For example, different operating systems will use different paths
for the same file, or perhaps each Python version is run from a different
subdirectory. Coverage needs to know that different file paths are actually
the same source file for reporting purposes.

You can tell coverage.py how different source locations relate with a
``[paths]`` section in your configuration file (see :ref:`config_paths`).
It might be more convenient to use the ``[run] relative_files``
setting to store relative file paths (see :ref:`relative_files
<config_run_relative_files>`).

If data isn't combining properly, you can see details about the inner workings
with ``--debug=pathmap``.

If any of the data files can't be read, coverage.py will print a warning
indicating the file and the problem.

Expand Down Expand Up @@ -414,6 +398,28 @@ want to keep those files, use the ``--keep`` command-line option.
.. [[[end]]] (checksum: 0bdd83f647ee76363c955bedd9ddf749)
.. _cmd_combine_remapping:

Re-mapping paths
................

To combine data for a source file, coverage has to find its data in each of the
data files. Different test runs may run the same source file from different
locations. For example, different operating systems will use different paths
for the same file, or perhaps each Python version is run from a different
subdirectory. Coverage needs to know that different file paths are actually
the same source file for reporting purposes.

You can tell coverage.py how different source locations relate with a
``[paths]`` section in your configuration file (see :ref:`config_paths`).
It might be more convenient to use the ``[run] relative_files``
setting to store relative file paths (see :ref:`relative_files
<config_run_relative_files>`).

If data isn't combining properly, you can see details about the inner workings
with ``--debug=pathmap``.


.. _cmd_erase:

Erase data: ``coverage erase``
Expand Down
2 changes: 1 addition & 1 deletion doc/config.rst
Expand Up @@ -357,7 +357,7 @@ The first list that has a match will be used.
The ``--debug=pathmap`` option can be used to log details of the re-mapping of
paths. See :ref:`the --debug option <cmd_run_debug>`.

See :ref:`cmd_combine` for more information.
See :ref:`cmd_combine_remapping` and :ref:`source_glob` for more information.


.. _config_report:
Expand Down
29 changes: 24 additions & 5 deletions doc/source.rst
Expand Up @@ -59,10 +59,10 @@ removed from the set.

.. highlight:: ini

The ``include`` and ``omit`` file name patterns follow typical shell syntax:
``*`` matches any number of characters and ``?`` matches a single character.
Patterns that start with a wildcard character are used as-is, other patterns
are interpreted relative to the current directory::
The ``include`` and ``omit`` file name patterns follow common shell syntax,
described below in :ref:`source_glob`. Patterns that start with a wildcard
character are used as-is, other patterns are interpreted relative to the
current directory::

[run]
omit =
Expand All @@ -77,7 +77,7 @@ The ``source``, ``include``, and ``omit`` values all work together to determine
the source that will be measured.

If both ``source`` and ``include`` are set, the ``include`` value is ignored
and a warning is printed on the standard output.
and a warning is issued.


.. _source_reporting:
Expand All @@ -103,3 +103,22 @@ reporting.

Note that these are ways of specifying files to measure. You can also exclude
individual source lines. See :ref:`excluding` for details.


.. _source_glob:

File patterns
-------------

File path patterns are used for include and omit, and for combining path
remapping. They follow common shell syntax:

- ``*`` matches any number of file name characters, not including the directory
separator.

- ``?`` matches a single file name character.

- ``**`` matches any number of nested directory names, including none.

- Both ``/`` and ``\`` will match either a slash or a backslash, to make
cross-platform matching easier.
1 change: 0 additions & 1 deletion tests/test_api.py
Expand Up @@ -71,7 +71,6 @@ def test_unexecuted_file(self):
assert missing == [1]

def test_filenames(self):

self.make_file("mymain.py", """\
import mymod
a = 1
Expand Down

0 comments on commit ec6205a

Please sign in to comment.