perf: hash data files during combining to avoid unneeded work. #1483
When generating many parallel data files, some of them will often be exact
copies of each other. By hashing the files, we can skip combining the
duplicates, which speeds up the process.

On a coverage.py metacov, we had 651 duplicates out of 2189 files (29%).
The time to combine was reduced by 17%.
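
As a rough sketch of the idea (not code from this commit; the real logic lives in
combine_parallel_data() in coverage/data.py, shown in the diff below), de-duplicating
data files by content hash can look like this. The helper name and file paths are
hypothetical, used only for illustration:

    import hashlib

    def unique_data_files(paths):
        # Illustrative helper, not part of coverage.py: yield only the first
        # file seen for each distinct content hash.
        seen = set()
        for path in paths:
            with open(path, "rb") as fobj:
                digest = hashlib.sha3_256(fobj.read()).digest()
            if digest in seen:
                # A byte-for-byte duplicate adds no new coverage data; skip it.
                continue
            seen.add(digest)
            yield path

The commit applies the same check inline while combining, and also deletes the
skipped duplicate files unless --keep is given.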
nedbat committed Nov 8, 2022
1 parent bc630b5 commit 09bc2d6
Showing 6 changed files with 57 additions and 33 deletions.
5 changes: 5 additions & 0 deletions CHANGES.rst
@@ -29,6 +29,11 @@ Unreleased
 - Using ``--format=total`` will write a single total number to the
   output. This can be useful for making badges or writing status updates.
 
+- Combining data files with ``coverage combine`` now quickly hashes the data
+  files to skip files that provide no new information. This can reduce the
+  time needed. For coverage.py's own test suite, combining was about 17%
+  faster.
+
 - An empty file has a coverage total of 100%, but used to fail with
   ``--fail-under``. This has been fixed, closing `issue 1470`_.
 
71 changes: 45 additions & 26 deletions coverage/data.py
@@ -11,6 +11,7 @@
 """
 
 import glob
+import hashlib
 import os.path
 
 from coverage.exceptions import CoverageException, NoDataError
@@ -110,42 +111,60 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")
 
-    files_combined = 0
+    file_hashes = set()
+    combined_any = False
+
     for f in files_to_combine:
         if f == data.data_filename():
             # Sometimes we are combining into a file which is one of the
             # parallel files. Skip that file.
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
-        if data._debug.should('dataio'):
-            data._debug.write(f"Combining data file {f!r}")
+
         try:
-            new_data = CoverageData(f, debug=data._debug)
-            new_data.read()
-        except CoverageException as exc:
-            if data._warn:
-                # The CoverageException has the file name in it, so just
-                # use the message as the warning.
-                data._warn(str(exc))
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+            combine_this_one = sha not in file_hashes
+
+        delete_this_one = not keep
+        if combine_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Combining data file {f!r}")
+            file_hashes.add(sha)
+            try:
+                new_data = CoverageData(f, debug=data._debug)
+                new_data.read()
+            except CoverageException as exc:
+                if data._warn:
+                    # The CoverageException has the file name in it, so just
+                    # use the message as the warning.
+                    data._warn(str(exc))
+                delete_this_one = False
+            else:
+                data.update(new_data, aliases=aliases)
+                combined_any = True
+                if message:
+                    message(f"Combined data file {rel_file_name}")
         else:
-            data.update(new_data, aliases=aliases)
-            files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
-            if not keep:
-                if data._debug.should('dataio'):
-                    data._debug.write(f"Deleting combined data file {f!r}")
-                file_be_gone(f)
-
-    if strict and not files_combined:
+                message(f"Skipping duplicate data {rel_file_name}")
+
+        if delete_this_one:
+            if data._debug.should('dataio'):
+                data._debug.write(f"Deleting data file {f!r}")
+            file_be_gone(f)
+
+    if strict and not combined_any:
         raise NoDataError("No usable data files")


3 changes: 0 additions & 3 deletions coverage/sqldata.py
@@ -4,7 +4,6 @@
 """SQLite coverage data."""
 
 import collections
-import datetime
 import functools
 import glob
 import itertools
@@ -56,7 +55,6 @@
 -- 'has_arcs' boolean      -- Is this data recording branches?
 -- 'sys_argv' text         -- The coverage command line that recorded the data.
 -- 'version' text          -- The version of coverage.py that made the file.
--- 'when' text             -- Datetime when the file was created.
 );
 
 CREATE TABLE file (
@@ -305,7 +303,6 @@ def _init_db(self, db):
             [
                 ("sys_argv", str(getattr(sys, "argv", None))),
                 ("version", __version__),
-                ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
             ]
         )
 

3 changes: 1 addition & 2 deletions doc/dbschema.rst
@@ -70,7 +70,6 @@ This is the database schema:
 -- 'has_arcs' boolean      -- Is this data recording branches?
 -- 'sys_argv' text         -- The coverage command line that recorded the data.
 -- 'version' text          -- The version of coverage.py that made the file.
--- 'when' text             -- Datetime when the file was created.
 );
 
 CREATE TABLE file (
@@ -116,7 +115,7 @@ This is the database schema:
     foreign key (file_id) references file (id)
 );
 
-.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657)
+.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447)
 
 .. _numbits:
2 changes: 1 addition & 1 deletion tests/test_api.py
@@ -1362,7 +1362,7 @@ def test_combine_no_usable_files(self):
 
         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")
 
         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
6 changes: 5 additions & 1 deletion tests/test_concurrency.py
@@ -484,9 +484,13 @@ def try_multiprocessing_code(
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
+        assert len(glob.glob(".coverage.*")) == 0
         out = self.run_command("coverage report -m")
 
         last_line = self.squeezed_lines(out)[-1]
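
As a side note (not part of the commit), the widened pattern in this test accepts both
message forms the combiner can now emit. A quick illustrative check, with hypothetical
data-file names:

    import re

    pattern = r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+"

    # Both messages printed by `coverage combine` should satisfy the test's regex.
    assert re.fullmatch(pattern, "Combined data file .coverage.myhost.12345.678901")
    assert re.fullmatch(pattern, "Skipping duplicate data .coverage.myhost.12345.678902")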
