perf: hash data files during combining to avoid unneeded work. #1483
When generating many parallel data files, often some data files will be exact
copies of each other. By checking file hashes, we can skip combining the
duplicates, speeding up the process. The 'when' timestamp is dropped from the
data-file metadata so that identical runs produce byte-identical files whose
hashes actually match.
nedbat committed Nov 7, 2022
1 parent bc630b5 commit aaaef95
Showing 5 changed files with 28 additions and 15 deletions.
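At its core the patch is a dedup-by-hash pass over the candidate data files. A minimal standalone sketch of the idea (the `unique_files` helper and the glob pattern are illustrative, not part of the patch):

```python
# A sketch of the dedup-by-hash idea used in coverage/data.py below;
# unique_files and the ".coverage.*" pattern are illustrative only.
import glob
import hashlib

def unique_files(paths):
    """Yield each path whose contents haven't been seen before."""
    seen = set()
    for path in paths:
        with open(path, "rb") as fobj:
            digest = hashlib.sha3_256(fobj.read()).digest()
        if digest in seen:
            continue  # exact byte-for-byte duplicate: skip combining it
        seen.add(digest)
        yield path

# e.g. combine only the distinct parallel data files:
# for f in unique_files(glob.glob(".coverage.*")): ...
```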
30 changes: 22 additions & 8 deletions coverage/data.py
@@ -11,6 +11,7 @@
"""

import glob
import hashlib
import os.path

from coverage.exceptions import CoverageException, NoDataError
@@ -110,6 +111,7 @@ def combine_parallel_data(
     if strict and not files_to_combine:
         raise NoDataError("No data to combine")

+    file_hashes = set()
     files_combined = 0
     for f in files_to_combine:
         if f == data.data_filename():
@@ -118,6 +120,25 @@
             if data._debug.should('dataio'):
                 data._debug.write(f"Skipping combining ourself: {f!r}")
             continue
+
+        try:
+            rel_file_name = os.path.relpath(f)
+        except ValueError:
+            # ValueError can be raised under Windows when os.getcwd() returns a
+            # folder from a different drive than the drive of f, in which case
+            # we print the original value of f instead of its relative path
+            rel_file_name = f
+
+        with open(f, "rb") as fobj:
+            hasher = hashlib.new("sha3_256")
+            hasher.update(fobj.read())
+            sha = hasher.digest()
+        if sha in file_hashes:
+            if message:
+                message(f"Skipping duplicate data {rel_file_name}")
+            continue
+        file_hashes.add(sha)
+
         if data._debug.should('dataio'):
             data._debug.write(f"Combining data file {f!r}")
         try:
@@ -132,14 +153,7 @@
             data.update(new_data, aliases=aliases)
             files_combined += 1
             if message:
-                try:
-                    file_name = os.path.relpath(f)
-                except ValueError:
-                    # ValueError can be raised under Windows when os.getcwd() returns a
-                    # folder from a different drive than the drive of f, in which case
-                    # we print the original value of f instead of its relative path
-                    file_name = f
-                message(f"Combined data file {file_name}")
+                message(f"Combined data file {rel_file_name}")
             if not keep:
                 if data._debug.should('dataio'):
                     data._debug.write(f"Deleting combined data file {f!r}")
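A note on the try/except around `os.path.relpath`: on Windows, `relpath` raises `ValueError` when the file and the current directory live on different drives. A small illustration using `ntpath` (the Windows flavor of `os.path`, importable on any platform; the paths are invented):

```python
import ntpath  # Windows path semantics, available on every platform

# With the working directory on C:\, a file on D:\ has no relative
# path, so relpath raises ValueError and the code falls back to f.
try:
    print(ntpath.relpath("D:\\run\\.coverage.box.1234.5678", start="C:\\work"))
except ValueError as exc:
    print(f"no relative path: {exc}")
```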
3 changes: 0 additions & 3 deletions coverage/sqldata.py
@@ -4,7 +4,6 @@
"""SQLite coverage data."""

import collections
import datetime
import functools
import glob
import itertools
@@ -56,7 +55,6 @@
     -- 'has_arcs' boolean      -- Is this data recording branches?
     -- 'sys_argv' text         -- The coverage command line that recorded the data.
     -- 'version' text          -- The version of coverage.py that made the file.
-    -- 'when' text             -- Datetime when the file was created.
 );

 CREATE TABLE file (
@@ -305,7 +303,6 @@ def _init_db(self, db):
                 [
                     ("sys_argv", str(getattr(sys, "argv", None))),
                     ("version", __version__),
-                    ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
                 ]
             )

3 changes: 1 addition & 2 deletions doc/dbschema.rst
@@ -70,7 +70,6 @@ This is the database schema:
     -- 'has_arcs' boolean      -- Is this data recording branches?
     -- 'sys_argv' text         -- The coverage command line that recorded the data.
     -- 'version' text          -- The version of coverage.py that made the file.
-    -- 'when' text             -- Datetime when the file was created.
 );

 CREATE TABLE file (
@@ -116,7 +115,7 @@ This is the database schema:
         foreign key (file_id) references file (id)
     );

-.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657)
+.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447)

 .. _numbits:
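With 'when' gone, the meta table holds only values that are stable across identical runs, which is what lets duplicate data files hash equal. A quick sketch for inspecting the remaining metadata (assumes a `.coverage` file exists and that `meta` is a key/value table, as in coverage.py's schema):

```python
import sqlite3

# Print the metadata recorded in a coverage data file.  After this
# change, the rows should be stable things like sys_argv and version.
with sqlite3.connect(".coverage") as con:
    for key, value in con.execute("select key, value from meta"):
        print(f"{key} = {value}")
```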
2 changes: 1 addition & 1 deletion tests/test_api.py
@@ -1362,7 +1362,7 @@ def test_combine_no_usable_files(self):

         # Make bogus data files.
         self.make_file(".coverage.bad1", "This isn't a coverage data file.")
-        self.make_file(".coverage.bad2", "This isn't a coverage data file.")
+        self.make_file(".coverage.bad2", "This isn't a coverage data file either.")

         # Combine the parallel coverage data files into .coverage, but nothing is readable.
         cov = coverage.Coverage()
5 changes: 4 additions & 1 deletion tests/test_concurrency.py
@@ -484,7 +484,10 @@ def try_multiprocessing_code(
         out_lines = out.splitlines()
         assert len(out_lines) == nprocs + 1
         assert all(
-            re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line)
+            re.fullmatch(
+                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+",
+                line
+            )
             for line in out_lines
         )
         out = self.run_command("coverage report -m")
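The loosened pattern accepts both combine messages. A tiny self-check (the sample lines are invented, shaped like the test's expected output):

```python
import re

PATTERN = r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+"
samples = [
    "Combined data file .coverage.host.12345.678901",
    "Skipping duplicate data .coverage.host.12346.678902",
]
# Both message forms should fully match the pattern.
assert all(re.fullmatch(PATTERN, line) for line in samples)
```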
