diff --git a/coverage/data.py b/coverage/data.py index 4bdfe3010..1a8f35d0c 100644 --- a/coverage/data.py +++ b/coverage/data.py @@ -11,6 +11,7 @@ """ import glob +import hashlib import os.path from coverage.exceptions import CoverageException, NoDataError @@ -110,6 +111,7 @@ def combine_parallel_data( if strict and not files_to_combine: raise NoDataError("No data to combine") + file_hashes = set() files_combined = 0 for f in files_to_combine: if f == data.data_filename(): @@ -118,6 +120,25 @@ def combine_parallel_data( if data._debug.should('dataio'): data._debug.write(f"Skipping combining ourself: {f!r}") continue + + try: + rel_file_name = os.path.relpath(f) + except ValueError: + # ValueError can be raised under Windows when os.getcwd() returns a + # folder from a different drive than the drive of f, in which case + # we print the original value of f instead of its relative path + rel_file_name = f + + with open(f, "rb") as fobj: + hasher = hashlib.new("sha3_256") + hasher.update(fobj.read()) + sha = hasher.digest() + if sha in file_hashes: + if message: + message(f"Skipping duplicate data {rel_file_name}") + continue + file_hashes.add(sha) + if data._debug.should('dataio'): data._debug.write(f"Combining data file {f!r}") try: @@ -132,14 +153,7 @@ def combine_parallel_data( data.update(new_data, aliases=aliases) files_combined += 1 if message: - try: - file_name = os.path.relpath(f) - except ValueError: - # ValueError can be raised under Windows when os.getcwd() returns a - # folder from a different drive than the drive of f, in which case - # we print the original value of f instead of its relative path - file_name = f - message(f"Combined data file {file_name}") + message(f"Combined data file {rel_file_name}") if not keep: if data._debug.should('dataio'): data._debug.write(f"Deleting combined data file {f!r}") diff --git a/coverage/sqldata.py b/coverage/sqldata.py index 2b7730537..2fbc53f5c 100644 --- a/coverage/sqldata.py +++ b/coverage/sqldata.py @@ -4,7 +4,6 @@ """SQLite coverage data.""" import collections -import datetime import functools import glob import itertools @@ -56,7 +55,6 @@ -- 'has_arcs' boolean -- Is this data recording branches? -- 'sys_argv' text -- The coverage command line that recorded the data. -- 'version' text -- The version of coverage.py that made the file. - -- 'when' text -- Datetime when the file was created. ); CREATE TABLE file ( @@ -305,7 +303,6 @@ def _init_db(self, db): [ ("sys_argv", str(getattr(sys, "argv", None))), ("version", __version__), - ("when", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")), ] ) diff --git a/doc/dbschema.rst b/doc/dbschema.rst index 34e0a55da..42e616d98 100644 --- a/doc/dbschema.rst +++ b/doc/dbschema.rst @@ -70,7 +70,6 @@ This is the database schema: -- 'has_arcs' boolean -- Is this data recording branches? -- 'sys_argv' text -- The coverage command line that recorded the data. -- 'version' text -- The version of coverage.py that made the file. - -- 'when' text -- Datetime when the file was created. ); CREATE TABLE file ( @@ -116,7 +115,7 @@ This is the database schema: foreign key (file_id) references file (id) ); -.. [[[end]]] (checksum: cfce1df016afbb43a5ff94306db56657) +.. [[[end]]] (checksum: 9d87794485a9aa6d9064b735972a3447) .. _numbits: diff --git a/tests/test_api.py b/tests/test_api.py index ce44b9b1c..195452323 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1362,7 +1362,7 @@ def test_combine_no_usable_files(self): # Make bogus data files. self.make_file(".coverage.bad1", "This isn't a coverage data file.") - self.make_file(".coverage.bad2", "This isn't a coverage data file.") + self.make_file(".coverage.bad2", "This isn't a coverage data file either.") # Combine the parallel coverage data files into .coverage, but nothing is readable. cov = coverage.Coverage() diff --git a/tests/test_concurrency.py b/tests/test_concurrency.py index 0a51d4d96..30dae136c 100644 --- a/tests/test_concurrency.py +++ b/tests/test_concurrency.py @@ -484,7 +484,10 @@ def try_multiprocessing_code( out_lines = out.splitlines() assert len(out_lines) == nprocs + 1 assert all( - re.fullmatch(r"Combined data file \.coverage\..*\.\d+\.\d+", line) + re.fullmatch( + r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.\d+", + line + ) for line in out_lines ) out = self.run_command("coverage report -m")