fix: lcov report fixes (#1849)

* [lcov] Style improvements to lcovreport.py - Rename LcovReporter.get_lcov to LcovReporter.lcov_file because the old name sounds like it returns something. lcov_file was selected by analogy with XmlReporter.xml_file. - Move the increment of self.total from lcov_file to the loop in LcovReporter.report for better separation of concerns. * [lcov] honor skip_empty; improve consistency with xml2lcov Implement skip_empty mode in LcovReporter. Also, improve consistency with what you get by running ‘coverage xml’ and then ‘xml2lcov’ by not emitting any vacuous TN: lines, not emitting LF:0 LH:0 for empty files, and not emitting BRF:0 BRH:0 for files with no branches. * [lcov] don’t write DA-line checksums by default DA-line checksums bulk up the .lcov file and provide only a weak assurance that the source file being processed for analysis matches the source file that was used to generate coverage data. Current versions of the LCOV suite discourage their use. Add a boolean configuration option, lcov.line_checksums, which controls whether checksums are generated for DA lines. Consistent with the current behavior of the LCOV suite, the default is to not generate any checksums. * [lcov] Re-implement lcov reports using the same algorithm as XML reports. This fixes five serious bugs: - The first field of a BRDA: line may not be zero (#1846). - The first field of a BRDA: line is supposed to be the *source* line of each instrumented branch, not the destination line. - The fourth field of a BRDA: line is supposed to be “-” when the branch was *never reached*, not when it was reached but never/always taken (which is what a branch’s presence in missing_arcs means). As far as I can tell, coverage.py currently doesn’t know of the existence of branches that were never reached. - The decision of whether to emit DA: and BRDA: lines at all is now taken strictly according to what’s in analysis.statements. This is important because some lines may appear in analysis.executed and/or analysis.executed_branch_arcs but *not* in analysis.statements. For example, the beginnings of docstrings are like this, as is the phantom line 1 of an empty __init__.py in Python 3.10 and earlier. (I am pleased to note that the special casing of empty __init__.py in the test suite is no longer required after this patch.) - We no longer attempt to call branch-coverage-related Analysis methods when analysis.has_arcs is false. And two minor annoyances: - DA: and BRDA: lines are now emitted strictly in ascending order by (source) line number. - Source file records are now sorted by *relative* pathname, not absolute pathname from the coverage database. --------- Co-authored-by: Zack Weinberg <[email protected]>
nedbat · Sep 11, 2024 · 21d3e31 · 21d3e31
1 parent 595ea22
commit 21d3e31
Show file tree

Hide file tree

Showing 6 changed files with 204 additions and 165 deletions.
diff --git a/coverage/config.py b/coverage/config.py
@@ -250,6 +250,7 @@ def __init__(self) -> None:
 
         # Defaults for [lcov]
         self.lcov_output = "coverage.lcov"
+        self.lcov_line_checksums = False
 
         # Defaults for [paths]
         self.paths: dict[str, list[str]] = {}
@@ -428,6 +429,7 @@ def copy(self) -> CoverageConfig:
 
         # [lcov]
         ("lcov_output", "lcov:output"),
+        ("lcov_line_checksums", "lcov:line_checksums", "boolean")
     ]
 
     def _set_attr_from_config_option(

diff --git a/coverage/env.py b/coverage/env.py
@@ -99,10 +99,6 @@ class PYBEHAVIOR:
     # Some words are keywords in some places, identifiers in other places.
     soft_keywords = (PYVERSION >= (3, 10))
 
-    # Modules start with a line numbered zero. This means empty modules have
-    # only a 0-number line, which is ignored, giving a truly empty module.
-    empty_is_empty = (PYVERSION >= (3, 11, 0, "beta", 4))
-
     # PEP669 Low Impact Monitoring: https://peps.python.org/pep-0669/
     pep669 = bool(getattr(sys, "monitoring", None))
 

diff --git a/coverage/lcovreport.py b/coverage/lcovreport.py
@@ -22,9 +22,12 @@
 
 def line_hash(line: str) -> str:
     """Produce a hash of a source line for use in the LCOV file."""
-    # The LCOV file format requires MD5 as a fingerprint of the file. This is
-    # not a security use.  Some security scanners raise alarms about the use of
-    # MD5 here, but it is a false positive. This is not a security concern.
+    # The LCOV file format optionally allows each line to be MD5ed as a
+    # fingerprint of the file.  This is not a security use.  Some security
+    # scanners raise alarms about the use of MD5 here, but it is a false
+    # positive.  This is not a security concern.
+    # The unusual encoding of the MD5 hash, as a base64 sequence with the
+    # trailing = signs stripped, is specified by the LCOV file format.
     hashed = hashlib.md5(line.encode("utf-8")).digest()
     return base64.b64encode(hashed).decode("ascii").rstrip("=")
 
@@ -36,6 +39,7 @@ class LcovReporter:
 
     def __init__(self, coverage: Coverage) -> None:
         self.coverage = coverage
+        self.config = coverage.config
         self.total = Numbers(self.coverage.config.precision)
 
     def report(self, morfs: Iterable[TMorf] | None, outfile: IO[str]) -> float:
@@ -49,85 +53,104 @@ def report(self, morfs: Iterable[TMorf] | None, outfile: IO[str]) -> float:
         self.coverage.get_data()
         outfile = outfile or sys.stdout
 
-        for fr, analysis in get_analysis_to_report(self.coverage, morfs):
-            self.get_lcov(fr, analysis, outfile)
+        # ensure file records are sorted by the _relative_ filename, not the full path
+        to_report = [
+            (fr.relative_filename(), fr, analysis)
+            for fr, analysis in get_analysis_to_report(self.coverage, morfs)
+        ]
+        to_report.sort()
+
+        for fname, fr, analysis in to_report:
+            self.total += analysis.numbers
+            self.lcov_file(fname, fr, analysis, outfile)
 
         return self.total.n_statements and self.total.pc_covered
 
-    def get_lcov(self, fr: FileReporter, analysis: Analysis, outfile: IO[str]) -> None:
+    def lcov_file(
+        self,
+        rel_fname: str,
+        fr: FileReporter,
+        analysis: Analysis,
+        outfile: IO[str],
+    ) -> None:
         """Produces the lcov data for a single file.
 
         This currently supports both line and branch coverage,
         however function coverage is not supported.
         """
-        self.total += analysis.numbers
-
-        outfile.write("TN:\n")
-        outfile.write(f"SF:{fr.relative_filename()}\n")
-        source_lines = fr.source().splitlines()
-        for covered in sorted(analysis.executed):
-            if covered in analysis.excluded:
-                # Do not report excluded as executed
-                continue
-            # Note: Coverage.py currently only supports checking *if* a line
-            # has been executed, not how many times, so we set this to 1 for
-            # nice output even if it's technically incorrect.
-
-            # The lines below calculate a 64-bit encoded md5 hash of the line
-            # corresponding to the DA lines in the lcov file, for either case
-            # of the line being covered or missed in coverage.py. The final two
-            # characters of the encoding ("==") are removed from the hash to
-            # allow genhtml to run on the resulting lcov file.
-            if source_lines:
-                if covered-1 >= len(source_lines):
-                    break
-                line = source_lines[covered-1]
-            else:
-                line = ""
-            outfile.write(f"DA:{covered},1,{line_hash(line)}\n")
-
-        for missed in sorted(analysis.missing):
-            # We don't have to skip excluded lines here, because `missing`
-            # already doesn't have them.
-            assert source_lines
-            line = source_lines[missed-1]
-            outfile.write(f"DA:{missed},0,{line_hash(line)}\n")
-
-        outfile.write(f"LF:{analysis.numbers.n_statements}\n")
-        outfile.write(f"LH:{analysis.numbers.n_executed}\n")
-
-        # More information dense branch coverage data.
-        missing_arcs = analysis.missing_branch_arcs()
-        executed_arcs = analysis.executed_branch_arcs()
-        for block_number, block_line_number in enumerate(
-            sorted(analysis.branch_stats().keys()),
-        ):
-            for branch_number, line_number in enumerate(
-                sorted(missing_arcs[block_line_number]),
-            ):
-                # The exit branches have a negative line number,
-                # this will not produce valid lcov. Setting
-                # the line number of the exit branch to 0 will allow
-                # for valid lcov, while preserving the data.
-                line_number = max(line_number, 0)
-                outfile.write(f"BRDA:{line_number},{block_number},{branch_number},-\n")
-
-            # The start value below allows for the block number to be
-            # preserved between these two for loops (stopping the loop from
-            # resetting the value of the block number to 0).
-            for branch_number, line_number in enumerate(
-                sorted(executed_arcs[block_line_number]),
-                start=len(missing_arcs[block_line_number]),
-            ):
-                line_number = max(line_number, 0)
-                outfile.write(f"BRDA:{line_number},{block_number},{branch_number},1\n")
-
-        # Summary of the branch coverage.
+
+        if analysis.numbers.n_statements == 0:
+            if self.config.skip_empty:
+                return
+
+        outfile.write(f"SF:{rel_fname}\n")
+
+        if self.config.lcov_line_checksums:
+            source_lines = fr.source().splitlines()
+
+        # Emit a DA: record for each line of the file.
+        lines = sorted(analysis.statements)
+        hash_suffix = ""
+        for line in lines:
+            if self.config.lcov_line_checksums:
+                hash_suffix = "," + line_hash(source_lines[line-1])
+            # Q: can we get info about the number of times a statement is
+            # executed?  If so, that should be recorded here.
+            hit = int(line not in analysis.missing)
+            outfile.write(f"DA:{line},{hit}{hash_suffix}\n")
+
+        if analysis.numbers.n_statements > 0:
+            outfile.write(f"LF:{analysis.numbers.n_statements}\n")
+            outfile.write(f"LH:{analysis.numbers.n_executed}\n")
+
+        # More information dense branch coverage data, if available.
         if analysis.has_arcs:
             branch_stats = analysis.branch_stats()
+            executed_arcs = analysis.executed_branch_arcs()
+            missing_arcs = analysis.missing_branch_arcs()
+
+            for line in lines:
+                if line in branch_stats:
+                    # The meaning of a BRDA: line is not well explained in the lcov
+                    # documentation.  Based on what genhtml does with them, however,
+                    # the interpretation is supposed to be something like this:
+                    # BRDA: <line>, <block>, <branch>, <hit>
+                    # where <line> is the source line number of the *origin* of the
+                    # branch; <block> is an arbitrary number which distinguishes multiple
+                    # control flow operations on a single line; <branch> is an arbitrary
+                    # number which distinguishes the possible destinations of the specific
+                    # control flow operation identified by <line> + <block>; and <hit> is
+                    # either the hit count for <line> + <block> + <branch> or "-" meaning
+                    # that <line> + <block> was never *reached*.  <line> must be >= 1,
+                    # and <block>, <branch>, <hit> must be >= 0.
+
+                    # This is only one possible way to map our sets of executed and
+                    # not-executed arcs to BRDA codes. It seems to produce reasonable
+                    # results when fed through genhtml.
+
+                    # Q: can we get counts of the number of times each arc was executed?
+                    # branch_stats has "total" and "taken" counts for each branch, but it
+                    # doesn't have "taken" broken down by destination.
+                    destinations = {}
+                    for dst in executed_arcs[line]:
+                        destinations[(int(dst < 0), abs(dst))] = 1
+                    for dst in missing_arcs[line]:
+                        destinations[(int(dst < 0), abs(dst))] = 0
+
+                    if all(v == 0 for v in destinations.values()):
+                        # When _none_ of the out arcs from 'line' were executed, presume
+                        # 'line' was never reached.
+                        for branch, _ in enumerate(sorted(destinations.keys())):
+                            outfile.write(f"BRDA:{line},0,{branch},-\n")
+                    else:
+                        for branch, (_, hit) in enumerate(sorted(destinations.items())):
+                            outfile.write(f"BRDA:{line},0,{branch},{hit}\n")
+
+            # Summary of the branch coverage.
             brf = sum(t for t, k in branch_stats.values())
             brh = brf - sum(t - k for t, k in branch_stats.values())
-            outfile.write(f"BRF:{brf}\n")
-            outfile.write(f"BRH:{brh}\n")
+            if brf > 0:
+                outfile.write(f"BRF:{brf}\n")
+                outfile.write(f"BRH:{brh}\n")
 
         outfile.write("end_of_record\n")
diff --git a/doc/config.rst b/doc/config.rst
@@ -890,3 +890,14 @@ Settings particular to LCOV reporting (see :ref:`cmd_lcov`).
 .............
 
 (string, default "coverage.lcov") Where to write the LCOV file.
+
+[lcov] line_checksums
+.....................
+
+(boolean, default false) Whether to write per-line checksums as part of the
+lcov file.  Because these checksums cover only lines with actual code on
+them, and do not verify the ordering of lines, they provide only a weak
+assurance that the source code available to analysis tools (e.g. ``genhtml``)
+matches the code that was used to generate the coverage data.
+
+.. versionadded:: 7.6.2