Merge pull request #597 from lsst/tickets/DM-32459

DM-32459: Perform follow-up queries to improve no-result diagnostics.
lsst · Nov 11, 2021 · d67b257 · d67b257
2 parents 68e898d + d8a4c92
commit d67b257
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 14 deletions.
diff --git a/python/lsst/daf/butler/registry/queries/_query.py b/python/lsst/daf/butler/registry/queries/_query.py
@@ -281,6 +281,7 @@ def explain_no_results(
         self,
         db: Database, *,
         region: Optional[Region] = None,
+        followup: bool = True,
     ) -> Iterator[str]:
         """Return human-readable messages that may help explain why the query
         yields no results.
@@ -293,6 +294,9 @@ def explain_no_results(
             A region that any result-row regions must overlap in order to be
             yielded.  If not provided, this will be ``self.whereRegion``, if
             that exists.
+        followup : `bool`, optional
+            If `True` (default) perform inexpensive follow-up queries if no
+            diagnostics are available from query generation alone.
 
         Returns
         -------
@@ -305,13 +309,11 @@ def explain_no_results(
         Messages related to post-query filtering are only available if `rows`,
         `any`, or `count` was already called with the same region (with
         ``exact=True`` for the latter two).
-
-        At present, this method only returns messages that are generated while
-        the query is being built or filtered.  In the future, it may perform
-        its own new follow-up queries, which users may wish to short-circuit
-        simply by not continuing to iterate over its results.
         """
-        yield from self._doomed_by
+        from ._builder import QueryBuilder
+        if self._doomed_by:
+            yield from self._doomed_by
+            return
         if self._filtered_by_where:
             yield (
                 f"{self._filtered_by_where} result rows were filtered out because "
@@ -322,6 +324,20 @@ def explain_no_results(
                 f"{self._filtered_by_join} result rows were filtered out because "
                 "one or more regions did not overlap."
             )
+        if (not followup) or self._filtered_by_join or self._filtered_by_where:
+            return
+        # Query didn't return results even before client-side filtering, and
+        # caller says we can do follow-up queries to determine why.
+        # Start by seeing if there are _any_ dimension records for each element
+        # involved.
+        for element in self.graph.elements:
+            summary = QuerySummary(element.graph)
+            builder = QueryBuilder(summary, self.managers)
+            followup_query = builder.finish()
+            if not followup_query.any(db, exact=False):
+                yield f"No dimension records for element '{element.name}' found."
+                yield from followup_query.explain_no_results(db, region=region, followup=False)
+                return
 
     @abstractmethod
     def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:

diff --git a/python/lsst/daf/butler/registry/queries/_results.py b/python/lsst/daf/butler/registry/queries/_results.py
@@ -98,6 +98,9 @@ def __init__(self, db: Database, query: Query, *,
     def __iter__(self) -> Iterator[DataCoordinate]:
         return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))
 
+    def __repr__(self) -> str:
+        return f"<DataCoordinate iterator with dimensions={self._query.graph}>"
+
     @property
     def graph(self) -> DimensionGraph:
         # Docstring inherited from DataCoordinateIterable.
@@ -378,10 +381,10 @@ def explain_no_results(self) -> Iterator[str]:
         iterator has been exhausted, or if `any` or `count` was already called
         (with ``exact=True`` for the latter two).
 
-        At present, this method only returns messages that are generated while
-        the query is being built or filtered.  In the future, it may perform
-        its own new follow-up queries, which users may wish to short-circuit
-        simply by not continuing to iterate over its results.
+        This method first yields messages that are generated while the query is
+        being built or filtered, but may then proceed to diagnostics generated
+        by performing what should be inexpensive follow-up queries.  Callers
+        can short-circuit this at any time by simplying not iterating further.
         """
         return self._query.explain_no_results(self._db)
 
@@ -509,10 +512,10 @@ def explain_no_results(self) -> Iterator[str]:
         iterator has been exhausted, or if `any` or `count` was already called
         (with ``exact=True`` for the latter two).
 
-        At present, this method only returns messages that are generated while
-        the query is being built or filtered.  In the future, it may perform
-        its own new follow-up queries, which users may wish to short-circuit
-        simply by not continuing to iterate over its results.
+        This method first yields messages that are generated while the query is
+        being built or filtered, but may then proceed to diagnostics generated
+        by performing what should be inexpensive follow-up queries.  Callers
+        can short-circuit this at any time by simplying not iterating further.
         """
         raise NotImplementedError()
 
@@ -572,6 +575,9 @@ def __iter__(self) -> Iterator[DatasetRef]:
                 else:
                     yield parentRef.makeComponentRef(component)
 
+    def __repr__(self) -> str:
+        return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"
+
     def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
         # Docstring inherited from DatasetQueryResults.
         yield self
@@ -667,6 +673,9 @@ def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterab
     def __iter__(self) -> Iterator[DatasetRef]:
         return itertools.chain.from_iterable(self._chain)
 
+    def __repr__(self) -> str:
+        return "<DatasetRef iterator for multiple dataset types>"
+
     def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
         # Docstring inherited from DatasetQueryResults.
         return iter(self._chain)

diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py
@@ -2428,6 +2428,30 @@ def testQueryResultSummaries(self):
                 ),
                 messages
             )
+
+        # These queries yield no results due to problems that can be identified
+        # by cheap follow-up queries, yielding helpful diagnostics.
+        for query, snippets in [
+            (
+                # No records for one of the involved dimensions.
+                registry.queryDataIds(["subfilter"]),
+                ["dimension records", "subfilter"],
+            ),
+        ]:
+            self.assertFalse(query.any(execute=True, exact=False))
+            self.assertFalse(query.any(execute=True, exact=True))
+            self.assertEqual(query.count(exact=True), 0)
+            messages = list(query.explain_no_results())
+            self.assertTrue(messages)
+            # Want all expected snippets to appear in at least one message.
+            self.assertTrue(
+                any(
+                    all(snippet in message for snippet in snippets)
+                    for message in query.explain_no_results()
+                ),
+                messages
+            )
+
         # This query yields four overlaps in the database, but one is filtered
         # out in postprocessing.  The count queries aren't accurate because
         # they don't account for duplication that happens due to an internal