Skip to content

Commit

Permalink
Merge pull request #597 from lsst/tickets/DM-32459
Browse files Browse the repository at this point in the history
DM-32459: Perform follow-up queries to improve no-result diagnostics.
  • Loading branch information
TallJimbo committed Nov 11, 2021
2 parents 68e898d + d8a4c92 commit d67b257
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 14 deletions.
28 changes: 22 additions & 6 deletions python/lsst/daf/butler/registry/queries/_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ def explain_no_results(
self,
db: Database, *,
region: Optional[Region] = None,
followup: bool = True,
) -> Iterator[str]:
"""Return human-readable messages that may help explain why the query
yields no results.
Expand All @@ -293,6 +294,9 @@ def explain_no_results(
A region that any result-row regions must overlap in order to be
yielded. If not provided, this will be ``self.whereRegion``, if
that exists.
followup : `bool`, optional
If `True` (default) perform inexpensive follow-up queries if no
diagnostics are available from query generation alone.
Returns
-------
Expand All @@ -305,13 +309,11 @@ def explain_no_results(
Messages related to post-query filtering are only available if `rows`,
`any`, or `count` was already called with the same region (with
``exact=True`` for the latter two).
At present, this method only returns messages that are generated while
the query is being built or filtered. In the future, it may perform
its own new follow-up queries, which users may wish to short-circuit
simply by not continuing to iterate over its results.
"""
yield from self._doomed_by
from ._builder import QueryBuilder
if self._doomed_by:
yield from self._doomed_by
return
if self._filtered_by_where:
yield (
f"{self._filtered_by_where} result rows were filtered out because "
Expand All @@ -322,6 +324,20 @@ def explain_no_results(
f"{self._filtered_by_join} result rows were filtered out because "
"one or more regions did not overlap."
)
if (not followup) or self._filtered_by_join or self._filtered_by_where:
return
# Query didn't return results even before client-side filtering, and
# caller says we can do follow-up queries to determine why.
# Start by seeing if there are _any_ dimension records for each element
# involved.
for element in self.graph.elements:
summary = QuerySummary(element.graph)
builder = QueryBuilder(summary, self.managers)
followup_query = builder.finish()
if not followup_query.any(db, exact=False):
yield f"No dimension records for element '{element.name}' found."
yield from followup_query.explain_no_results(db, region=region, followup=False)
return

@abstractmethod
def getDatasetColumns(self) -> Optional[DatasetQueryColumns]:
Expand Down
25 changes: 17 additions & 8 deletions python/lsst/daf/butler/registry/queries/_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ def __init__(self, db: Database, query: Query, *,
def __iter__(self) -> Iterator[DataCoordinate]:
return (self._query.extractDataId(row, records=self._records) for row in self._query.rows(self._db))

def __repr__(self) -> str:
return f"<DataCoordinate iterator with dimensions={self._query.graph}>"

@property
def graph(self) -> DimensionGraph:
# Docstring inherited from DataCoordinateIterable.
Expand Down Expand Up @@ -378,10 +381,10 @@ def explain_no_results(self) -> Iterator[str]:
iterator has been exhausted, or if `any` or `count` was already called
(with ``exact=True`` for the latter two).
At present, this method only returns messages that are generated while
the query is being built or filtered. In the future, it may perform
its own new follow-up queries, which users may wish to short-circuit
simply by not continuing to iterate over its results.
This method first yields messages that are generated while the query is
being built or filtered, but may then proceed to diagnostics generated
by performing what should be inexpensive follow-up queries. Callers
can short-circuit this at any time by simplying not iterating further.
"""
return self._query.explain_no_results(self._db)

Expand Down Expand Up @@ -509,10 +512,10 @@ def explain_no_results(self) -> Iterator[str]:
iterator has been exhausted, or if `any` or `count` was already called
(with ``exact=True`` for the latter two).
At present, this method only returns messages that are generated while
the query is being built or filtered. In the future, it may perform
its own new follow-up queries, which users may wish to short-circuit
simply by not continuing to iterate over its results.
This method first yields messages that are generated while the query is
being built or filtered, but may then proceed to diagnostics generated
by performing what should be inexpensive follow-up queries. Callers
can short-circuit this at any time by simplying not iterating further.
"""
raise NotImplementedError()

Expand Down Expand Up @@ -572,6 +575,9 @@ def __iter__(self) -> Iterator[DatasetRef]:
else:
yield parentRef.makeComponentRef(component)

def __repr__(self) -> str:
return f"<DatasetRef iterator for [components of] {self._datasetType.name}>"

def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
# Docstring inherited from DatasetQueryResults.
yield self
Expand Down Expand Up @@ -667,6 +673,9 @@ def __init__(self, chain: Sequence[ParentDatasetQueryResults], doomed_by: Iterab
def __iter__(self) -> Iterator[DatasetRef]:
return itertools.chain.from_iterable(self._chain)

def __repr__(self) -> str:
return "<DatasetRef iterator for multiple dataset types>"

def byParentDatasetType(self) -> Iterator[ParentDatasetQueryResults]:
# Docstring inherited from DatasetQueryResults.
return iter(self._chain)
Expand Down
24 changes: 24 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2428,6 +2428,30 @@ def testQueryResultSummaries(self):
),
messages
)

# These queries yield no results due to problems that can be identified
# by cheap follow-up queries, yielding helpful diagnostics.
for query, snippets in [
(
# No records for one of the involved dimensions.
registry.queryDataIds(["subfilter"]),
["dimension records", "subfilter"],
),
]:
self.assertFalse(query.any(execute=True, exact=False))
self.assertFalse(query.any(execute=True, exact=True))
self.assertEqual(query.count(exact=True), 0)
messages = list(query.explain_no_results())
self.assertTrue(messages)
# Want all expected snippets to appear in at least one message.
self.assertTrue(
any(
all(snippet in message for snippet in snippets)
for message in query.explain_no_results()
),
messages
)

# This query yields four overlaps in the database, but one is filtered
# out in postprocessing. The count queries aren't accurate because
# they don't account for duplication that happens due to an internal
Expand Down

0 comments on commit d67b257

Please sign in to comment.