Merge pull request #13069 from orionlee/fix_perf_ts_aggregate_downsample_13058

Fix aggregate_downsample() performance degradation
pllim committed Oct 27, 2022
2 parents 40419fa + 6caa5e3 commit b6c89db
Showing 3 changed files with 74 additions and 11 deletions.
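
For context, `aggregate_downsample()` bins a `TimeSeries` into fixed time bins and aggregates each bin (by default with a nan-aware mean). A minimal sketch of the affected call path, with made-up data:

    import numpy as np
    import astropy.units as u
    from astropy.time import Time
    from astropy.timeseries import TimeSeries, aggregate_downsample

    # Made-up, evenly sampled series: 1000 points, one per second.
    ts = TimeSeries(time=Time('2022-01-01T00:00:00') + np.arange(1000) * u.s,
                    data={'flux': np.sin(np.arange(1000) / 50.0)})

    # Downsample into 10 s bins; this is the code path whose per-bin
    # Python loop over `Time` comparisons caused the slowdown in #13058.
    down = aggregate_downsample(ts, time_bin_size=10 * u.s)
    print(len(down))  # 100 bins
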
51 changes: 40 additions & 11 deletions astropy/timeseries/downsample.py
@@ -33,6 +33,18 @@ def reduceat(array, indices, function):
     return np.array(result)
 
 
+def _to_relative_longdouble(time: Time, rel_base: Time) -> np.longdouble:
+    # Convert the time objects into plain ndarrays
+    # so that they can be used to make various operations faster, including
+    # - `np.searchsorted()`
+    # - time comparison.
+    #
+    # Relative time in seconds with np.longdouble type is used to:
+    # - provide a consistent format for searching, irrespective of the format/scale of the inputs,
+    # - retain the best precision possible.
+    return (time - rel_base).to_value(format="sec", subfmt="long")
+
+
 def aggregate_downsample(time_series, *, time_bin_size=None, time_bin_start=None,
                          time_bin_end=None, n_bins=None, aggregate_func=None):
     """
@@ -158,23 +170,40 @@ def aggregate_downsample(time_series, *, time_bin_size=None, time_bin_start=None,
     n_bins = len(bin_start)
 
     # Find the subset of the table that is inside the union of all bins
-    keep = ((ts_sorted.time >= bin_start[0]) & (ts_sorted.time <= bin_end[-1]))
-
-    # Find out indices to be removed because of uncontiguous bins
-    for ind in range(n_bins-1):
-        delete_indices = np.where(np.logical_and(ts_sorted.time > bin_end[ind],
-                                                 ts_sorted.time < bin_start[ind+1]))
+    # - output: `keep`, a mask to create the subset
+    # - use relative time in seconds (`np.longdouble`) in creating `keep` to speed it up
+    #   (`Time` object comparison is rather slow)
+    # - tiny sacrifice on precision (< 0.01 ns on a 64-bit platform)
+    rel_base = ts_sorted.time[0]
+    rel_bin_start = _to_relative_longdouble(bin_start, rel_base)
+    rel_bin_end = _to_relative_longdouble(bin_end, rel_base)
+    rel_ts_sorted_time = _to_relative_longdouble(ts_sorted.time, rel_base)
+    keep = ((rel_ts_sorted_time >= rel_bin_start[0]) & (rel_ts_sorted_time <= rel_bin_end[-1]))
+
+    # Find out indices to be removed because of noncontiguous bins
+    #
+    # Only need to check when adjacent bins have gaps, i.e.,
+    # bin_start[ind + 1] > bin_end[ind]
+    # - see: https://github.com/astropy/astropy/issues/13058#issuecomment-1090846697
+    #   for thoughts on how to reduce the number of loop iterations
+    noncontiguous_bins_indices = np.where(rel_bin_start[1:] > rel_bin_end[:-1])[0]
+    for ind in noncontiguous_bins_indices:
+        delete_indices = np.where(np.logical_and(rel_ts_sorted_time > rel_bin_end[ind],
+                                                 rel_ts_sorted_time < rel_bin_start[ind+1]))
         keep[delete_indices] = False
 
     subset = ts_sorted[keep]
+    rel_subset_time = rel_ts_sorted_time[keep]
 
     # Figure out which bin each row falls in by sorting with respect
     # to the bin end times
-    indices = np.searchsorted(bin_end, ts_sorted.time[keep])
+    indices = np.searchsorted(rel_bin_end, rel_subset_time)
 
     # For time == bin_start[i+1] == bin_end[i], let bin_start take precedence
-    if len(indices) and np.all(bin_start[1:] >= bin_end[:-1]):
-        indices_start = np.searchsorted(subset.time, bin_start[bin_start <= ts_sorted.time[-1]])
+    if len(indices) and np.all(rel_bin_start[1:] >= rel_bin_end[:-1]):
+        indices_start = np.searchsorted(
+            rel_subset_time,
+            rel_bin_start[rel_bin_start <= rel_ts_sorted_time[-1]]
+        )
         indices[indices_start] = np.arange(len(indices_start))
 
     # Determine rows where values are defined
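
The key vectorization in this hunk replaces a Python loop over all n_bins-1 bin boundaries with a single array comparison that finds only the boundaries with gaps. A minimal sketch of the new masking logic, with illustrative bin edges (values invented, not from the PR):

    import numpy as np

    # Relative bin edges in seconds: bins 0 and 1 are contiguous,
    # there is a gap between bins 1 and 2.
    rel_bin_start = np.array([0.0, 10.0, 30.0], dtype=np.longdouble)
    rel_bin_end = np.array([10.0, 20.0, 40.0], dtype=np.longdouble)

    # A boundary has a gap when the next bin starts later than the
    # current bin ends; only those boundaries need the delete loop.
    gap_indices = np.where(rel_bin_start[1:] > rel_bin_end[:-1])[0]
    print(gap_indices)  # [1] -> only the boundary between bins 1 and 2

    rel_times = np.array([5.0, 15.0, 25.0, 35.0], dtype=np.longdouble)
    keep = (rel_times >= rel_bin_start[0]) & (rel_times <= rel_bin_end[-1])
    for ind in gap_indices:
        keep[(rel_times > rel_bin_end[ind]) & (rel_times < rel_bin_start[ind + 1])] = False
    print(keep)  # [True, True, False, True] -> 25.0 falls in the gap
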
@@ -188,7 +217,7 @@ def aggregate_downsample(time_series, *, time_bin_size=None, time_bin_start=None,
     unique_indices = np.unique(indices)
 
     # Add back columns
 
-    subset = ts_sorted[keep]
     for colname in subset.colnames:
 
         if colname == 'time':
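
The bin-assignment step above also moves to the relative representation. A sketch of how `np.searchsorted` maps each row to a bin, including the bin_start precedence fix, again with invented values:

    import numpy as np

    rel_bin_start = np.array([0.0, 10.0, 20.0], dtype=np.longdouble)
    rel_bin_end = np.array([10.0, 20.0, 30.0], dtype=np.longdouble)
    rel_times = np.array([5.0, 10.0, 25.0], dtype=np.longdouble)  # sorted

    # Each time gets the index of the first bin end not smaller than it.
    indices = np.searchsorted(rel_bin_end, rel_times)
    print(indices)  # [0, 0, 2] -- 10.0 lands in bin 0 since it equals bin_end[0]

    # For contiguous bins, a time equal to both bin_end[i] and bin_start[i+1]
    # should belong to bin i+1: overwrite with the bin_start-based index.
    if len(indices) and np.all(rel_bin_start[1:] >= rel_bin_end[:-1]):
        indices_start = np.searchsorted(
            rel_times, rel_bin_start[rel_bin_start <= rel_times[-1]])
        indices[indices_start] = np.arange(len(indices_start))
    print(indices)  # [0, 1, 2] -- 10.0 now falls in bin 1
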
32 changes: 32 additions & 0 deletions astropy/timeseries/tests/test_downsample.py
@@ -192,3 +192,35 @@ def test_downsample_edge_cases(time, time_bin_start, time_bin_end):
         assert down['a'].mask.all()  # all bins placed *beyond* the time span of the data
     elif ts.time.min() < time_bin_start[1]:
         assert down['a'][0] == ts['a'][0]  # single-valued time series falls in *first* bin
+
+
+@pytest.mark.parametrize("diff_from_base",
+                         [1 * u.year, 10 * u.year, 50 * u.year, 100 * u.year])
+def test_time_precision_limit(diff_from_base):
+    """
+    A test of the time precision limit supported by downsample().
+
+    It is related to an implementation detail: time comparison (and, indirectly,
+    sorting) is done with relative time for computational efficiency.
+    The converted relative time has a slight loss of precision, which worsens
+    as the gap between a time and the base time increases, e.g., when downsampling
+    a time series that combines current observations with archival data years back.
+    This test documents the acceptable precision limit.
+
+    See also: https://github.com/astropy/astropy/pull/13069#issuecomment-1093069184
+    """
+    precision_limit = 500 * u.ns
+
+    from astropy.timeseries.downsample import _to_relative_longdouble
+
+    t_base = Time('1980-01-01T12:30:31.000', format='isot', scale='tdb')
+    t2 = t_base + diff_from_base
+    t3 = t2 + precision_limit
+
+    r_t2 = _to_relative_longdouble(t2, t_base)
+    r_t3 = _to_relative_longdouble(t3, t_base)
+
+    # Ensure that in the converted relative time,
+    # t2 and t3 can still be correctly compared
+    assert r_t3 > r_t2
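
A back-of-the-envelope check of why 500 ns is a comfortable bound, assuming an 80-bit extended `np.longdouble` as on x86-64 Linux (on platforms where longdouble is a plain double the margin is much thinner):

    import numpy as np
    import astropy.units as u

    # The absolute quantization error of a longdouble value is at most
    # about eps * magnitude, so for a time 100 years from the base:
    eps = np.finfo(np.longdouble).eps       # ~1.08e-19 for 80-bit extended
    seconds = (100 * u.year).to_value(u.s)  # ~3.16e9 s
    print(eps * seconds * 1e9, "ns")        # ~0.34 ns, far below 500 ns
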
2 changes: 2 additions & 0 deletions docs/changes/timeseries/13069.bugfix.rst
@@ -0,0 +1,2 @@
+Fixed a performance regression in ``timeseries.aggregate_downsample``
+introduced in Astropy 5.0 / #11266.
