Merge pull request #13069 from orionlee/fix_perf_ts_aggregate_downsample_13058

Fix aggregate_downsample() performance degradation
pllim committed Oct 27, 2022
2 parents 40419fa + 6caa5e3 commit b6c89db
Showing 3 changed files with 74 additions and 11 deletions.
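
For context, `aggregate_downsample()` bins a `TimeSeries` into fixed time bins and aggregates each bin (by default with a nan-aware mean). A minimal sketch of the affected call path, with made-up data:

    import numpy as np
    import astropy.units as u
    from astropy.time import Time
    from astropy.timeseries import TimeSeries, aggregate_downsample

    # Made-up, evenly sampled series: 1000 points, one per second.
    ts = TimeSeries(time=Time('2022-01-01T00:00:00') + np.arange(1000) * u.s,
                    data={'flux': np.sin(np.arange(1000) / 50.0)})

    # Downsample into 10 s bins; this is the code path whose per-bin
    # Python loop over `Time` comparisons caused the slowdown in #13058.
    down = aggregate_downsample(ts, time_bin_size=10 * u.s)
    print(len(down))  # 100 bins
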
51 changes: 40 additions & 11 deletions astropy/timeseries/downsample.py
@@ -33,6 +33,18 @@ def reduceat(array, indices, function):
     return np.array(result)
 
 
+def _to_relative_longdouble(time: Time, rel_base: Time) -> np.longdouble:
+    # Convert the time objects into plain ndarrays
+    # so that they can be used to make various operations faster, including
+    # - `np.searchsorted()`
+    # - time comparison.
+    #
+    # Relative time in seconds with np.longdouble type is used to:
+    # - provide a consistent format for searching, irrespective of the format/scale of the inputs,
+    # - retain the best precision possible.
+    return (time - rel_base).to_value(format="sec", subfmt="long")
+
+
 def aggregate_downsample(time_series, *, time_bin_size=None, time_bin_start=None,
                          time_bin_end=None, n_bins=None, aggregate_func=None):
     """
@@ -158,23 +170,40 @@ def aggregate_downsample(time_series, *, time_bin_size=None, time_bin_start=None,
     n_bins = len(bin_start)
 
     # Find the subset of the table that is inside the union of all bins
-    keep = ((ts_sorted.time >= bin_start[0]) & (ts_sorted.time <= bin_end[-1]))
-
-    # Find out indices to be removed because of uncontiguous bins
-    for ind in range(n_bins-1):
-        delete_indices = np.where(np.logical_and(ts_sorted.time > bin_end[ind],
-                                                 ts_sorted.time < bin_start[ind+1]))
+    # - output: `keep`, a mask to create the subset
+    # - use relative time in seconds (`np.longdouble`) in creating `keep` to speed it up
+    #   (`Time` object comparison is rather slow)
+    # - tiny sacrifice on precision (< 0.01 ns on a 64-bit platform)
+    rel_base = ts_sorted.time[0]
+    rel_bin_start = _to_relative_longdouble(bin_start, rel_base)
+    rel_bin_end = _to_relative_longdouble(bin_end, rel_base)
+    rel_ts_sorted_time = _to_relative_longdouble(ts_sorted.time, rel_base)
+    keep = ((rel_ts_sorted_time >= rel_bin_start[0]) & (rel_ts_sorted_time <= rel_bin_end[-1]))
+
+    # Find out indices to be removed because of noncontiguous bins
+    #
+    # Only need to check when adjacent bins have gaps, i.e.,
+    # bin_start[ind + 1] > bin_end[ind]
+    # - see: https://github.com/astropy/astropy/issues/13058#issuecomment-1090846697
+    #   for thoughts on how to reduce the number of loop iterations
+    noncontiguous_bins_indices = np.where(rel_bin_start[1:] > rel_bin_end[:-1])[0]
+    for ind in noncontiguous_bins_indices:
+        delete_indices = np.where(np.logical_and(rel_ts_sorted_time > rel_bin_end[ind],
+                                                 rel_ts_sorted_time < rel_bin_start[ind+1]))
         keep[delete_indices] = False
 
     subset = ts_sorted[keep]
+    rel_subset_time = rel_ts_sorted_time[keep]
 
     # Figure out which bin each row falls in by sorting with respect
     # to the bin end times
-    indices = np.searchsorted(bin_end, ts_sorted.time[keep])
+    indices = np.searchsorted(rel_bin_end, rel_subset_time)
 
     # For time == bin_start[i+1] == bin_end[i], let bin_start take precedence
-    if len(indices) and np.all(bin_start[1:] >= bin_end[:-1]):
-        indices_start = np.searchsorted(subset.time, bin_start[bin_start <= ts_sorted.time[-1]])
+    if len(indices) and np.all(rel_bin_start[1:] >= rel_bin_end[:-1]):
+        indices_start = np.searchsorted(
+            rel_subset_time,
+            rel_bin_start[rel_bin_start <= rel_ts_sorted_time[-1]]
+        )
         indices[indices_start] = np.arange(len(indices_start))
 
     # Determine rows where values are defined
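
The key vectorization in this hunk replaces a Python loop over all n_bins-1 bin boundaries with a single array comparison that finds only the boundaries with gaps. A minimal sketch of the new masking logic, with illustrative bin edges (values invented, not from the PR):

    import numpy as np

    # Relative bin edges in seconds: bins 0 and 1 are contiguous,
    # there is a gap between bins 1 and 2.
    rel_bin_start = np.array([0.0, 10.0, 30.0], dtype=np.longdouble)
    rel_bin_end = np.array([10.0, 20.0, 40.0], dtype=np.longdouble)

    # A boundary has a gap when the next bin starts later than the
    # current bin ends; only those boundaries need the delete loop.
    gap_indices = np.where(rel_bin_start[1:] > rel_bin_end[:-1])[0]
    print(gap_indices)  # [1] -> only the boundary between bins 1 and 2

    rel_times = np.array([5.0, 15.0, 25.0, 35.0], dtype=np.longdouble)
    keep = (rel_times >= rel_bin_start[0]) & (rel_times <= rel_bin_end[-1])
    for ind in gap_indices:
        keep[(rel_times > rel_bin_end[ind]) & (rel_times < rel_bin_start[ind + 1])] = False
    print(keep)  # [True, True, False, True] -> 25.0 falls in the gap
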
@@ -188,7 +217,7 @@ def aggregate_downsample(time_series, *, time_bin_size=None, time_bin_start=None,
     unique_indices = np.unique(indices)
 
     # Add back columns
 
-    subset = ts_sorted[keep]
     for colname in subset.colnames:
 
         if colname == 'time':
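
The bin-assignment step above also moves to the relative representation. A sketch of how `np.searchsorted` maps each row to a bin, including the bin_start precedence fix, again with invented values:

    import numpy as np

    rel_bin_start = np.array([0.0, 10.0, 20.0], dtype=np.longdouble)
    rel_bin_end = np.array([10.0, 20.0, 30.0], dtype=np.longdouble)
    rel_times = np.array([5.0, 10.0, 25.0], dtype=np.longdouble)  # sorted

    # Each time gets the index of the first bin end not smaller than it.
    indices = np.searchsorted(rel_bin_end, rel_times)
    print(indices)  # [0, 0, 2] -- 10.0 lands in bin 0 since it equals bin_end[0]

    # For contiguous bins, a time equal to both bin_end[i] and bin_start[i+1]
    # should belong to bin i+1: overwrite with the bin_start-based index.
    if len(indices) and np.all(rel_bin_start[1:] >= rel_bin_end[:-1]):
        indices_start = np.searchsorted(
            rel_times, rel_bin_start[rel_bin_start <= rel_times[-1]])
        indices[indices_start] = np.arange(len(indices_start))
    print(indices)  # [0, 1, 2] -- 10.0 now falls in bin 1
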
32 changes: 32 additions & 0 deletions astropy/timeseries/tests/test_downsample.py
@@ -192,3 +192,35 @@ def test_downsample_edge_cases(time, time_bin_start, time_bin_end):
         assert down['a'].mask.all()  # all bins placed *beyond* the time span of the data
     elif ts.time.min() < time_bin_start[1]:
         assert down['a'][0] == ts['a'][0]  # single-valued time series falls in *first* bin
+
+
+@pytest.mark.parametrize("diff_from_base",
+                         [1 * u.year, 10 * u.year, 50 * u.year, 100 * u.year])
+def test_time_precision_limit(diff_from_base):
+    """
+    A test of the time precision limit supported by downsample().
+
+    It is related to an implementation detail: time comparison (and, indirectly,
+    sorting) is done with relative time for computational efficiency.
+    The converted relative time has a slight loss of precision, which worsens
+    as the gap between a time and the base time increases, e.g., when downsampling
+    a time series that combines current observations with archival data years back.
+    This test documents the acceptable precision limit.
+
+    See also: https://github.com/astropy/astropy/pull/13069#issuecomment-1093069184
+    """
+    precision_limit = 500 * u.ns
+
+    from astropy.timeseries.downsample import _to_relative_longdouble
+
+    t_base = Time('1980-01-01T12:30:31.000', format='isot', scale='tdb')
+    t2 = t_base + diff_from_base
+    t3 = t2 + precision_limit
+
+    r_t2 = _to_relative_longdouble(t2, t_base)
+    r_t3 = _to_relative_longdouble(t3, t_base)
+
+    # Ensure that in the converted relative time,
+    # t2 and t3 can still be correctly compared
+    assert r_t3 > r_t2
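
A back-of-the-envelope check of why 500 ns is a comfortable bound, assuming an 80-bit extended `np.longdouble` as on x86-64 Linux (on platforms where longdouble is a plain double the margin is much thinner):

    import numpy as np
    import astropy.units as u

    # The absolute quantization error of a longdouble value is at most
    # about eps * magnitude, so for a time 100 years from the base:
    eps = np.finfo(np.longdouble).eps       # ~1.08e-19 for 80-bit extended
    seconds = (100 * u.year).to_value(u.s)  # ~3.16e9 s
    print(eps * seconds * 1e9, "ns")        # ~0.34 ns, far below 500 ns
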
2 changes: 2 additions & 0 deletions docs/changes/timeseries/13069.bugfix.rst
@@ -0,0 +1,2 @@
+Fixed a performance regression in ``timeseries.aggregate_downsample``
+introduced in Astropy 5.0 / #11266.
