Skip to content

Commit

Permalink
ENH: Infinite df approximation for Studentized Range PDF (#15444)
Browse files Browse the repository at this point in the history
* ENH: add infinite PDF integrator to Studentized Range Distribution

The studentized range distribution's PDF integrator has trouble with high df inputs. This adds another, simpler, integrator to approximate those extreme cases. It is derived from the previously added CDF approximation.

* DOC: update Studentized Range doc with infinite PDF

* TST: Infinite PDF and CDF tests switching and (approximate) accuracy.

* DOC: fix typos with pdf approximation equation

* STY: fix comment line length

* TST: ignore warnings when testing moments

This attempts to fix flaky moment tests that fail with an invalid value encountered warning. If the output is truly incorrect, it will be caught by result comparison.

* Ignore np warnings in more stud. range tests

Co-authored-by: Dominic C <dominicchm@gmail.com>
  • Loading branch information
DominicChm and Dominic C committed Jun 7, 2022
1 parent 56039d9 commit 7f941db
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 12 deletions.
10 changes: 9 additions & 1 deletion doc/source/tutorial/stats/continuous_studentized_range.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ This distribution has two shape parameters, :math:`k>1` and :math:`\nu>0`, and t
Note: :math:`\phi(z)` and :math:`\Phi(z)` represent the normal PDF and normal CDF, respectively.

When :math:`\nu` exceeds 100,000, the asymptopic approximation of :math:`F(x; k, \nu=\infty)` is used:
When :math:`\nu` exceeds 100,000, the asymptotic approximation of :math:`F(x; k, \nu=\infty)` or :math:`f(x; k, \nu=\infty)` is used:

.. math::
:nowrap:
Expand All @@ -34,5 +34,13 @@ When :math:`\nu` exceeds 100,000, the asymptopic approximation of :math:`F(x; k,
[\Phi(x + z) - \Phi(z)]^{k-1} \,dz
\end{eqnarray*}
.. math::
:nowrap:
\begin{eqnarray*}
f(x; k, \nu=\infty) = k(k-1) \int_{-\infty}^{\infty} \phi(z)\phi(x + z)
[\Phi(x + z) - \Phi(z)]^{k-2} \,dz
\end{eqnarray*}
Implementation: `scipy.stats.studentized_range`
25 changes: 17 additions & 8 deletions scipy/stats/_continuous_distns.py
Original file line number Diff line number Diff line change
Expand Up @@ -9693,7 +9693,7 @@ class studentized_range_gen(rv_continuous):
When :math:`\nu` exceeds 100,000, an asymptotic approximation (infinite
degrees of freedom) is used to compute the cumulative distribution
function [4]_.
function [4]_ and probability distribution function.
%(after_notes)s
Expand Down Expand Up @@ -9810,18 +9810,25 @@ def _single_moment(K, k, df):
return np.float64(ufunc(K, k, df))

def _pdf(self, x, k, df):
cython_symbol = '_studentized_range_pdf'

def _single_pdf(q, k, df):
log_const = _stats._studentized_range_pdf_logconst(k, df)
arg = [q, k, df, log_const]
usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
# The infinite form of the PDF is derived from the infinite
# CDF.
if df < 100000:
cython_symbol = '_studentized_range_pdf'
log_const = _stats._studentized_range_pdf_logconst(k, df)
arg = [q, k, df, log_const]
usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
ranges = [(-np.inf, np.inf), (0, np.inf)]

llc = LowLevelCallable.from_cython(_stats, cython_symbol, usr_data)
else:
cython_symbol = '_studentized_range_pdf_asymptotic'
arg = [q, k]
usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
ranges = [(-np.inf, np.inf)]

ranges = [(-np.inf, np.inf), (0, np.inf)]
llc = LowLevelCallable.from_cython(_stats, cython_symbol, usr_data)
opts = dict(epsabs=1e-11, epsrel=1e-12)

return integrate.nquad(llc, ranges=ranges, opts=opts)[0]

ufunc = np.frompyfunc(_single_pdf, 3, 1)
Expand All @@ -9840,6 +9847,7 @@ def _single_cdf(q, k, df):
arg = [q, k, df, log_const]
usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
ranges = [(-np.inf, np.inf), (0, np.inf)]

else:
cython_symbol = '_studentized_range_cdf_asymptotic'
arg = [q, k]
Expand All @@ -9851,6 +9859,7 @@ def _single_cdf(q, k, df):
return integrate.nquad(llc, ranges=ranges, opts=opts)[0]

ufunc = np.frompyfunc(_single_cdf, 3, 1)

# clip p-values to ensure they are in [0, 1].
return np.clip(np.float64(ufunc(x, k, df)), 0, 1)

Expand Down
1 change: 1 addition & 0 deletions scipy/stats/_stats.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ cdef double _geninvgauss_pdf(double x, void *user_data) nogil except *
cdef double _studentized_range_cdf(int n, double[2] x, void *user_data) nogil
cdef double _studentized_range_cdf_asymptotic(double z, void *user_data) nogil
cdef double _studentized_range_pdf(int n, double[2] x, void *user_data) nogil
cdef double _studentized_range_pdf_asymptotic(double z, void *user_data) nogil
cdef double _studentized_range_moment(int n, double[3] x_arg, void *user_data) nogil
cdef double _genhyperbolic_pdf(double x, void *user_data) nogil except *
cdef double _genhyperbolic_logpdf(double x, void *user_data) nogil except *
9 changes: 9 additions & 0 deletions scipy/stats/_stats.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,15 @@ cdef double _studentized_range_pdf(int n, double[2] integration_var,
return math.exp(log_terms) * math.pow(_Phi(s * q + z) - _Phi(z), k - 2)


cdef double _studentized_range_pdf_asymptotic(double z, void *user_data) nogil:
# evaluates the integrand of equation (2) by Lund, Lund, page 205. [4]
# destined to be used in a LowLevelCallable
q = (<double *> user_data)[0]
k = (<double *> user_data)[1]

return k * (k - 1) * _phi(z) * _phi(z + q) * math.pow(_Phi(z + q) - _Phi(z), k - 2)


cdef double _studentized_range_moment(int n, double[3] integration_var,
void *user_data) nogil:
# destined to be used in a LowLevelCallable
Expand Down
48 changes: 45 additions & 3 deletions scipy/stats/tests/test_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5502,7 +5502,11 @@ def test_moment_against_mp(self, case_result):
src_case = case_result["src_case"]
mp_result = case_result["mp_result"]
mkv = src_case["m"], src_case["k"], src_case["v"]
res = stats.studentized_range.moment(*mkv)

# Silence invalid value encountered warnings. Actual problems will be
# caught by the result comparison.
with np.errstate(invalid='ignore'):
res = stats.studentized_range.moment(*mkv)

assert_allclose(res, mp_result,
atol=src_case["expected_atol"],
Expand Down Expand Up @@ -5534,15 +5538,21 @@ def test_pdf_against_cdf(self):
def test_cdf_against_r(self, r_case_result):
# Test large `v` values using R
q, k, v, r_res = r_case_result
res = stats.studentized_range.cdf(q, k, v)
with np.errstate(invalid='ignore'):
res = stats.studentized_range.cdf(q, k, v)
assert_allclose(res, r_res)

@pytest.mark.slow
@pytest.mark.xfail_on_32bit("intermittent RuntimeWarning: invalid value.")
def test_moment_vectorization(self):
# Test moment broadcasting. Calls `_munp` directly because
# `rv_continuous.moment` is broken at time of writing. See gh-12192
m = stats.studentized_range._munp([1, 2], [4, 5], [10, 11])

# Silence invalid value encountered warnings. Actual problems will be
# caught by the result comparison.
with np.errstate(invalid='ignore'):
m = stats.studentized_range._munp([1, 2], [4, 5], [10, 11])

assert_allclose(m.shape, (2,))

with pytest.raises(ValueError, match="...could not be broadcast..."):
Expand All @@ -5556,6 +5566,38 @@ def test_fitstart_valid(self):
k, df, _, _ = stats.studentized_range._fitstart([1, 2, 3])
assert_(stats.studentized_range._argcheck(k, df))

def test_infinite_df(self):
# Check that the CDF and PDF infinite and normal integrators
# roughly match for a high df case
res = stats.studentized_range.pdf(3, 10, np.inf)
res_finite = stats.studentized_range.pdf(3, 10, 99999)
assert_allclose(res, res_finite, atol=1e-4, rtol=1e-4)

res = stats.studentized_range.cdf(3, 10, np.inf)
res_finite = stats.studentized_range.cdf(3, 10, 99999)
assert_allclose(res, res_finite, atol=1e-4, rtol=1e-4)

def test_df_cutoff(self):
# Test that the CDF and PDF properly switch integrators at df=100,000.
# The infinite integrator should be different enough that it fails
# an allclose assertion. Also sanity check that using the same
# integrator does pass the allclose with a 1-df difference, which
# should be tiny.

res = stats.studentized_range.pdf(3, 10, 100000)
res_finite = stats.studentized_range.pdf(3, 10, 99999)
res_sanity = stats.studentized_range.pdf(3, 10, 99998)
assert_raises(AssertionError, assert_allclose, res, res_finite,
atol=1e-6, rtol=1e-6)
assert_allclose(res_finite, res_sanity, atol=1e-6, rtol=1e-6)

res = stats.studentized_range.cdf(3, 10, 100000)
res_finite = stats.studentized_range.cdf(3, 10, 99999)
res_sanity = stats.studentized_range.cdf(3, 10, 99998)
assert_raises(AssertionError, assert_allclose, res, res_finite,
atol=1e-6, rtol=1e-6)
assert_allclose(res_finite, res_sanity, atol=1e-6, rtol=1e-6)

def test_clipping(self):
# The result of this computation was -9.9253938401489e-14 on some
# systems. The correct result is very nearly zero, but should not be
Expand Down

0 comments on commit 7f941db

Please sign in to comment.