ENH: Infinite df approximation for Studentized Range PDF (#15444)

* ENH: add infinite PDF integrator to Studentized Range Distribution The studentized range distribution's PDF integrator has trouble with high df inputs. This adds another, simpler, integrator to approximate those extreme cases. It is derived from the previously added CDF approximation. * DOC: update Studentized Range doc with infinite PDF * TST: Infinite PDF and CDF tests switching and (approximate) accuracy. * DOC: fix typos with pdf approximation equation * STY: fix comment line length * TST: ignore warnings when testing moments This attempts to fix flaky moment tests that fail with an invalid value encountered warning. If the output is truly incorrect, it will be caught by result comparison. * Ignore np warnings in more stud. range tests Co-authored-by: Dominic C <dominicchm@gmail.com>
scipy · Jun 7, 2022 · 7f941db · 7f941db
1 parent 56039d9
commit 7f941db
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 12 deletions.
diff --git a/doc/source/tutorial/stats/continuous_studentized_range.rst b/doc/source/tutorial/stats/continuous_studentized_range.rst
@@ -24,7 +24,7 @@ This distribution has two shape parameters, :math:`k>1` and :math:`\nu>0`, and t
 
 Note: :math:`\phi(z)` and :math:`\Phi(z)` represent the normal PDF and normal CDF, respectively.
 
-When :math:`\nu` exceeds 100,000, the asymptopic approximation of :math:`F(x; k, \nu=\infty)` is used:
+When :math:`\nu` exceeds 100,000, the asymptotic approximation of :math:`F(x; k, \nu=\infty)` or :math:`f(x; k, \nu=\infty)` is used:
 
 .. math::
    :nowrap:
@@ -34,5 +34,13 @@ When :math:`\nu` exceeds 100,000, the asymptopic approximation of :math:`F(x; k,
     [\Phi(x + z) - \Phi(z)]^{k-1} \,dz
     \end{eqnarray*}
 
+.. math::
+   :nowrap:
+
+    \begin{eqnarray*}
+    f(x; k, \nu=\infty) = k(k-1) \int_{-\infty}^{\infty} \phi(z)\phi(x + z)
+    [\Phi(x + z) - \Phi(z)]^{k-2} \,dz
+    \end{eqnarray*}
+
 
 Implementation: `scipy.stats.studentized_range`
diff --git a/scipy/stats/_continuous_distns.py b/scipy/stats/_continuous_distns.py
@@ -9693,7 +9693,7 @@ class studentized_range_gen(rv_continuous):
 
     When :math:`\nu` exceeds 100,000, an asymptotic approximation (infinite
     degrees of freedom) is used to compute the cumulative distribution
-    function [4]_.
+    function [4]_ and probability distribution function.
 
     %(after_notes)s
 
@@ -9810,18 +9810,25 @@ def _single_moment(K, k, df):
         return np.float64(ufunc(K, k, df))
 
     def _pdf(self, x, k, df):
-        cython_symbol = '_studentized_range_pdf'
 
         def _single_pdf(q, k, df):
-            log_const = _stats._studentized_range_pdf_logconst(k, df)
-            arg = [q, k, df, log_const]
-            usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+            # The infinite form of the PDF is derived from the infinite
+            # CDF.
+            if df < 100000:
+                cython_symbol = '_studentized_range_pdf'
+                log_const = _stats._studentized_range_pdf_logconst(k, df)
+                arg = [q, k, df, log_const]
+                usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+                ranges = [(-np.inf, np.inf), (0, np.inf)]
 
-            llc = LowLevelCallable.from_cython(_stats, cython_symbol, usr_data)
+            else:
+                cython_symbol = '_studentized_range_pdf_asymptotic'
+                arg = [q, k]
+                usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
+                ranges = [(-np.inf, np.inf)]
 
-            ranges = [(-np.inf, np.inf), (0, np.inf)]
+            llc = LowLevelCallable.from_cython(_stats, cython_symbol, usr_data)
             opts = dict(epsabs=1e-11, epsrel=1e-12)
-
             return integrate.nquad(llc, ranges=ranges, opts=opts)[0]
 
         ufunc = np.frompyfunc(_single_pdf, 3, 1)
@@ -9840,6 +9847,7 @@ def _single_cdf(q, k, df):
                 arg = [q, k, df, log_const]
                 usr_data = np.array(arg, float).ctypes.data_as(ctypes.c_void_p)
                 ranges = [(-np.inf, np.inf), (0, np.inf)]
+
             else:
                 cython_symbol = '_studentized_range_cdf_asymptotic'
                 arg = [q, k]
@@ -9851,6 +9859,7 @@ def _single_cdf(q, k, df):
             return integrate.nquad(llc, ranges=ranges, opts=opts)[0]
 
         ufunc = np.frompyfunc(_single_cdf, 3, 1)
+
         # clip p-values to ensure they are in [0, 1].
         return np.clip(np.float64(ufunc(x, k, df)), 0, 1)
 

diff --git a/scipy/stats/_stats.pxd b/scipy/stats/_stats.pxd
@@ -3,6 +3,7 @@ cdef double _geninvgauss_pdf(double x, void *user_data) nogil except *
 cdef double _studentized_range_cdf(int n, double[2] x, void *user_data) nogil
 cdef double _studentized_range_cdf_asymptotic(double z, void *user_data) nogil
 cdef double _studentized_range_pdf(int n, double[2] x, void *user_data) nogil
+cdef double _studentized_range_pdf_asymptotic(double z, void *user_data) nogil
 cdef double _studentized_range_moment(int n, double[3] x_arg, void *user_data) nogil
 cdef double _genhyperbolic_pdf(double x, void *user_data) nogil except *
 cdef double _genhyperbolic_logpdf(double x, void *user_data) nogil except *
diff --git a/scipy/stats/_stats.pyx b/scipy/stats/_stats.pyx
@@ -602,6 +602,15 @@ cdef double _studentized_range_pdf(int n, double[2] integration_var,
     return math.exp(log_terms) * math.pow(_Phi(s * q + z) - _Phi(z), k - 2)
 
 
+cdef double _studentized_range_pdf_asymptotic(double z, void *user_data) nogil:
+    # evaluates the integrand of equation (2) by Lund, Lund, page 205. [4]
+    # destined to be used in a LowLevelCallable
+    q = (<double *> user_data)[0]
+    k = (<double *> user_data)[1]
+
+    return k * (k - 1) * _phi(z) * _phi(z + q) * math.pow(_Phi(z + q) - _Phi(z), k - 2)
+
+
 cdef double _studentized_range_moment(int n, double[3] integration_var,
                                       void *user_data) nogil:
     # destined to be used in a LowLevelCallable

diff --git a/scipy/stats/tests/test_distributions.py b/scipy/stats/tests/test_distributions.py
@@ -5502,7 +5502,11 @@ def test_moment_against_mp(self, case_result):
         src_case = case_result["src_case"]
         mp_result = case_result["mp_result"]
         mkv = src_case["m"], src_case["k"], src_case["v"]
-        res = stats.studentized_range.moment(*mkv)
+
+        # Silence invalid value encountered warnings. Actual problems will be
+        # caught by the result comparison.
+        with np.errstate(invalid='ignore'):
+            res = stats.studentized_range.moment(*mkv)
 
         assert_allclose(res, mp_result,
                         atol=src_case["expected_atol"],
@@ -5534,15 +5538,21 @@ def test_pdf_against_cdf(self):
     def test_cdf_against_r(self, r_case_result):
         # Test large `v` values using R
         q, k, v, r_res = r_case_result
-        res = stats.studentized_range.cdf(q, k, v)
+        with np.errstate(invalid='ignore'):
+            res = stats.studentized_range.cdf(q, k, v)
         assert_allclose(res, r_res)
 
     @pytest.mark.slow
     @pytest.mark.xfail_on_32bit("intermittent RuntimeWarning: invalid value.")
     def test_moment_vectorization(self):
         # Test moment broadcasting. Calls `_munp` directly because
         # `rv_continuous.moment` is broken at time of writing. See gh-12192
-        m = stats.studentized_range._munp([1, 2], [4, 5], [10, 11])
+
+        # Silence invalid value encountered warnings. Actual problems will be
+        # caught by the result comparison.
+        with np.errstate(invalid='ignore'):
+            m = stats.studentized_range._munp([1, 2], [4, 5], [10, 11])
+
         assert_allclose(m.shape, (2,))
 
         with pytest.raises(ValueError, match="...could not be broadcast..."):
@@ -5556,6 +5566,38 @@ def test_fitstart_valid(self):
             k, df, _, _ = stats.studentized_range._fitstart([1, 2, 3])
         assert_(stats.studentized_range._argcheck(k, df))
 
+    def test_infinite_df(self):
+        # Check that the CDF and PDF infinite and normal integrators
+        # roughly match for a high df case
+        res = stats.studentized_range.pdf(3, 10, np.inf)
+        res_finite = stats.studentized_range.pdf(3, 10, 99999)
+        assert_allclose(res, res_finite, atol=1e-4, rtol=1e-4)
+
+        res = stats.studentized_range.cdf(3, 10, np.inf)
+        res_finite = stats.studentized_range.cdf(3, 10, 99999)
+        assert_allclose(res, res_finite, atol=1e-4, rtol=1e-4)
+
+    def test_df_cutoff(self):
+        # Test that the CDF and PDF properly switch integrators at df=100,000.
+        # The infinite integrator should be different enough that it fails
+        # an allclose assertion. Also sanity check that using the same
+        # integrator does pass the allclose with a 1-df difference, which
+        # should be tiny.
+
+        res = stats.studentized_range.pdf(3, 10, 100000)
+        res_finite = stats.studentized_range.pdf(3, 10, 99999)
+        res_sanity = stats.studentized_range.pdf(3, 10, 99998)
+        assert_raises(AssertionError, assert_allclose, res, res_finite,
+                      atol=1e-6, rtol=1e-6)
+        assert_allclose(res_finite, res_sanity, atol=1e-6, rtol=1e-6)
+
+        res = stats.studentized_range.cdf(3, 10, 100000)
+        res_finite = stats.studentized_range.cdf(3, 10, 99999)
+        res_sanity = stats.studentized_range.cdf(3, 10, 99998)
+        assert_raises(AssertionError, assert_allclose, res, res_finite,
+                      atol=1e-6, rtol=1e-6)
+        assert_allclose(res_finite, res_sanity, atol=1e-6, rtol=1e-6)
+
     def test_clipping(self):
         # The result of this computation was -9.9253938401489e-14 on some
         # systems. The correct result is very nearly zero, but should not be