ENH: stats.pareto fit improvement for parameter combinations (#15567)
Co-authored-by: Pamphile Roy <roy.pamphile@gmail.com>
swallan and tupui committed Mar 14, 2022
1 parent 7de016a commit 0ed66c7
Showing 2 changed files with 125 additions and 33 deletions.
87 changes: 77 additions & 10 deletions scipy/stats/_continuous_distns.py
@@ -30,6 +30,7 @@
from ._constants import (_XMIN, _EULER, _ZETA3,
                         _SQRT_2_OVER_PI, _LOG_SQRT_2_OVER_PI)
import scipy.stats._boost as _boost
from scipy.optimize import root_scalar


def _remove_optimizer_parameters(kwds):
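Background, not part of the diff: `scipy.optimize.root_scalar` finds a root of a scalar function, and when passed a `bracket` on which the function changes sign it uses a bracketing method (Brent's by default), returning an object with `root` and `converged` attributes. A minimal sketch:

from scipy.optimize import root_scalar

# x**2 - 4 changes sign on [1, 3], so the bracket is valid.
res = root_scalar(lambda x: x**2 - 4, bracket=[1, 3])
print(res.root, res.converged)  # approximately 2.0, True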
@@ -6240,20 +6241,86 @@ def _entropy(self, c):
    def fit(self, data, *args, **kwds):
        parameters = _check_fit_input_parameters(self, data, args, kwds)
        data, fshape, floc, fscale = parameters
-       if floc is None:
-           return super().fit(data, **kwds)
-       if np.any(data - floc < (fscale if fscale else 0)):

        # ensure that any fixed parameters don't violate constraints of the
        # distribution before continuing.
        if floc is not None and np.min(data) - floc < (fscale or 0):
            raise FitDataError("pareto", lower=1, upper=np.inf)
-       data = data - floc

        ndata = data.shape[0]

        def get_shape(scale, location):
            # The first-order necessary condition on `shape` can be solved
            # in closed form.
            return ndata / np.sum(np.log((data - location) / scale))

        if floc is fscale is None:
            # The support of the distribution is `(x - loc)/scale > 0`.
            # The method of Lagrange multipliers turns this constraint
            # into an equation that can be solved numerically.
            # See gh-12545 for details.

            def dL_dScale(shape, scale):
                # The partial derivative of the log-likelihood function
                # w.r.t. the scale.
                return ndata * shape / scale

            def dL_dLocation(shape, location):
                # The partial derivative of the log-likelihood function
                # w.r.t. the location.
                return (shape + 1) * np.sum(1 / (data - location))

            def fun_to_solve(scale):
                # Optimize the scale by setting the partial derivatives
                # w.r.t. location and scale equal and solving.
                location = np.min(data) - scale
                shape = fshape or get_shape(scale, location)
                return dL_dLocation(shape, location) - dL_dScale(shape, scale)

            def interval_contains_root(lbrack, rbrack):
                # Return True if the signs disagree.
                return (np.sign(fun_to_solve(lbrack)) !=
                        np.sign(fun_to_solve(rbrack)))

            # Set brackets for `root_scalar` to use when optimizing over the
            # scale such that a root is likely between them. Use the
            # user-supplied guess or default to 1.
            brack_start = kwds.get('scale', 1)
            lbrack, rbrack = brack_start / 2, brack_start * 2
            # If a root is not between the brackets, iteratively expand them
            # until they include a sign change, checking after each bracket
            # is modified.
            while (not interval_contains_root(lbrack, rbrack)
                   and (lbrack > 0 or rbrack < np.inf)):
                lbrack /= 2
                rbrack *= 2
            res = root_scalar(fun_to_solve, bracket=[lbrack, rbrack])
            if res.converged:
                scale = res.root
                loc = np.min(data) - scale
                shape = fshape or get_shape(scale, loc)

                # The Pareto distribution requires that its parameters
                # satisfy the condition `scale + loc <= min(data)`. However,
                # to avoid numerical issues, we require that `scale + loc`
                # is strictly less than `min(data)`. If this condition is
                # not satisfied, reduce the scale with `np.nextafter` to
                # ensure that the data do not fall outside of the support.
                if not (scale + loc) < np.min(data):
                    scale = np.min(data) - loc
                    scale = np.nextafter(scale, 0)
                return shape, loc, scale
            else:
                return super().fit(data, **kwds)
        elif floc is None:
            loc = np.min(data) - fscale
        else:
            loc = floc

        # Source: Evans, Hastings, and Peacock (2000), Statistical
        # Distributions, 3rd. Ed., John Wiley and Sons. Page 149.

-       if fscale is None:
-           fscale = np.min(data)
-       if fshape is None:
-           fshape = 1/((1/len(data)) * np.sum(np.log(data/fscale)))
-       return fshape, floc, fscale

        scale = fscale or np.min(data) - loc
        shape = fshape or get_shape(scale, loc)
        return shape, loc, scale


pareto = pareto_gen(a=1.0, name="pareto")
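
For reference (an explanatory note, not part of the commit), the quantities computed by `get_shape`, `dL_dScale`, and `dL_dLocation` follow from the Pareto log-likelihood. With shape $b$, location $\mu$, and scale $\sigma$, each observation $x_i > \mu + \sigma$ has density $b\,\sigma^b (x_i - \mu)^{-(b+1)}$, so for $n$ observations

$$\ell(b, \mu, \sigma) = n \log b + n b \log \sigma - (b + 1) \sum_{i=1}^{n} \log(x_i - \mu), \qquad \mu + \sigma \le \min_i x_i.$$

Setting $\partial \ell / \partial b = 0$ gives the closed form in `get_shape`,

$$\hat{b} = \frac{n}{\sum_{i=1}^{n} \log\bigl((x_i - \mu)/\sigma\bigr)},$$

and because $\partial \ell / \partial \sigma = n b / \sigma$ and $\partial \ell / \partial \mu = (b + 1) \sum_i (x_i - \mu)^{-1}$ are both positive, the optimum lies on the boundary $\mu + \sigma = \min_i x_i$. There the Lagrange conditions equate the two partial derivatives, and substituting $\mu = \min_i x_i - \sigma$ leaves exactly the scalar equation in $\sigma$ that `fun_to_solve` hands to `root_scalar`.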
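A minimal usage sketch of the new code path (illustrative values, not part of the commit; the final assertion reflects the strict-inequality guard and should hold whenever the root solve converges). Before this commit, any call without `floc` fixed fell back to `super().fit(data, **kwds)`, as the removed lines above show.

import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
data = stats.pareto.rvs(b=2, loc=3, scale=5, size=100, random_state=rng)

# With no parameters fixed, fit() now solves the Lagrange condition
# numerically with root_scalar instead of deferring to the generic
# super().fit() optimizer.
shape, loc, scale = stats.pareto.fit(data)

# Every observation stays strictly inside the fitted support.
assert loc + scale < data.min()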
71 changes: 48 additions & 23 deletions scipy/stats/tests/test_distributions.py
@@ -1443,14 +1443,18 @@ def test_sf(self):
        expected = (scale/x)**b  # 2.25e-18
        assert_allclose(p, expected)
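
(Context: in scipy's parametrization the Pareto survival function is $S(x) = ((x - \mathrm{loc})/\mathrm{scale})^{-b}$ on its support; the `expected` value above is this expression with loc = 0.)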

    @pytest.fixture(scope='function')
    def rng(self):
        return np.random.default_rng(1234)

    @pytest.mark.filterwarnings("ignore:invalid value encountered in "
                                "double_scalars")
    @pytest.mark.parametrize("rvs_shape", [1, 2])
    @pytest.mark.parametrize("rvs_loc", [0, 2])
    @pytest.mark.parametrize("rvs_scale", [1, 5])
-   def test_fit(self, rvs_shape, rvs_loc, rvs_scale):
    def test_fit(self, rvs_shape, rvs_loc, rvs_scale, rng):
        data = stats.pareto.rvs(size=100, b=rvs_shape, scale=rvs_scale,
-                               loc=rvs_loc)
                                loc=rvs_loc, random_state=rng)

        # shape can still be fixed with multiple names
        shape_mle_analytical1 = stats.pareto.fit(data, floc=0, f0=1.04)[0]
@@ -1461,41 +1465,53 @@ def test_fit(self, rvs_shape, rvs_loc, rvs_scale):

        # data can be shifted with changes to `loc`
        data = stats.pareto.rvs(size=100, b=rvs_shape, scale=rvs_scale,
-                               loc=(rvs_loc + 2))
                                loc=(rvs_loc + 2), random_state=rng)
        shape_mle_a, loc_mle_a, scale_mle_a = stats.pareto.fit(data, floc=2)
        assert_equal(scale_mle_a + 2, data.min())
-       assert_equal(shape_mle_a, 1/((1/len(data - 2)) *
-                    np.sum(np.log((data
-                    - 2)/(data.min() - 2)))))

        data_shift = data - 2
        ndata = data_shift.shape[0]
        assert_equal(shape_mle_a,
                     ndata / np.sum(np.log(data_shift/data_shift.min())))
        assert_equal(loc_mle_a, 2)
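
An aside, not part of the diff: the "multiple names" checked above are scipy's aliases for fixing a shape parameter during `fit`. For pareto's shape `b`, the keywords `f0`, `fb`, and `fix_b` are interchangeable, so a sketch like the following should return the fixed value from all three calls:

import numpy as np
from scipy import stats

data = stats.pareto.rvs(b=1.5, scale=2, size=100,
                        random_state=np.random.default_rng(0))
s1 = stats.pareto.fit(data, floc=0, f0=1.04)[0]
s2 = stats.pareto.fit(data, floc=0, fb=1.04)[0]
s3 = stats.pareto.fit(data, floc=0, fix_b=1.04)[0]
assert s1 == s2 == s3 == 1.04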

    @pytest.mark.filterwarnings("ignore:invalid value encountered in "
                                "double_scalars")
-   @pytest.mark.parametrize("rvs_shape", [1, 2])
    @pytest.mark.parametrize("rvs_shape", [.1, 2])
    @pytest.mark.parametrize("rvs_loc", [0, 2])
    @pytest.mark.parametrize("rvs_scale", [1, 5])
-   def test_fit_MLE_comp_optimzer(self, rvs_shape, rvs_loc, rvs_scale):
    @pytest.mark.parametrize('fix_shape, fix_loc, fix_scale',
                             [p for p in product([True, False], repeat=3)
                              if False in p])
    def test_fit_MLE_comp_optimzer(self, rvs_shape, rvs_loc, rvs_scale,
                                   fix_shape, fix_loc, fix_scale, rng):
        data = stats.pareto.rvs(size=100, b=rvs_shape, scale=rvs_scale,
-                               loc=rvs_loc)
                                loc=rvs_loc, random_state=rng)
        args = [data, (stats.pareto._fitstart(data), )]
        func = stats.pareto._reduce_func(args, {})[1]

-       # fixed `floc` to actual location provides a better fit than the
-       # super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=rvs_loc)

-       # fixing `floc` to an arbitrary number, 0, still provides a better
-       # fit than the super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=0)
        kwds = {}
        if fix_shape:
            kwds['f0'] = rvs_shape
        if fix_loc:
            kwds['floc'] = rvs_loc
        if fix_scale:
            kwds['fscale'] = rvs_scale

-       # fixed shape still uses MLE formula and provides a better fit than
-       # the super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=0, f0=4)
        _assert_less_or_close_loglike(stats.pareto, data, func, **kwds)

-       # valid fixed fscale still uses MLE formulas and provides a better
-       # fit than the super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=0,
-                                     fscale=rvs_scale/2)
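
An explanatory aside: the `fix_shape, fix_loc, fix_scale` parametrization above enumerates every way to fix a subset of the three parameters while leaving at least one free, since fixing all three would leave nothing to fit. A quick check of what it generates:

from itertools import product

combos = [p for p in product([True, False], repeat=3) if False in p]
print(len(combos))  # 7: all combinations except (True, True, True)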
    @pytest.mark.filterwarnings("ignore:invalid value encountered in "
                                "double_scalars")
    def test_fit_known_bad_seed(self):
        # Tests a known seed and set of parameters that would produce a
        # result violating the support of Pareto if the fit method did not
        # check the constraint `scale + loc < min(data)`.
        shape, location, scale = 1, 0, 1
        data = stats.pareto.rvs(shape, location, scale, size=100,
                                random_state=np.random.default_rng(2535619))
        args = [data, (stats.pareto._fitstart(data), )]
        func = stats.pareto._reduce_func(args, {})[1]
        _assert_less_or_close_loglike(stats.pareto, data, func)
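
An illustrative sketch of the guard this test exercises (arbitrary values, not from the commit): `np.nextafter(x, 0)` returns the float adjacent to `x` in the direction of 0, so nudging the scale down keeps `loc + scale` strictly below the smallest observation.

import numpy as np

data_min, loc = 4.0, 1.0
scale = data_min - loc          # boundary value: loc + scale == min(data)
scale = np.nextafter(scale, 0)  # largest float strictly below data_min - loc
assert loc + scale < data_min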

    def test_fit_warnings(self):
        assert_fit_warnings(stats.pareto)
@@ -1505,6 +1521,15 @@ def test_fit_warnings(self):
        assert_raises(FitDataError, stats.pareto.fit, [5, 2, 3], floc=1,
                      fscale=3)

    def test_negative_data(self, rng):
        data = stats.pareto.rvs(loc=-130, b=1, size=100, random_state=rng)
        assert_array_less(data, 0)
        # The purpose of this test is to check that no runtime warnings are
        # raised when all of the data are negative, not to check the output
        # of the fit method. Other tests check the output but have to
        # silence warnings from the super method.
        _ = stats.pareto.fit(data)


class TestGenpareto:
    def test_ab(self):
