ENH: stats.pareto fit improvement for parameter combinations (#15567)
Co-authored-by: Pamphile Roy <roy.pamphile@gmail.com>
swallan and tupui committed Mar 14, 2022
1 parent 7de016a commit 0ed66c7
Showing 2 changed files with 125 additions and 33 deletions.
87 changes: 77 additions & 10 deletions scipy/stats/_continuous_distns.py
@@ -30,6 +30,7 @@
from ._constants import (_XMIN, _EULER, _ZETA3,
                         _SQRT_2_OVER_PI, _LOG_SQRT_2_OVER_PI)
import scipy.stats._boost as _boost
from scipy.optimize import root_scalar


def _remove_optimizer_parameters(kwds):
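Background, not part of the diff: `scipy.optimize.root_scalar` finds a root of a scalar function, and when passed a `bracket` on which the function changes sign it uses a bracketing method (Brent's by default), returning an object with `root` and `converged` attributes. A minimal sketch:

from scipy.optimize import root_scalar

# x**2 - 4 changes sign on [1, 3], so the bracket is valid.
res = root_scalar(lambda x: x**2 - 4, bracket=[1, 3])
print(res.root, res.converged)  # approximately 2.0, True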
@@ -6240,20 +6241,86 @@ def _entropy(self, c):
    def fit(self, data, *args, **kwds):
        parameters = _check_fit_input_parameters(self, data, args, kwds)
        data, fshape, floc, fscale = parameters
-       if floc is None:
-           return super().fit(data, **kwds)
-       if np.any(data - floc < (fscale if fscale else 0)):

        # ensure that any fixed parameters don't violate constraints of the
        # distribution before continuing.
        if floc is not None and np.min(data) - floc < (fscale or 0):
            raise FitDataError("pareto", lower=1, upper=np.inf)
-       data = data - floc

        ndata = data.shape[0]

        def get_shape(scale, location):
            # The first-order necessary condition on `shape` can be solved
            # in closed form.
            return ndata / np.sum(np.log((data - location) / scale))

        if floc is fscale is None:
            # The support of the distribution is `(x - loc)/scale > 0`.
            # The method of Lagrange multipliers turns this constraint
            # into an equation that can be solved numerically.
            # See gh-12545 for details.

            def dL_dScale(shape, scale):
                # The partial derivative of the log-likelihood function
                # w.r.t. the scale.
                return ndata * shape / scale

            def dL_dLocation(shape, location):
                # The partial derivative of the log-likelihood function
                # w.r.t. the location.
                return (shape + 1) * np.sum(1 / (data - location))

            def fun_to_solve(scale):
                # Optimize the scale by setting the partial derivatives
                # w.r.t. location and scale equal and solving.
                location = np.min(data) - scale
                shape = fshape or get_shape(scale, location)
                return dL_dLocation(shape, location) - dL_dScale(shape, scale)

            def interval_contains_root(lbrack, rbrack):
                # Return True if the signs disagree.
                return (np.sign(fun_to_solve(lbrack)) !=
                        np.sign(fun_to_solve(rbrack)))

            # Set brackets for `root_scalar` to use when optimizing over the
            # scale such that a root is likely between them. Use the
            # user-supplied guess or default to 1.
            brack_start = kwds.get('scale', 1)
            lbrack, rbrack = brack_start / 2, brack_start * 2
            # If a root is not between the brackets, iteratively expand them
            # until they include a sign change, checking after each bracket
            # is modified.
            while (not interval_contains_root(lbrack, rbrack)
                   and (lbrack > 0 or rbrack < np.inf)):
                lbrack /= 2
                rbrack *= 2
            res = root_scalar(fun_to_solve, bracket=[lbrack, rbrack])
            if res.converged:
                scale = res.root
                loc = np.min(data) - scale
                shape = fshape or get_shape(scale, loc)

                # The Pareto distribution requires that its parameters
                # satisfy the condition `scale + loc <= min(data)`. However,
                # to avoid numerical issues, we require that `scale + loc`
                # is strictly less than `min(data)`. If this condition is
                # not satisfied, reduce the scale with `np.nextafter` to
                # ensure that the data do not fall outside of the support.
                if not (scale + loc) < np.min(data):
                    scale = np.min(data) - loc
                    scale = np.nextafter(scale, 0)
                return shape, loc, scale
            else:
                return super().fit(data, **kwds)
        elif floc is None:
            loc = np.min(data) - fscale
        else:
            loc = floc

        # Source: Evans, Hastings, and Peacock (2000), Statistical
        # Distributions, 3rd. Ed., John Wiley and Sons. Page 149.

-       if fscale is None:
-           fscale = np.min(data)
-       if fshape is None:
-           fshape = 1/((1/len(data)) * np.sum(np.log(data/fscale)))
-       return fshape, floc, fscale

        scale = fscale or np.min(data) - loc
        shape = fshape or get_shape(scale, loc)
        return shape, loc, scale


pareto = pareto_gen(a=1.0, name="pareto")
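
For reference (an explanatory note, not part of the commit), the quantities computed by `get_shape`, `dL_dScale`, and `dL_dLocation` follow from the Pareto log-likelihood. With shape $b$, location $\mu$, and scale $\sigma$, each observation $x_i > \mu + \sigma$ has density $b\,\sigma^b (x_i - \mu)^{-(b+1)}$, so for $n$ observations

$$\ell(b, \mu, \sigma) = n \log b + n b \log \sigma - (b + 1) \sum_{i=1}^{n} \log(x_i - \mu), \qquad \mu + \sigma \le \min_i x_i.$$

Setting $\partial \ell / \partial b = 0$ gives the closed form in `get_shape`,

$$\hat{b} = \frac{n}{\sum_{i=1}^{n} \log\bigl((x_i - \mu)/\sigma\bigr)},$$

and because $\partial \ell / \partial \sigma = n b / \sigma$ and $\partial \ell / \partial \mu = (b + 1) \sum_i (x_i - \mu)^{-1}$ are both positive, the optimum lies on the boundary $\mu + \sigma = \min_i x_i$. There the Lagrange conditions equate the two partial derivatives, and substituting $\mu = \min_i x_i - \sigma$ leaves exactly the scalar equation in $\sigma$ that `fun_to_solve` hands to `root_scalar`.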
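A minimal usage sketch of the new code path (illustrative values, not part of the commit; the final assertion reflects the strict-inequality guard and should hold whenever the root solve converges). Before this commit, any call without `floc` fixed fell back to `super().fit(data, **kwds)`, as the removed lines above show.

import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
data = stats.pareto.rvs(b=2, loc=3, scale=5, size=100, random_state=rng)

# With no parameters fixed, fit() now solves the Lagrange condition
# numerically with root_scalar instead of deferring to the generic
# super().fit() optimizer.
shape, loc, scale = stats.pareto.fit(data)

# Every observation stays strictly inside the fitted support.
assert loc + scale < data.min()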
71 changes: 48 additions & 23 deletions scipy/stats/tests/test_distributions.py
@@ -1443,14 +1443,18 @@ def test_sf(self):
        expected = (scale/x)**b  # 2.25e-18
        assert_allclose(p, expected)
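
(Context: in scipy's parametrization the Pareto survival function is $S(x) = ((x - \mathrm{loc})/\mathrm{scale})^{-b}$ on its support; the `expected` value above is this expression with loc = 0.)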

    @pytest.fixture(scope='function')
    def rng(self):
        return np.random.default_rng(1234)

    @pytest.mark.filterwarnings("ignore:invalid value encountered in "
                                "double_scalars")
    @pytest.mark.parametrize("rvs_shape", [1, 2])
    @pytest.mark.parametrize("rvs_loc", [0, 2])
    @pytest.mark.parametrize("rvs_scale", [1, 5])
-   def test_fit(self, rvs_shape, rvs_loc, rvs_scale):
    def test_fit(self, rvs_shape, rvs_loc, rvs_scale, rng):
        data = stats.pareto.rvs(size=100, b=rvs_shape, scale=rvs_scale,
-                               loc=rvs_loc)
                                loc=rvs_loc, random_state=rng)

        # shape can still be fixed with multiple names
        shape_mle_analytical1 = stats.pareto.fit(data, floc=0, f0=1.04)[0]
@@ -1461,41 +1465,53 @@ def test_fit(self, rvs_shape, rvs_loc, rvs_scale):

        # data can be shifted with changes to `loc`
        data = stats.pareto.rvs(size=100, b=rvs_shape, scale=rvs_scale,
-                               loc=(rvs_loc + 2))
                                loc=(rvs_loc + 2), random_state=rng)
        shape_mle_a, loc_mle_a, scale_mle_a = stats.pareto.fit(data, floc=2)
        assert_equal(scale_mle_a + 2, data.min())
-       assert_equal(shape_mle_a, 1/((1/len(data - 2)) *
-                    np.sum(np.log((data
-                    - 2)/(data.min() - 2)))))

        data_shift = data - 2
        ndata = data_shift.shape[0]
        assert_equal(shape_mle_a,
                     ndata / np.sum(np.log(data_shift/data_shift.min())))
        assert_equal(loc_mle_a, 2)
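
An aside, not part of the diff: the "multiple names" checked above are scipy's aliases for fixing a shape parameter during `fit`. For pareto's shape `b`, the keywords `f0`, `fb`, and `fix_b` are interchangeable, so a sketch like the following should return the fixed value from all three calls:

import numpy as np
from scipy import stats

data = stats.pareto.rvs(b=1.5, scale=2, size=100,
                        random_state=np.random.default_rng(0))
s1 = stats.pareto.fit(data, floc=0, f0=1.04)[0]
s2 = stats.pareto.fit(data, floc=0, fb=1.04)[0]
s3 = stats.pareto.fit(data, floc=0, fix_b=1.04)[0]
assert s1 == s2 == s3 == 1.04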

    @pytest.mark.filterwarnings("ignore:invalid value encountered in "
                                "double_scalars")
-   @pytest.mark.parametrize("rvs_shape", [1, 2])
    @pytest.mark.parametrize("rvs_shape", [.1, 2])
    @pytest.mark.parametrize("rvs_loc", [0, 2])
    @pytest.mark.parametrize("rvs_scale", [1, 5])
-   def test_fit_MLE_comp_optimzer(self, rvs_shape, rvs_loc, rvs_scale):
    @pytest.mark.parametrize('fix_shape, fix_loc, fix_scale',
                             [p for p in product([True, False], repeat=3)
                              if False in p])
    def test_fit_MLE_comp_optimzer(self, rvs_shape, rvs_loc, rvs_scale,
                                   fix_shape, fix_loc, fix_scale, rng):
        data = stats.pareto.rvs(size=100, b=rvs_shape, scale=rvs_scale,
-                               loc=rvs_loc)
                                loc=rvs_loc, random_state=rng)
        args = [data, (stats.pareto._fitstart(data), )]
        func = stats.pareto._reduce_func(args, {})[1]

-       # fixed `floc` to actual location provides a better fit than the
-       # super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=rvs_loc)

-       # fixing `floc` to an arbitrary number, 0, still provides a better
-       # fit than the super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=0)
        kwds = {}
        if fix_shape:
            kwds['f0'] = rvs_shape
        if fix_loc:
            kwds['floc'] = rvs_loc
        if fix_scale:
            kwds['fscale'] = rvs_scale

-       # fixed shape still uses MLE formula and provides a better fit than
-       # the super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=0, f0=4)
        _assert_less_or_close_loglike(stats.pareto, data, func, **kwds)

-       # valid fixed fscale still uses MLE formulas and provides a better
-       # fit than the super method
-       _assert_less_or_close_loglike(stats.pareto, data, func, floc=0,
-                                     fscale=rvs_scale/2)
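
An explanatory aside: the `fix_shape, fix_loc, fix_scale` parametrization above enumerates every way to fix a subset of the three parameters while leaving at least one free, since fixing all three would leave nothing to fit. A quick check of what it generates:

from itertools import product

combos = [p for p in product([True, False], repeat=3) if False in p]
print(len(combos))  # 7: all combinations except (True, True, True)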
    @pytest.mark.filterwarnings("ignore:invalid value encountered in "
                                "double_scalars")
    def test_fit_known_bad_seed(self):
        # Tests a known seed and set of parameters that would produce a
        # result violating the support of Pareto if the fit method did not
        # check the constraint `scale + loc < min(data)`.
        shape, location, scale = 1, 0, 1
        data = stats.pareto.rvs(shape, location, scale, size=100,
                                random_state=np.random.default_rng(2535619))
        args = [data, (stats.pareto._fitstart(data), )]
        func = stats.pareto._reduce_func(args, {})[1]
        _assert_less_or_close_loglike(stats.pareto, data, func)
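
An illustrative sketch of the guard this test exercises (arbitrary values, not from the commit): `np.nextafter(x, 0)` returns the float adjacent to `x` in the direction of 0, so nudging the scale down keeps `loc + scale` strictly below the smallest observation.

import numpy as np

data_min, loc = 4.0, 1.0
scale = data_min - loc          # boundary value: loc + scale == min(data)
scale = np.nextafter(scale, 0)  # largest float strictly below data_min - loc
assert loc + scale < data_min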

    def test_fit_warnings(self):
        assert_fit_warnings(stats.pareto)
@@ -1505,6 +1521,15 @@ def test_fit_warnings(self):
        assert_raises(FitDataError, stats.pareto.fit, [5, 2, 3], floc=1,
                      fscale=3)

    def test_negative_data(self, rng):
        data = stats.pareto.rvs(loc=-130, b=1, size=100, random_state=rng)
        assert_array_less(data, 0)
        # The purpose of this test is to check that no runtime warnings are
        # raised when all of the data are negative, not to check the output
        # of the fit method. Other tests check the output but have to
        # silence warnings from the super method.
        _ = stats.pareto.fit(data)


class TestGenpareto:
    def test_ab(self):
