Commit
Update Chebyshev samplers to match the original paper and code (#894)
smastelini committed Mar 28, 2022
1 parent 981e18b commit 06dcac3
Showing 1 changed file with 40 additions and 25 deletions.
river/imblearn/chebyshev.py (65 changes: 40 additions & 25 deletions)
@@ -18,13 +18,17 @@ class ChebyshevUnderSampler(base.Wrapper, base.Regressor):
for an observation $y$ becomes: $P(|y - \\overline{y}| \\geq t\\sigma) \\leq \\dfrac{\\sigma^2}{|y-\\overline{y}|^2}$.
The reciprocal of this probability is used for under-sampling[^1] the most frequent cases. Extreme
valued or rare cases have higher probabilities of selection, whereas the most frequent cases are
-likely to be discarded.
+likely to be discarded. Still, frequent cases have a small chance of being selected (controlled via
+the `sp` parameter) in case few rare instances were observed.

Parameters
----------
regressor
    The regression model that will receive the biased sample.
+sp
+    Second chance probability. Even if an example is not initially selected for training, it still
+    has a small chance of being selected in case the number of rare cases observed so far is small.
seed
    Random seed to support reproducibility.
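For intuition, the selection rule this commit introduces can be sketched as a standalone function. This is a minimal illustration, not code from the repository: the name `keep_for_training` and the toy numbers are made up, and `sp`, `rare_kept`, `freq_kept` mirror the commit's `sp`, `_rare_c`, `_freq_c`.

import random

# t = |y - mean| / sd measures how deep y sits in the tails. The new code
# keeps an observation with probability 1 - 1/t^2 when t > 1 (rare case);
# frequent cases (t <= 1) only get a "second chance" with probability sp,
# and only while kept rare cases still outnumber kept frequent ones.
def keep_for_training(y, mean, sd, sp, rare_kept, freq_kept, rng):
    t = abs(y - mean) / sd
    prob_threshold = 1 / (t * t) if t > 1 else 1
    p = rng.random()
    if p >= prob_threshold:
        return True  # treated as a rare case
    if freq_kept < rare_kept and p <= sp:
        return True  # second chance for a frequent case
    return False

rng = random.Random(42)
print(keep_for_training(9.0, mean=0.0, sd=2.0, sp=0.15, rare_kept=3, freq_kept=1, rng=rng))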
@@ -53,9 +57,9 @@ class ChebyshevUnderSampler(base.Wrapper, base.Regressor):
... metrics.MAE(),
... print_every=500
... )
-[500] MAE: 1.84619
-[1,000] MAE: 1.516441
-MAE: 1.515879
+[500] MAE: 1.633571
+[1,000] MAE: 1.460907
+MAE: 1.4604
References
----------
@@ -64,13 +68,17 @@ class ChebyshevUnderSampler(base.Wrapper, base.Regressor):
"""

def __init__(self, regressor: base.Regressor, seed: int = None):
def __init__(self, regressor: base.Regressor, sp: float = 0.15, seed: int = None):
self.regressor = regressor
self.sp = sp
self.seed = seed

self._var = stats.Var()
self._rng = random.Random(self.seed)

self._freq_c = 0
self._rare_c = 0

    @property
    def _wrapped_model(self):
        return self.regressor
@@ -79,21 +87,27 @@ def predict_one(self, x):
        return self.regressor.predict_one(x)

    def learn_one(self, x, y, **kwargs):
-        var = self._var.get()
-        sd = var**0.5
+        self._var.update(y)
+        sd = self._var.get() ** 0.5

        if sd > 0:
            mean = self._var.mean.get()
-            dev = abs(y - mean)
+            dev = abs(y - mean)  # noqa
            t = dev / sd
-            if t > 1:
-                prob_train = 1 - (var / (dev**2))
-                p = self._rng.random()
-
-                if p < prob_train:
-                    self.regressor.learn_one(x, y, **kwargs)
+            # Small values for rare cases and 1 for frequent cases
+            prob_threshold = 1 / (t * t) if t > 1 else 1
+            p = self._rng.random()
+
+            if p >= prob_threshold:
+                self.regressor.learn_one(x, y, **kwargs)
+                self._rare_c += 1
+            elif self._freq_c < self._rare_c and p <= self.sp:
+                self.regressor.learn_one(x, y, **kwargs)
+                self._freq_c += 1
+        else:
+            self.regressor.learn_one(x, y, **kwargs)

-        self._var.update(y)
        return self
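A hedged usage sketch of the updated wrapper follows. It assumes the river API of this era (pipelines built with `|`, `learn_one`/`predict_one`); the feature values and hyperparameters are arbitrary, not taken from the commit.

from river import imblearn, linear_model, preprocessing

# Wrap any river regressor; `sp` is the second-chance probability
# introduced by this commit (default 0.15).
model = (
    preprocessing.StandardScaler()
    | imblearn.ChebyshevUnderSampler(
        regressor=linear_model.LinearRegression(intercept_lr=0.01),
        sp=0.15,
        seed=42,
    )
)

model.learn_one({"x": 1.0}, 5.0)  # the sampler decides whether the regressor trains
print(model.predict_one({"x": 1.0}))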

@classmethod
Expand All @@ -116,8 +130,8 @@ class ChebyshevOverSampler(base.Wrapper, base.Regressor):
Alternatively, one can use $t$ directly to estimate a frequency weight $\\kappa = \\lceil t\\rceil$
and define an over-sampling strategy for extreme and rare target values[^1]. Each incoming instance is
-used $\\kappa$ times to update the underlying regressor, in case $t > 1$. Otherwise, the instance is
-ignored by the wrapped regression model.
+used $\\kappa$ times to update the underlying regressor. Frequent target values contribute only once
+to the underlying regressor, whereas rare cases are used multiple times for training.
Parameters
@@ -149,9 +163,9 @@ class ChebyshevOverSampler(base.Wrapper, base.Regressor):
... metrics.MAE(),
... print_every=500
... )
-[500] MAE: 2.131883
-[1,000] MAE: 1.496747
-MAE: 1.496013
+[500] MAE: 1.152726
+[1,000] MAE: 0.954873
+MAE: 0.954049
References
----------
@@ -173,21 +187,22 @@ def predict_one(self, x):
        return self.regressor.predict_one(x)

    def learn_one(self, x, y, **kwargs):
+        self._var.update(y)
        var = self._var.get()
        sd = var**0.5

        if sd > 0:
            mean = self._var.mean.get()
-            dev = abs(y - mean)
+            dev = abs(y - mean)  # noqa
            t = dev / sd

-            if t > 1:
-                kappa = int(math.ceil(t))
+            kappa = int(math.ceil(t))

-                for k in range(kappa):
-                    self.regressor.learn_one(x, y, **kwargs)
+            for k in range(kappa):
+                self.regressor.learn_one(x, y, **kwargs)
+        else:
+            self.regressor.learn_one(x, y, **kwargs)

-        self._var.update(y)
        return self
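The over-sampler's rule is simpler. Here is a small sketch of the replication factor described in the docstring above; the helper name is illustrative, not part of the commit.

import math

# kappa = ceil(t): observations with 0 < t <= 1 update the regressor once,
# rare observations (t > 1) are replayed ceil(t) times, and an exact-mean
# observation (t = 0) yields zero updates.
def replication_factor(y, mean, sd):
    t = abs(y - mean) / sd
    return int(math.ceil(t))

print(replication_factor(0.5, mean=0.0, sd=2.0))  # t = 0.25 -> 1 update
print(replication_factor(9.0, mean=0.0, sd=2.0))  # t = 4.5  -> 5 updates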

    @classmethod
