ENH: stats.gaussian_kde: replace use of inv_cov in pdf (#16692)
* Remove more uses of inv() and replace with cho_solve()

* ENH: stats.gaussian_kde: try to use cholesky

* Still not working?

* ENH: stats.gaussian_kde: finish replacing use of inv_cov in pdf

* Apply suggestions from code review

* Apply suggestions from code review

[skip azp] [skip circleci]

* MAINT: stats.kde: move dtype assurance

* Update scipy/stats/_kde.py

Co-authored-by: Damon McDougall <damon.mcdougall@gmail.com>
mdhaber and dmcdougall committed Aug 26, 2022
1 parent 2dc7a02 commit 00315a5
Showing 3 changed files with 40 additions and 34 deletions.
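For context, the change rests on a standard identity: if the kernel covariance C has a Cholesky-style factorization C = L @ L.T, then the exponent (x - mu)^T C^-1 (x - mu) equals the squared norm of the triangular solve L^-1 (x - mu), so no explicit inverse is needed. A minimal sketch of that identity (illustrative only; C, L, and x are synthetic names, not part of the diff):

import numpy as np
from scipy import linalg

rng = np.random.default_rng(0)
A = rng.standard_normal((3, 3))
C = A @ A.T + 3 * np.eye(3)          # synthetic SPD "covariance"
x = rng.standard_normal(3)           # synthetic residual (x - mu)

# Old style: explicit inverse, as self.inv_cov used to provide.
quad_inv = x @ linalg.inv(C) @ x

# New style: Cholesky factor plus a triangular solve.
L = linalg.cholesky(C, lower=True)   # C = L @ L.T
z = linalg.solve_triangular(L, x, lower=True)
quad_chol = z @ z

print(np.allclose(quad_inv, quad_chol))  # True

Working from the factor avoids forming and multiplying by the inverse, which is generally the better-conditioned route when the covariance is close to singular.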
32 changes: 24 additions & 8 deletions scipy/stats/_kde.py
@@ -266,8 +266,11 @@ def evaluate(self, points):
             else:
                 raise TypeError('%s has unexpected item size %d' %
                                 (output_dtype, itemsize))
-        result = gaussian_kernel_estimate[spec](self.dataset.T, self.weights[:, None],
-                                                points.T, self.inv_cov, output_dtype)
+
+        result = gaussian_kernel_estimate[spec](
+            self.dataset.T, self.weights[:, None],
+            points.T, self.cho_cov, output_dtype)
+
         return result[:, 0]
 
     __call__ = evaluate
@@ -574,17 +577,30 @@ def _compute_covariance(self):
         covariance_factor().
         """
         self.factor = self.covariance_factor()
-        # Cache covariance and inverse covariance of the data
-        if not hasattr(self, '_data_inv_cov'):
+        # Cache covariance and permuted cholesky decomp of permuted covariance
+        if not hasattr(self, '_data_cho_cov'):
             self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                                bias=False,
                                                aweights=self.weights))
-            self._data_inv_cov = linalg.inv(self._data_covariance)
+            self._data_cho_cov = linalg.cholesky(
+                self._data_covariance[::-1, ::-1]).T[::-1, ::-1]
 
         self.covariance = self._data_covariance * self.factor**2
-        self.inv_cov = (self._data_inv_cov / self.factor**2).astype(np.float64)
-        L = linalg.cholesky(self.covariance*2*pi)
-        self.log_det = 2*np.log(np.diag(L)).sum()
+        self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
+        self.log_det = 2*np.log(np.diag(self.cho_cov
+                                        * np.sqrt(2*pi))).sum()
+
+    @property
+    def inv_cov(self):
+        # Re-compute from scratch each time because I'm not sure how this is
+        # used in the wild. (Perhaps users change the `dataset`, since it's
+        # not a private attribute?) `_compute_covariance` used to recalculate
+        # all these, so we'll recalculate everything now that this is a
+        # property.
+        self.factor = self.covariance_factor()
+        self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
+                                               bias=False, aweights=self.weights))
+        return linalg.inv(self._data_covariance) / self.factor**2
 
     def pdf(self, x):
         """
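The construction cached as _data_cho_cov above looks unusual; here is a small sketch (C and R are illustrative names, not from the diff) of what the "permuted cholesky decomp of permuted covariance" yields: an upper-triangular factor R with C = R @ R.T, whose scaled diagonal reproduces the log|2*pi*C| stored in self.log_det:

import numpy as np
from scipy import linalg

rng = np.random.default_rng(0)
A = rng.standard_normal((4, 4))
C = A @ A.T + 4 * np.eye(4)                        # synthetic SPD covariance

# Same construction as _compute_covariance above.
R = linalg.cholesky(C[::-1, ::-1]).T[::-1, ::-1]

print(np.allclose(R, np.triu(R)))                  # upper triangular: True
print(np.allclose(R @ R.T, C))                     # factors C: True

# Diagonal gives the log-determinant used for self.log_det.
log_det = 2 * np.log(np.diag(R * np.sqrt(2 * np.pi))).sum()
print(np.allclose(log_det, np.log(linalg.det(2 * np.pi * C))))  # True

Note that inv_cov remains a public attribute via the new property, recomputed from the dataset on each access, so existing user code that reads it keeps working.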
26 changes: 14 additions & 12 deletions scipy/stats/_stats.pyx
@@ -9,6 +9,7 @@ from numpy cimport ndarray, int64_t, float64_t, intp_t
 import warnings
 import numpy as np
 import scipy.stats, scipy.special
+from scipy.linalg import solve_triangular
 cimport scipy.special.cython_special as cs
 
 np.import_array()
@@ -706,10 +707,9 @@ ctypedef fused real:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def gaussian_kernel_estimate(points, values, xi, precision, dtype, real _=0):
+def gaussian_kernel_estimate(points, values, xi, cho_cov, dtype,
+                             real _=0):
     """
-    def gaussian_kernel_estimate(points, real[:, :] values, xi, precision)
-
     Evaluate a multivariate Gaussian kernel estimate.
 
     Parameters
@@ -720,16 +720,16 @@ def gaussian_kernel_estimate(points, values, xi, precision, dtype, real _=0):
         Multivariate values associated with the data points.
     xi : array_like with shape (m, d)
         Coordinates to evaluate the estimate at in d dimensions.
-    precision : array_like with shape (d, d)
-        Precision matrix for the Gaussian kernel.
+    cho_cov : array_like with shape (d, d)
+        Permuted Cholesky factor of permuted covariance of the data.
 
     Returns
     -------
     estimate : double[:, :] with shape (m, p)
         Multivariate Gaussian kernel estimate evaluated at the input coordinates.
     """
     cdef:
-        real[:, :] points_, xi_, values_, estimate, whitening
+        real[:, :] points_, xi_, values_, estimate
         int i, j, k
         int n, d, m, p
         real arg, residual, norm
@@ -741,19 +741,21 @@ def gaussian_kernel_estimate(points, values, xi, precision, dtype, real _=0):
 
     if xi.shape[1] != d:
         raise ValueError("points and xi must have same trailing dim")
-    if precision.shape[0] != d or precision.shape[1] != d:
-        raise ValueError("precision matrix must match data dims")
+    if cho_cov.shape[0] != d or cho_cov.shape[1] != d:
+        raise ValueError("Covariance matrix must match data dims")
 
     # Rescale the data
-    whitening = np.linalg.cholesky(precision).astype(dtype, copy=False)
-    points_ = np.dot(points, whitening).astype(dtype, copy=False)
-    xi_ = np.dot(xi, whitening).astype(dtype, copy=False)
+    cho_cov = cho_cov.astype(dtype, copy=False)
+    points_ = np.asarray(solve_triangular(cho_cov, points.T, lower=False).T,
+                         dtype=dtype)
+    xi_ = np.asarray(solve_triangular(cho_cov, xi.T, lower=False).T,
+                     dtype=dtype)
     values_ = values.astype(dtype, copy=False)
 
     # Evaluate the normalisation
     norm = math.pow((2 * PI) ,(- d / 2))
     for i in range(d):
-        norm *= whitening[i, i]
+        norm /= cho_cov[i, i]
 
     # Create the result array and evaluate the weighted sum
     estimate = np.zeros((m, p), dtype)
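To see that the new whitening matches the old one, here is a small sketch (C, R, and pts are illustrative, not from the diff): the old path multiplied the data by a Cholesky factor of the precision matrix, the new path triangular-solves against the upper factor R of C = R @ R.T, and both produce the same Mahalanobis distances, which is all the kernel sum uses:

import numpy as np
from scipy import linalg
from scipy.linalg import solve_triangular

rng = np.random.default_rng(0)
A = rng.standard_normal((3, 3))
C = A @ A.T + 3 * np.eye(3)                        # synthetic covariance
pts = rng.standard_normal((5, 3))                  # synthetic points

# Old path: whiten by multiplying with a Cholesky factor of the precision.
whitening = np.linalg.cholesky(linalg.inv(C))
old = np.dot(pts, whitening)

# New path: triangular solve against the upper factor of C = R @ R.T.
R = linalg.cholesky(C[::-1, ::-1]).T[::-1, ::-1]
new = solve_triangular(R, pts.T, lower=False).T

# Pairwise squared distances (the `arg` inside exp(-arg/2)) agree.
d_old = ((old[:, None, :] - old[None, :, :]) ** 2).sum(-1)
d_new = ((new[:, None, :] - new[None, :, :]) ** 2).sum(-1)
print(np.allclose(d_old, d_new))                   # True

The normalisation changes in the same spirit: instead of multiplying by the diagonal of the whitening matrix, the loop now divides by the diagonal of cho_cov, and both amount to the factor det(C)**-0.5.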
16 changes: 2 additions & 14 deletions scipy/stats/tests/test_kdeoth.py
@@ -218,16 +218,6 @@ def __init__(self, dataset):
         super().__init__(dataset)
 
 
-class _kde_subclass3(stats.gaussian_kde):
-    def __init__(self, dataset, covariance):
-        self.covariance = covariance
-        stats.gaussian_kde.__init__(self, dataset)
-
-    def _compute_covariance(self):
-        self.inv_cov = np.linalg.inv(self.covariance)
-        self._norm_factor = np.sqrt(np.linalg.det(2 * np.pi * self.covariance))
-
-
 class _kde_subclass4(stats.gaussian_kde):
     def covariance_factor(self):
         return 0.5 * self.silverman_factor()
@@ -251,10 +241,8 @@ def test_gaussian_kde_subclassing():
     y2 = kde2(xs)
     assert_array_almost_equal_nulp(ys, y2, nulp=10)
 
-    # subclass 3
-    kde3 = _kde_subclass3(x1, kde.covariance)
-    y3 = kde3(xs)
-    assert_array_almost_equal_nulp(ys, y3, nulp=10)
+    # subclass 3 was removed because we have no obligation to maintain support
+    # for user invocation of private methods
 
     # subclass 4
     kde4 = _kde_subclass4(x1)
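The dropped _kde_subclass3 overrode the private _compute_covariance and set inv_cov directly, which is exactly the internal detail this commit replaces. A sketch (the class name is made up) of the supported customization path, mirroring _kde_subclass4 above: override only the public covariance_factor hook.

import numpy as np
from scipy import stats

class HalfSilvermanKDE(stats.gaussian_kde):
    def covariance_factor(self):
        # Half the Silverman bandwidth factor; any positive scalar works.
        return 0.5 * self.silverman_factor()

rng = np.random.default_rng(0)
data = rng.standard_normal(200)
kde = HalfSilvermanKDE(data)
print(kde(np.linspace(-3, 3, 5)))    # evaluates the estimate at five points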
