
Allow linalg.lstsq to use SVD to compute the result for rank-deficient matrices. #125110

Closed: wants to merge 237 commits

Changes from 8 commits

Commits (237)
7372645
Add logic for lstsq to be able to use the SVD driver as a backend for…
ZelboK Apr 27, 2024
99e7cfb
Formatting.
ZelboK Apr 28, 2024
e0fec86
run lintrunner -a
ZelboK Apr 28, 2024
bb20952
Update aten/src/ATen/native/BatchLinearAlgebra.cpp
ZelboK Apr 28, 2024
b6d6086
Address comments. Clean up use of zeros and utilize higher level func…
ZelboK Apr 28, 2024
755e7d9
Add test to ensure it will not throw an exception
ZelboK Apr 28, 2024
6e8b3fd
Formatting.
ZelboK Apr 28, 2024
c71e504
Add conditional to ensure only CUDA goes through SVD code path as fal…
ZelboK Apr 28, 2024
d5b0174
Merge branch 'main' into feat-improve-driver-linalg-lstq
ZelboK Apr 29, 2024
da81459
Add tests and fix logic accordingly so behavior is as expected.
ZelboK Apr 29, 2024
c856b9e
Set rank for svd workflow.
ZelboK Apr 29, 2024
de502bc
Update aten/src/ATen/native/BatchLinearAlgebra.cpp
ZelboK Apr 30, 2024
3006f30
Remove conditional around raw_residuals and unnecessary rank.fill
ZelboK May 1, 2024
428f02a
Merge branch 'main' into feat-improve-driver-linalg-lstq
ZelboK May 13, 2024
4eab1c3
Merge branch 'main' into feat-improve-driver-linalg-lstq
ZelboK May 14, 2024
fec9793
Clean up rank so that it uses Spinv and revert change in init.py
ZelboK May 14, 2024
489afbe
lint
ZelboK May 14, 2024
93d2573
[export] handle aliased/unused params for unflattening (#125758)
pianpwk May 14, 2024
4f024c8
Enable epilogue fusion benchmarking internally (#125455)
eellison May 13, 2024
a2e8b90
Fanatically correct real tensor cloning for propagate_real_tensors (#…
ezyang May 14, 2024
10b10f2
[reland][dynamo][disable] Move disable impl to its own __call__ metho…
anijain2305 May 14, 2024
f209865
[easy][dynamo] Use disable_dynamo for torch.manual_seed (#126192)
anijain2305 May 14, 2024
a95b7e9
Revert "[inductor][cpp] GEMM template (infra and fp32) (#124021)"
pytorchmergebot May 14, 2024
50b88b0
Revert "[CUDA] [CI] Add cu124 docker images (#125944)"
pytorchmergebot May 14, 2024
37f84cb
Remove use of USE_C10D (#126120)
briancoutinho May 15, 2024
00b9974
[torch/distributed] Bugfix: wait for all child procs to exit before c…
kiukchung May 15, 2024
1dfe2d1
Allow for trailing 'a' in sm_arch (#126185)
drisspg May 15, 2024
ed27236
[pipelining] Add manual pipeline stage (#126123)
H-Huang May 14, 2024
636ea1c
Refactor make_fx to better support hop subgraph tracing (#125267)
ydwu4 May 14, 2024
a745003
Support trace_subgraph in _MakefxTracer (#125363)
ydwu4 May 14, 2024
976f0f2
[Dynamo] Supports torch._C._is_any_autocast_enabled (#126196)
yanboliang May 15, 2024
b3f0fce
Set dtype when copying empty tensor (#126124)
huydhn May 15, 2024
aa17484
[BE] Abstract out strings to top of file (#125640)
c-p-i-o May 14, 2024
685b207
[Inductor] Flex attention supports dynamic shape (#125994)
yanboliang May 15, 2024
b959b4f
Add missing type uint16, uint32, and uint64 to TensorHash in LTC. (#1…
vanbasten23 May 15, 2024
86d560a
Add some type annotations to python stream and event classes (#126171)
cyyever May 15, 2024
074173b
Support third-party devices emit a range for each autograd operator (…
1274085042 May 15, 2024
d0688dd
[Inductor] Skip test_nll_loss_backward for intel GPU. (#126157)
etaf May 14, 2024
a749763
use statically known instead of suppress guard for ddp stride propaga…
eellison May 14, 2024
11aea9e
Update CUDA out of memory mesage with private pool info (#124673)
isuruf May 15, 2024
32fdb75
Adjust number of repeats when using --warm-start-latency benchmark fl…
masnesral May 10, 2024
38e2661
[benchmarking] Suppress csv creation on cold-start phase of --warm-st…
masnesral May 10, 2024
bc9f57b
Add a few "warm start" smoketest runs to CI (#125955)
masnesral May 10, 2024
ce7a832
[audio hash update] update the pinned audio hash (#126248)
pytorchupdatebot May 15, 2024
3512895
Add force_disable_caches to the docs (#126184)
oulgen May 14, 2024
b743f89
[inductor][cpp] GEMM template (infra and fp32) (#124021)
jgong5 May 15, 2024
170380e
[CUDA] [CI] Add cu124 docker images (#125944)
nWEIdia May 15, 2024
f33cc7a
Don't assert about pending when we are peeking (#126239)
ezyang May 15, 2024
8dc8ae9
[AOTI][torchgen] Update NativeFunctionsGroup mapping (#125962)
desertfire May 11, 2024
5bc525c
[AOTI][torchgen] Add a few more fallback ops (#126013)
desertfire May 13, 2024
4e3dfb0
[Memory Snapshot] Add recordAnnotations to capture record_function an…
aaronenyeshi May 15, 2024
68c29aa
Enable UFMT on `test/test_fake_tensor.py`, `test/test_flop_counter.py…
shink May 15, 2024
cd60801
[Inductor] Generalize new introduced device-bias code. (#126261)
etaf May 15, 2024
a48463e
[export] Cover more cases to copy tensor conversions. (#125628)
zhxchen17 May 15, 2024
ff266cd
Revert "[Memory Snapshot] Add recordAnnotations to capture record_fun…
pytorchmergebot May 15, 2024
e49ccce
[CI] 3 procs non cuda (#125932)
clee2000 May 15, 2024
0df5ed0
Forward fix lint after #125747 (#126295)
clee2000 May 15, 2024
12f2960
Faster int8 quantized (#125704)
malfet May 15, 2024
9b24e7f
[DTensor] Turn on foreach implementation of optimizer for DTensor by …
wz337 May 15, 2024
22c50a3
[Dynamo] SizeVariable supports hasattr (#126222)
yanboliang May 15, 2024
35117bf
CMake: Improve check and report of Magma (#117858)
Flamefire May 15, 2024
3ca1ae4
[onnx.export] Avoid linear loop over symbol_dim_map (#123029)
gustavla May 15, 2024
39b2795
[easy] Remove aot_config from pre_compile returns, rename fw_metadata…
jamesjwu May 14, 2024
b3b9f72
Reland '[Inductor] GEMM shape padding improvements (#118522)' (#125773)
eellison May 14, 2024
0ce75f9
Skip padding cost of fusible/planable inputs (#125780)
eellison May 14, 2024
0f2db1c
Forward fix failures for torch.export switch to predispatch (#126081)
tugsbayasgalan May 15, 2024
6b733b2
Beef up error message for pending assert failure (#126212)
ezyang May 15, 2024
1480537
Enable UFMT format on test/test_utils.py (#125996)
hippocookie May 15, 2024
adf9cc7
Fix aarch64 debug build with GCC (#126290)
malfet May 15, 2024
147ba73
Fix public binding to actually traverse modules (#126103)
albanD May 15, 2024
9397380
[FSDP] Fixed docs for inter/intra node PG helpers (#126288)
awgu May 15, 2024
921a824
Revert "Fix aarch64 debug build with GCC (#126290)"
pytorchmergebot May 15, 2024
0658670
Parametrize test_dim_reduction (#126292)
ezyang May 15, 2024
b5e6220
[DCP] overwrites existing checkpoint by default (#125877)
LucasLLC May 15, 2024
a0a6bbc
Fix public api allowlist logical merge conflict (#126321)
albanD May 15, 2024
910f26f
2 rocm shards on trunk.yml (#125933)
clee2000 May 15, 2024
5b4dea2
[FSDP2] allow meta tensors during loading state dict and cpu offloadi…
weifengpy May 15, 2024
3a3f8a9
[dynamo] Detect monkeypatching on nn module forward method (#126203)
anijain2305 May 15, 2024
569ee1e
[onnx.export] Avoid unnecessary copy of debug_names (#123026)
gustavla May 15, 2024
6243a43
Warn SDPA users about dropout behavior (#126294)
jbschlosser May 15, 2024
9e2b899
Improve Storage copy_ size mismatch error message (#126280)
ezyang May 15, 2024
0d9def0
[CI] Add AMP models in inductor cpu smoketest for performance (#125830)
zxd1997066 May 15, 2024
eb5e9ed
Remove Caffe2 python code (#126035)
cyyever May 15, 2024
05eff35
Enable UFMT on `test/test_datapipe.py` (#124994)
shink May 15, 2024
75add2f
Remove expected failure in `test_eager_transforms.py` (#125883)
eqy May 15, 2024
079d3f5
[optim] Fix: wrong ASGD implementation (#125440)
david20571015 May 15, 2024
0e22566
Fix triton codegen main do_bench_gpu import error (#126213)
adelesun May 15, 2024
8cc3b81
[dynamo] graph break on const dict KeyError (#125882)
williamwen42 May 15, 2024
972f76f
[dynamo] graph break on issubclass call with non-const args (#125943)
williamwen42 May 15, 2024
2524635
[dynamo] fix https://github.com/pytorch/pytorch/issues/93624 (#125945)
williamwen42 May 15, 2024
d3d25a3
[dynamo][inline-inbuilt-nn-modules] Bug fix - Only unspecialized nn m…
anijain2305 May 15, 2024
592bc1f
[FSDP2] support fully_shard(model_on_meta, cpu_offload) (#126305)
weifengpy May 15, 2024
3fe0c6d
Add VariableTracker.debug_repr (#126299)
ezyang May 15, 2024
421b23d
Also remove compile_time_strobelight_meta frame when generating stack…
ezyang May 15, 2024
db73a01
Make propagate_real_tensor more safe (#126281)
ezyang May 15, 2024
fed9d93
Switched from parameter in can_cast to from_. (#126030)
tringwald May 16, 2024
8f9fa47
[easy][dynamo][inline-inbuilt-nn-modules] Change test to check for pa…
anijain2305 May 15, 2024
7a4c6b9
[Export] Allow ExportedProgram to take empty decomp table (#126142)
StellarrZ May 16, 2024
6944593
[optim] add fused_adagrad support for CPU device (#124905)
zhuhaozhe May 15, 2024
7754cc1
[Inductor][Flex-attention] Make num_head support dynamic (#126342)
yanboliang May 16, 2024
009b5b6
[dynamo][inline-inbuilt-nn-modules] Change test to not depend on id o…
anijain2305 May 15, 2024
ae2fdc8
[dynamo][inline-inbuilt-nn-modules] Add and update test_modules.py fo…
anijain2305 May 15, 2024
675c49f
[inductor] [FX graph cache] Ignore unbacked symints in guards express…
masnesral May 15, 2024
930e757
Revert "Switched from parameter in can_cast to from_. (#126030)"
pytorchmergebot May 16, 2024
88643f1
[inductor][cpp] epilogue support for gemm template (#126019)
jgong5 May 14, 2024
4417b4c
[TEST][Dynamo] fix test_deviceguard.py (#126240)
Aidyn-A May 16, 2024
f30d086
Revert "Remove deprecated _aminmax operator (#125995)"
pytorchmergebot May 16, 2024
b8c08b6
[dynamo][nn module guards] Use TENSOR_MATCH, and not ID_MATCH, for nu…
anijain2305 May 15, 2024
479f3f9
[DeviceMesh] Fix hash and eq not match (#123572)
wz337 May 16, 2024
e974908
[inductor][cpp] bf16/fp16 gemm template computed with fp32 w/o epilog…
jgong5 May 14, 2024
a4250cc
Initial implementation of AdaRound (#126153)
kwanghoon-meta May 16, 2024
195d01c
Revert "[optim] Fix: wrong ASGD implementation (#125440)"
pytorchmergebot May 16, 2024
4397921
Revert "Initial implementation of AdaRound (#126153)"
pytorchmergebot May 16, 2024
22db67f
Add Lowering for FlexAttention Backwards (#125515)
drisspg May 16, 2024
8dced59
[dynamo] Delete extra testing of cpp guard manager (#126343)
anijain2305 May 15, 2024
c73f90c
fix the device type for with_comms decorator (#125798)
wanchaol May 15, 2024
9f09eae
Add mode to MemoryDep to track atomic accumulates (#123223)
isuruf May 15, 2024
2ba6d37
[c10d] Add an option for NAN check on every collective (#125726)
shuqiangzhang May 15, 2024
8989a88
Generate runtime asserts when propagate real tensor is used (#126287)
ezyang May 15, 2024
1473472
[ez] fix exported diff mismatch (#126357)
izaitsevfb May 16, 2024
64fb6ed
[Add sliding window attention bias] (#126061)
lvaleriu May 16, 2024
7dab5f7
Fix lint failures coming from #126035 (#126378)
huydhn May 16, 2024
fb2c753
[1/N] Non-Tensor: Scalar Support: Enable aot compile to support aten …
EikanWang May 15, 2024
45d93f9
[Doc] Add deprecated autocast comments for doc (#126062)
guangyey May 15, 2024
75289f2
Revert "Fix lint failures coming from #126035 (#126378)"
pytorchmergebot May 16, 2024
dd2f8d1
Revert "Add Lowering for FlexAttention Backwards (#125515)"
pytorchmergebot May 16, 2024
adc0551
Fix lint failures coming from #126035 (#126378)
huydhn May 16, 2024
64efc14
[Traceable FSDP2] Add all_gather_into_tensor out variant (#126334)
yf225 May 16, 2024
0ddafc0
Fix broken link of scikit-learn (#120972)
yuanx749 May 16, 2024
8288174
[Reopen] Upgrade submodule oneDNN to v3.4.2 (#126137)
Xia-Weiwen May 16, 2024
5dd875a
[FSDP2] Supported `set_all_reduce_gradients=False` for HSDP (#126166)
awgu May 14, 2024
cebb5df
Fix aarch64 debug build with GCC (#126290)
malfet May 16, 2024
60fb3ef
Add distributed/_tensor/test_attention to ROCM_BLOCKLIST (#126336)
jithunnair-amd May 16, 2024
9df7bda
[ROCm] amax hipblaslt integration (#125921)
alugorey May 16, 2024
19dfbce
Add 2nd shard to ROCm trunk workflow for core distributed UTs (#121716)
jithunnair-amd May 16, 2024
7e7392b
[AOTI][torchgen] Support at::Generator via C shim (#126181)
desertfire May 15, 2024
27b7381
[AOTI] Refactor some fallback op util functions (#126182)
desertfire May 15, 2024
d27e21d
[AOTI] Support InplaceBernoulliFallback in the ABI-compatible codegen…
desertfire May 15, 2024
272b119
[AOTI][refactor] Add aoti_torch_item as a util function (#126352)
desertfire May 16, 2024
667af78
[BE][FSDP] Change the logging level to info (#126362)
fegin May 16, 2024
08e5a7e
[BE][FSDP] Remove unnecessary warnings (#126365)
fegin May 16, 2024
2a34465
[onnx.export] Cache SetGraphInputTypeReliable (#124912)
gustavla May 16, 2024
45a699a
Remove redundant serialization code (#126249)
jiashenC May 16, 2024
b24a9e3
[Dynamo] Support SET_UPDATE (#126243)
yanboliang May 16, 2024
a2e563d
xpu: implement xpu serialization (#125530)
dvrogozh May 16, 2024
4c93c7a
Don't install inplace_methods on MockHandler, not needed (#126398)
ezyang May 16, 2024
f1897d4
Make 'pytest test/inductor/test_memory_planning.py' work (#126397)
ezyang May 16, 2024
f4daf9e
Switched from parameter in can_cast to from_. (#126030)
tringwald May 16, 2024
74ad455
[Traceable FSDP2] Use DTensor.from_local() in _from_local_no_grad whe…
yf225 May 16, 2024
255ae5d
Fix strict default value in StateDictOptions (#125998)
shink May 16, 2024
1367209
Print export warning only once in capture_pre_autograd (#126403)
tarun292 May 16, 2024
80798a7
[compiled autograd] Fix LoggingTensor flaky test (#126144)
xmfan May 16, 2024
b2efbae
[inductor] Clear cache on ctx manager exit (#126146)
xmfan May 16, 2024
b29fd1f
[compiled autograd] clear compiled_autograd_verbose once test is done…
xmfan May 16, 2024
19e7924
add 3.12 inductor CI tests (#126218)
williamwen42 May 15, 2024
cd76785
Eliminate some C++11 checks (#126308)
r-barnes May 16, 2024
2b7ac1e
Add prefix option to CapabilityBasedPartitioner (#126382)
hongyang-zhao May 16, 2024
1948225
Import MKL via //third-party/mkl targets (#126371)
MatzeB May 16, 2024
3bbd7fa
[c10d] add pg_name and pg_desc to logger (#126409)
shuqiangzhang May 16, 2024
ac162de
Use object identity for deepcopy memo (#126126)
davidberard98 May 15, 2024
fa207b5
Revert "[inductor][cpp] bf16/fp16 gemm template computed with fp32 w/…
pytorchmergebot May 17, 2024
8f51cf7
Revert "[inductor][cpp] epilogue support for gemm template (#126019)"
pytorchmergebot May 17, 2024
2a6c92a
Revert "[inductor][cpp] GEMM template (infra and fp32) (#124021)"
pytorchmergebot May 17, 2024
7cea4a5
Add Lowering for FlexAttention Backwards (#125515)
drisspg May 17, 2024
814dbc7
Fix documentation for register_fake_class (#126422)
ydwu4 May 16, 2024
65f4d4f
[export] Delete predispatch tests (#126459)
angelayi May 17, 2024
b6d8201
[DeviceMesh] Supported N groups in `from_group` (#126258)
awgu May 16, 2024
b9da19d
[easy] Fix typing for `map_location` docs in torch.load (#125473)
mikaylagawarecki May 16, 2024
22b4b22
[doc] expose torch.Tensor.xpu API to doc (#126383)
guangyey May 16, 2024
6fc8524
Add symbolic_shape_specialization structured trace (#126450)
ezyang May 16, 2024
54ce306
Make inductor scheduler graph extension configurable (#125578)
AlexDenisov May 17, 2024
0f31e61
[FSDP2][Test] Fix _test_clip_grad_norm (#126457)
wz337 May 17, 2024
2cbbe21
dont pad 0 dim mm inputs (#126475)
eellison May 16, 2024
a05c0fa
c10d: add Collectives abstraction (#125978)
d4l3k May 17, 2024
ae7ee03
Add dist_pp shortcut to TORCH_LOGS (#126322)
wconstab May 17, 2024
c61bdbf
[dtensor] refactor view ops to use OpStrategy (#126011)
tianyu-l May 16, 2024
b1770bd
[XPU] call empty_cache for dynamo tests (#126377)
Stonepia May 17, 2024
c271827
Refactor partitioner and clean it up (#126318)
Chillee May 16, 2024
99190da
[DTensor] Turn on foreach implementation for clip_grad_norm_ for DTen…
wz337 May 17, 2024
8221d3d
Fix cummax and cummin lowering for empty case (#126461)
isuruf May 16, 2024
747bdea
[Quant][Inductor] Enable lowering of qlinear-binary(-unary) fusion fo…
Xia-Weiwen May 17, 2024
f55c0cc
variable search spaces for gemm autotuning (#126220)
nmacchioni May 17, 2024
ae3c9ca
save the reciprocal of weights for welford_reduce (#125148)
CaoE May 17, 2024
9882241
[Submodule] Remove zstd dependency (#126485)
cyyever May 17, 2024
7263893
Update ops handler documentation some more (#126480)
ezyang May 17, 2024
9a47caa
[FSDP2] Fixed 2D clip grad norm test (#126497)
awgu May 17, 2024
be7b65a
Default to env variable instead of config value for precompile parall…
eellison May 16, 2024
3f1ccfd
Delete refactored function, move changes over (#126407)
jamesjwu May 16, 2024
e1a0676
[optim] Fix: wrong ASGD implementation (#126375)
david20571015 May 17, 2024
0be8b0f
Early return in _recursive_build if obj is a Tensor (#125639)
guilhermeleobas May 16, 2024
bd10ff6
Remove removed ruff rule TRY200 (#126256)
ringohoffman May 17, 2024
e24f7b3
[Perf] Vectorize more dtype for int4mm (#126512)
malfet May 17, 2024
bb5e037
[inductor] fix unbacked case in pointwise + reduction vertical fusion…
ColinPeppler May 16, 2024
45a8ba4
Workflow for uploading additional test stats on workflow dispatch (#1…
clee2000 May 17, 2024
d0d2d0b
Allow tensor subclasses and add `torch.serialization.add_safe_globals…
mikaylagawarecki May 17, 2024
39f5adb
Enable FX graph cache for huggingface and timm benchmarks (#126205)
masnesral May 16, 2024
218756f
[quant][pt2e] Allow multi users without output observers (#126487)
andrewor14 May 17, 2024
45a3349
Add coms metadata to execution trace (ET) (#126317)
briancoutinho May 17, 2024
2f044a8
Revert "Remove redundant serialization code (#126249)"
pytorchmergebot May 17, 2024
02bf7e2
Revert "Fix aarch64 debug build with GCC (#126290)"
pytorchmergebot May 17, 2024
b2aff20
Initial implementation of AdaRound (#126153)
kwanghoon-meta May 17, 2024
782792b
[distributed] Add cpp-httplib to pytorch (#126470)
PaliC May 17, 2024
5182e2e
[BE][Ez]: Use NotADirectoryError in tensorboard writer (#126534)
Skylion007 May 17, 2024
c81bf77
Revert "[FSDP2] Fixed 2D clip grad norm test (#126497)"
pytorchmergebot May 17, 2024
6372770
[ROCm] enable faster_load_save for Fused_SGD (#125456)
petrex May 17, 2024
04c3751
Experimental prototype for converting torch.jit.trace modules to expo…
tugsbayasgalan May 16, 2024
a1245dd
Disable vulkan test batch_norm_invalid_inputs (#126571)
clee2000 May 17, 2024
68a6cdd
[AOTI] config target platform (#126306)
manuelcandales May 17, 2024
a6235d0
Fix issue of lowering nn.linear ops with kwargs (#126331)
yihanhemeta May 17, 2024
6e4ed6c
[inductor] Load python modules using importlib (#126454)
amjames May 16, 2024
edbd215
[dynamo] Sourceless builder - ordered dict and re.pattern (#126468)
anijain2305 May 17, 2024
6708519
Added error checks for invalid inputs on thnn_conv2d (#121906)
Martim03 May 17, 2024
38a85b2
Fix aarch64 debug build with GCC (#126290)
malfet May 17, 2024
8ab08f9
Remove dist_ prefix from TORCH_LOGS shortcuts (#126499)
wconstab May 17, 2024
bd786d8
Tool for scouting exportability in one shot (#126471)
SherlockNoMad May 18, 2024
4de26b7
[torch-distributed] Make log directory creation idempotent (#126496)
ktsiam May 18, 2024
fbf8018
[AOTI] Flag to include aoti sources when building lite interpreter (#…
manuelcandales May 18, 2024
492ef49
[Pipelining] Fix 1f1b schedule (#126419)
wconstab May 17, 2024
b6caa15
[C10D] Add __repr__ to P2POp class (#126538)
wconstab May 17, 2024
d288e44
gitmodules: switch cpp-httplib to https (#126580)
d4l3k May 18, 2024
68ff312
[pipelining] Follow improvements in export.unflatten (#126217)
kwen2501 May 14, 2024
743df86
[Submodule] Remove third-party CUB (#126540)
cyyever May 18, 2024
deb6f3f
[halide-backend] Refactor codegen/triton.py into codegen/simd.py (#12…
jansel May 17, 2024
8a7f719
Faster(?) FP16 gemv kernel (#126297)
swolchok May 15, 2024
b51e6dd
[2/N] Non-Tensor: Scalar Support: Add scalar to the cache for eager-t…
EikanWang May 17, 2024
23b6ebd
Map float8 types to uint8 for allgather (#126556)
drisspg May 18, 2024
b4a2288
[Traceable FSDP2] Change from register_multi_grad_hook to per-tensor …
yf225 May 18, 2024
b10f3dd
[Dynamo] Treat integers stored on nn.Modules as dynamic (#126466)
yanboliang May 18, 2024
d82bbb0
Refactor variables / function names related to non-strict export (#12…
jiashenC May 18, 2024
6ed6142
Updated test_torch.py to use new OptimizerInfo infrastructure (#125538)
gambiTarun May 18, 2024
0e59bd4
Forward fix the failed new test from D57474327 (#126596)
huydhn May 18, 2024
367a0c5
Cached required_fw_nodes creation (#126613)
Chillee May 18, 2024
0ac2cec
Revert "[Dynamo] Treat integers stored on nn.Modules as dynamic (#126…
pytorchmergebot May 19, 2024
197ebc5
Remove unnecessary implementations from MockHandler (#126511)
ezyang May 17, 2024
2d65795
UFMT torch.utils._sympy.functions (#126553)
ezyang May 19, 2024
0d1108c
Update hf_BirdBird periodic-dynamo-benchmarks results (#126414)
xmfan May 17, 2024
454d0d4
Replace torch.library.impl_abstract with torch.library.register_fake …
cyyever May 19, 2024
25 changes: 20 additions & 5 deletions aten/src/ATen/native/BatchLinearAlgebra.cpp
@@ -117,6 +117,8 @@
 #include <ATen/ops/triu.h>
 #include <ATen/ops/vdot.h>
 #include <ATen/ops/zeros.h>
+#include <ATen/ops/matmul.h>
+#include <ATen/ops/narrow.h>
 #endif
 
 // First the required LAPACK implementations are registered here.
@@ -1556,7 +1558,7 @@ void _linalg_check_errors(
         ": The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: ", info, ").");
   } else if (api_name.find("lstsq") != api_name.npos) {
     TORCH_CHECK_LINALG(false, api_name, batch_str,
-        ": The least squares solution could not be computed because the input matrix does not have full rank (error code: ", info, ").");
+        ": The least squares solution could not be computed because the input matrix does not have full rank (error code: ", info, "). Specify the 'gelss' driver to compute the solution via SVD for rank-deficient inputs.");
   } else if (api_name.find("lu_factor") != api_name.npos) {
     TORCH_CHECK(false, api_name, batch_str,
         ": U[", info, ",", info, "] is zero and using it on lu_solve would result in a division by zero. "
@@ -3427,8 +3429,21 @@ static void linalg_lstsq_out_info(
   auto input_working_copy = copyBatchedColumnMajor(input);
 
   // now the actual call that computes the result in-place (apply_lstsq)
-  lstsq_stub(input.device().type(), input_working_copy, solution, rank, singular_values, infos, rcond, driver);
-
+  if (driver == "gelss" && input.device() != at::kCPU) {
+    auto [U, S, Vh] = at::_linalg_svd(input, false, true, "gesvd");
+    auto S_pinv = S.reciprocal();
+    auto s1 = at::narrow(S, /*dim=*/-1, /*start=*/0, /*length=*/1); // singular values are sorted in descending order
+    S_pinv.masked_fill_(S < rcond * s1, 0);
+    auto uhOther = at::matmul(U.adjoint(), other);
+    if (S_pinv.dim() != uhOther.dim()) {
+      S_pinv = S_pinv.unsqueeze(-1);
+    }
+    auto S_pinv_other = S_pinv * uhOther;
+    solution = at::matmul(Vh.adjoint(), S_pinv_other);
+  }
+  else {
+    lstsq_stub(input.device().type(), input_working_copy, solution, rank, singular_values, infos, rcond, driver);
+  }
   // residuals are available only if m > n and drivers other than gelsy used
   if (m > n && driver != "gelsy") {
     // if the driver is gelss or gelsd then the residuals are available only if rank == n
@@ -3490,8 +3505,8 @@ static std::string get_default_lstsq_driver(c10::optional<c10::string_view> driv
     );
   } else { // else if (input.is_cuda())
     TORCH_CHECK(
-        driver_str == "gels",
-        "torch.linalg.lstsq: `driver` other than `gels` is not supported on CUDA"
+        (driver_str == "gelss" || driver_str == "gels"),
+        "torch.linalg.lstsq: `driver` other than `gels` or `gelss` is not supported on CUDA"
     );
   }
 } else {
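For orientation, here is a rough Python sketch of the computation the new `gelss` branch above performs: apply the SVD-based pseudoinverse of A to B, zeroing singular values below rcond times the largest one, which yields the minimum-norm least-squares solution even for rank-deficient A. The helper name and the sample system are illustrative only, not part of this PR.

    import torch

    def lstsq_via_svd(A, B, rcond):
        # Thin SVD: A = U @ diag(S) @ Vh
        U, S, Vh = torch.linalg.svd(A, full_matrices=False)
        # Reciprocal of singular values, zeroing those below rcond * sigma_max;
        # this is what makes the solve well-defined for rank-deficient A.
        s1 = S.narrow(-1, 0, 1)  # singular values are sorted in descending order
        S_pinv = S.reciprocal().masked_fill(S < rcond * s1, 0)
        # Minimum-norm solution: X = V @ diag(S_pinv) @ U^H @ B
        UhB = U.mH @ B
        if S_pinv.dim() != UhB.dim():
            S_pinv = S_pinv.unsqueeze(-1)
        return Vh.mH @ (S_pinv * UhB)

    A = torch.tensor([[1., 2.], [2., 4.], [3., 6.]], dtype=torch.float64)  # rank 1
    B = torch.tensor([[1.], [2.], [3.]], dtype=torch.float64)
    print(lstsq_via_svd(A, B, rcond=1e-10))  # approx. [[0.2], [0.4]]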
10 changes: 9 additions & 1 deletion test/test_linalg.py
@@ -445,7 +445,15 @@ def complement_device(device):
         b = torch.rand(2, 2, 2, dtype=dtype, device=device)
 
         if device != 'cpu':
-            with self.assertRaisesRegex(RuntimeError, '`driver` other than `gels` is not supported on CUDA'):
+            try:
+                result = torch.linalg.lstsq(a, b, driver='gelss')
+                self.assertIsNotNone(result)
+            except Exception as e:
+                self.fail(f"Unexpected error occurred: {e}")
+            with self.assertRaisesRegex(
+                    RuntimeError,
+                    'torch.linalg.lstsq: `driver` other than `gels` or `gelss` is not supported on CUDA'
+            ):
                 torch.linalg.lstsq(a, b, driver='fictitious_driver')
         # if on cpu
         else:
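A sketch of the behavior the updated test asserts, usable as a quick manual check on a CUDA build with this PR applied (without the patch, the `'gelss'` call itself raises):

    import torch

    if torch.cuda.is_available():
        a = torch.rand(2, 2, 2, device='cuda')
        b = torch.rand(2, 2, 2, device='cuda')
        # With this PR, 'gelss' is accepted on CUDA and routes through the SVD path.
        sol = torch.linalg.lstsq(a, b, driver='gelss')
        print(sol.solution.shape)  # torch.Size([2, 2, 2])
        # An unknown driver still fails fast with the updated message.
        try:
            torch.linalg.lstsq(a, b, driver='fictitious_driver')
        except RuntimeError as e:
            print(e)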
4 changes: 2 additions & 2 deletions torch/linalg/__init__.py
@@ -1032,7 +1032,7 @@
   - `'gelsd'` (tridiagonal reduction and SVD)
   - But if you run into memory issues: `'gelss'` (full SVD).
 
-For CUDA input, the only valid driver is `'gels'`, which assumes that :attr:`A` is full-rank.
+For CUDA inputs, two drivers are available: `'gels'`, which assumes that :attr:`A` is full-rank, and `'gelss'` (full SVD), which supports rank-deficient :attr:`A`.
 
 See also the `full description of these drivers`_
 
@@ -1080,7 +1080,7 @@
 
 Keyword args:
     driver (str, optional): name of the LAPACK/MAGMA method to be used.
-        If `None`, `'gelsy'` is used for CPU inputs and `'gels'` for CUDA inputs.
+        If `None`, `'gelsy'` is used for CPU inputs and `'gels'` for CUDA inputs; `'gelss'` may also be specified explicitly for CUDA inputs.
         Default: `None`.
 
 Returns:
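To make the documented defaults concrete, a small illustrative snippet (the `'gelss'` call runs on CPU today; on CUDA it requires this PR):

    import torch

    A = torch.randn(5, 3, dtype=torch.float64)
    B = torch.randn(5, 2, dtype=torch.float64)
    # driver=None resolves to 'gelsy' on CPU (and 'gels' on CUDA).
    X_default = torch.linalg.lstsq(A, B).solution
    # 'gelss' computes the solution via a full SVD and tolerates rank-deficient A.
    X_svd = torch.linalg.lstsq(A, B, driver='gelss').solution
    print(torch.allclose(X_default, X_svd))  # True for this full-rank A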