Commit f08e5e1

Author: Samuel Boïté

stable matrix sqrt using closed-form diff

1 parent: be211ac


ot/backend.py

Lines changed: 26 additions & 6 deletions
@@ -1938,6 +1938,7 @@ def __init__(self):
         self.rng_cuda_ = torch.Generator("cpu")
 
         from torch.autograd import Function
+        from torch.autograd.function import once_differentiable
 
         # define a function that takes inputs val and grads
         # and returns a val tensor with proper gradients
@@ -1951,8 +1952,32 @@ def forward(ctx, val, grads, *inputs):
             def backward(ctx, grad_output):
                 # the gradients are grad
                 return (None, None) + tuple(g * grad_output for g in ctx.grads)
+
+        # define a differentiable SPD matrix sqrt
+        # with closed-form VJP
+        class MatrixSqrtFunction(Function):
+            @staticmethod
+            def forward(ctx, a):
+                a_sym = .5 * (a + a.transpose(-2, -1))
+                L, V = torch.linalg.eigh(a_sym)
+                s = L.clamp_min(0).sqrt()
+                y = (V * s.unsqueeze(-2)) @ V.transpose(-2, -1)
+                ctx.save_for_backward(s, V)
+                return y
+
+            @staticmethod
+            @once_differentiable
+            def backward(ctx, g):
+                s, V = ctx.saved_tensors
+                g_sym = .5 * (g + g.transpose(-2, -1))
+                ghat = V.transpose(-2, -1) @ g_sym @ V
+                d = s.unsqueeze(-1) + s.unsqueeze(-2)
+                xhat = ghat / d
+                xhat = xhat.masked_fill(d == 0, 0)
+                return V @ xhat @ V.transpose(-2, -1)
 
         self.ValFunction = ValFunction
+        self.MatrixSqrtFunction = MatrixSqrtFunction
 
     def _to_numpy(self, a):
         if isinstance(a, float) or isinstance(a, int) or isinstance(a, np.ndarray):
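
Where the closed-form rule in `backward` comes from (notation follows the code above: `a` is the SPD input, `y` its square root, `V` and `s` the eigenvectors and clamped eigenvalue square roots). Differentiating $Y^2 = A$ at $Y = A^{1/2}$ gives the Sylvester equation

$$ \mathrm{d}Y \, Y + Y \, \mathrm{d}Y = \mathrm{d}A , $$

and conjugating by the eigenbasis $V$ decouples it entrywise:

$$ (V^\top \mathrm{d}Y \, V)_{ij} = \frac{(V^\top \mathrm{d}A \, V)_{ij}}{s_i + s_j} . $$

The map $\mathrm{d}A \mapsto \mathrm{d}Y$ is self-adjoint, so the VJP is the same solve applied to the symmetrized incoming gradient, which is exactly the `ghat`, `d`, `xhat` sequence in `backward`. Only the sums $s_i + s_j$ appear as denominators, never the differences $\lambda_i - \lambda_j$ that appear in the VJP of `eigh` itself, so the gradient stays finite under repeated eigenvalues; the `masked_fill` zeroes the entries with $s_i + s_j = 0$, which occur only on the null space of a PSD input.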
@@ -2395,12 +2420,7 @@ def pinv(self, a, hermitian=False):
         return torch.linalg.pinv(a, hermitian=hermitian)
 
     def sqrtm(self, a):
-        L, V = torch.linalg.eigh(a)
-        L = torch.sqrt(L)
-        # Q[...] = V[...] @ diag(L[...])
-        Q = torch.einsum("...jk,...k->...jk", V, L)
-        # R[...] = Q[...] @ V[...].T
-        return torch.einsum("...jk,...kl->...jl", Q, torch.transpose(V, -1, -2))
+        return self.MatrixSqrtFunction.apply(a)
 
     def eigh(self, a):
         return torch.linalg.eigh(a)
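
A quick way to see the stability gain, as a self-contained sketch: the snippet below assumes only PyTorch, copies `MatrixSqrtFunction` from the hunk above, and uses a hypothetical `naive_sqrtm` as a stand-in for the deleted eigh-based implementation. At an input with repeated eigenvalues, plain autograd through `torch.linalg.eigh` yields non-finite gradients (its VJP divides by pairwise eigenvalue differences), while the custom Function stays finite and passes a finite-difference check.

import torch
from torch.autograd import Function
from torch.autograd.function import once_differentiable


class MatrixSqrtFunction(Function):
    # copied from the commit: SPD matrix sqrt with a closed-form VJP
    @staticmethod
    def forward(ctx, a):
        a_sym = .5 * (a + a.transpose(-2, -1))
        L, V = torch.linalg.eigh(a_sym)
        s = L.clamp_min(0).sqrt()
        y = (V * s.unsqueeze(-2)) @ V.transpose(-2, -1)
        ctx.save_for_backward(s, V)
        return y

    @staticmethod
    @once_differentiable
    def backward(ctx, g):
        s, V = ctx.saved_tensors
        g_sym = .5 * (g + g.transpose(-2, -1))
        ghat = V.transpose(-2, -1) @ g_sym @ V
        d = s.unsqueeze(-1) + s.unsqueeze(-2)
        xhat = ghat / d
        xhat = xhat.masked_fill(d == 0, 0)
        return V @ xhat @ V.transpose(-2, -1)


def naive_sqrtm(a):
    # stand-in for the deleted implementation: plain autograd through eigh
    L, V = torch.linalg.eigh(a)
    return (V * L.sqrt().unsqueeze(-2)) @ V.transpose(-2, -1)


# the identity has a triply repeated eigenvalue, the worst case for
# eigh's VJP, which divides by pairwise eigenvalue differences
a = torch.eye(3, dtype=torch.float64, requires_grad=True)

g_naive, = torch.autograd.grad(naive_sqrtm(a).sum(), a)
print(torch.isfinite(g_naive).all())   # tensor(False)

g_stable, = torch.autograd.grad(MatrixSqrtFunction.apply(a).sum(), a)
print(torch.isfinite(g_stable).all())  # tensor(True)

# finite-difference check of the closed-form VJP on a generic SPD input
b = torch.randn(4, 4, dtype=torch.float64)
spd = (b @ b.transpose(-2, -1) + torch.eye(4, dtype=torch.float64)).requires_grad_()
print(torch.autograd.gradcheck(MatrixSqrtFunction.apply, (spd,)))  # True

The `once_differentiable` decorator also makes the intent explicit: attempting to differentiate through this backward raises an error instead of silently producing an incomplete second derivative.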
