From da77e25f285087664d85a2a92ee4dca88d63e44b Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 20 Nov 2025 10:48:23 +0100
Subject: [PATCH 01/28] Add CUDA Sparse Solvers

---
 xobjects/SparseSolvers/CUDA/cuDSSLU.py |  81 +++++++
 xobjects/SparseSolvers/CUDA/luLU.py    | 307 +++++++++++++++++++++++++
 2 files changed, 388 insertions(+)
 create mode 100644 xobjects/SparseSolvers/CUDA/cuDSSLU.py
 create mode 100644 xobjects/SparseSolvers/CUDA/luLU.py

diff --git a/xobjects/SparseSolvers/CUDA/cuDSSLU.py b/xobjects/SparseSolvers/CUDA/cuDSSLU.py
new file mode 100644
index 0000000..e15f2ae
--- /dev/null
+++ b/xobjects/SparseSolvers/CUDA/cuDSSLU.py
@@ -0,0 +1,81 @@
+import cupy as cp
+import cupyx.scipy.sparse as sp
+import nvmath
+
+class DirectSolverSuperLU(nvmath.sparse.advanced.DirectSolver):
+    """cuDSS-based direct solver; reuse factors for many RHS (SuperLU-style)."""
+
+    def __init__(self, A, *, n_batches: int = 0,assume_general=True, order_rhs='auto', **kwargs):
+        # 1) Validate A
+        if not isinstance(A, sp.csr_matrix):
+            raise TypeError("A must be cupyx.scipy.sparse.csr_matrix")
+        if A.shape[0] != A.shape[1]:
+            raise ValueError("A must be square")
+        if A.indices.dtype != cp.int32 or A.indptr.dtype != cp.int32:
+            A = sp.csr_matrix((A.data, A.indices.astype(cp.int32), A.indptr.astype(cp.int32)),
+                              shape=A.shape)
+        # 2) Sample RHS to initialize parent
+        if n_batches == 0:
+            b_sample = cp.zeros(A.shape[0], dtype=A.dtype)
+        else:
+            b_sample = cp.zeros((A.shape[0],n_batches), dtype=A.dtype, order = 'F')
+        self.b_dtype = A.dtype
+        super().__init__(A, b_sample, **kwargs)
+
+        # 3) Optional: configure plan
+        pc = self.plan_config
+        if assume_general:
+            # LU path (cuDSS will often infer this, but we can be explicit if available)
+            try:
+                pc.matrix_type = nvmath.sparse.advanced.DirectSolverMatrixType.GENERAL
+            except AttributeError:
+                pass  # older wheels infer type; safe to ignore
+        # Expose a simple RHS layout policy
+        self._order_rhs = order_rhs  # 'F'|'C'|'auto'
+
+        # 4) Build & factorize once; warmup solve to build internal caches
+        super().plan()
+        self.fac_info = super().factorize()
+        # Warmup on zeros (uses parent solve())
+        super().reset_operands(b=b_sample)
+        super().solve()
+
+    def _prepare_rhs(self, b):
+        # Accept 1-D or 2-D; ensure dtype/device and layout
+        if b.dtype != self.b_dtype:
+            raise TypeError(f"RHS dtype {b.dtype} does not match matrix dtype {self.b_dtype}")
+        if b.ndim == 1:
+            return b  # vector is fine
+        if self._order_rhs == 'auto':
+            # cuDSS prefers column-major for (n,k)
+            return cp.array(b, order='F', copy=False)
+        return cp.array(b, order=self._order_rhs, copy=False)
+
+    def solve(self, b):
+        """Solve A x = b using cached factors. Accepts (n,) or (n,k) RHS."""
+        b = self._prepare_rhs(b)
+        super().reset_operands(b=b)
+        return super().solve()
+
+    # Optional helpers for SuperLU-like API
+    def __call__(self, b):  # x = solver(b)
+        return self.solve(b)
+
+    def refactorize_values(self, A_new):
+        """Refactorize when numerical values change but sparsity pattern is the same."""
+        if not isinstance(A_new, sp.csr_matrix):
+            raise TypeError("A_new must be CSR")
+        if A_new.shape != self.operands.a_shape:
+            raise ValueError("Shape mismatch")
+        if A_new.indices.dtype != cp.int32 or A_new.indptr.dtype != cp.int32:
+            A_new = sp.csr_matrix((A_new.data, A_new.indices.astype(cp.int32),
+                                   A_new.indptr.astype(cp.int32)), shape=A_new.shape)
+        super().reset_operands(A=A_new)
+        self.fac_info = self.factorize()
+
+    def close(self):
+        """Explicitly free resources when not using a context manager."""
+        try:
+            self.__exit__(None, None, None)  # DirectSolver is context-manageable
+        except Exception:
+            pass
\ No newline at end of file
diff --git a/xobjects/SparseSolvers/CUDA/luLU.py b/xobjects/SparseSolvers/CUDA/luLU.py
new file mode 100644
index 0000000..6bd4c8a
--- /dev/null
+++ b/xobjects/SparseSolvers/CUDA/luLU.py
@@ -0,0 +1,307 @@
+import cupy as _cupy
+import numpy as _numpy
+import cupyx.scipy.sparse
+from cupyx.scipy.sparse.linalg import SuperLU
+from cupy_backends.cuda.api import driver as _driver
+from cupy_backends.cuda.api import runtime as _runtime
+from cupy_backends.cuda.libs import cusparse as _cusparse
+from cupy._core import _dtype
+from cupy.cuda import device as _device
+from cupy.cuda import stream as _stream
+from cupyx.cusparse import _dtype_to_IndexType, SpMatDescriptor, DnMatDescriptor, check_availability
+import scipy.sparse.linalg
+try:
+    import scipy.sparse.linalg
+    scipy_available = True
+except ImportError:
+    scipy_available = False
+from line_profiler import profile
+
+class CachedAbSolver:
+    
+    def __init__(self, a, b, alpha=1.0, lower=True, unit_diag=False, transa=False):
+        if not check_availability('spsm'):
+            raise RuntimeError('spsm is not available.')
+
+        # Canonicalise transa
+        if transa is False:
+            transa = 'N'
+        elif transa is True:
+            transa = 'T'
+        elif transa not in 'NTH':
+            raise ValueError(f'Unknown transa (actual: {transa})')
+
+        # Check A's type and sparse format
+        if cupyx.scipy.sparse.isspmatrix_csr(a):
+            pass
+        elif cupyx.scipy.sparse.isspmatrix_csc(a):
+            if transa == 'N':
+                a = a.T
+                transa = 'T'
+            elif transa == 'T':
+                a = a.T
+                transa = 'N'
+            elif transa == 'H':
+                a = a.conj().T
+                transa = 'N'
+            lower = not lower
+        elif cupyx.scipy.sparse.isspmatrix_coo(a):
+            pass
+        else:
+            raise ValueError('a must be CSR, CSC or COO sparse matrix')
+        assert a.has_canonical_format
+
+        # Check B's ndim
+        if b.ndim == 1:
+            is_b_vector = True
+            b = b.reshape(-1, 1)
+        elif b.ndim == 2:
+            is_b_vector = False
+        else:
+            raise ValueError('b.ndim must be 1 or 2')
+        self.is_b_vector = is_b_vector
+
+        # Check shapes
+        if not (a.shape[0] == a.shape[1] == b.shape[0]):
+            raise ValueError('mismatched shape')
+
+        # Check dtypes
+        dtype = a.dtype
+        if dtype.char not in 'fdFD':
+            raise TypeError('Invalid dtype (actual: {})'.format(dtype))
+        if dtype != b.dtype:
+            raise TypeError('dtype mismatch')
+        self.dtype = dtype
+
+        # Prepare fill mode
+        if lower is True:
+            fill_mode = _cusparse.CUSPARSE_FILL_MODE_LOWER
+        elif lower is False:
+            fill_mode = _cusparse.CUSPARSE_FILL_MODE_UPPER
+        else:
+            raise ValueError('Unknown lower (actual: {})'.format(lower))
+        self.fill_mode = fill_mode
+
+        # Prepare diag type
+        if unit_diag is False:
+            diag_type = _cusparse.CUSPARSE_DIAG_TYPE_NON_UNIT
+        elif unit_diag is True:
+            diag_type = _cusparse.CUSPARSE_DIAG_TYPE_UNIT
+        else:
+            raise ValueError('Unknown unit_diag (actual: {})'.format(unit_diag))
+        self.diag_type = diag_type
+        self.transa = transa
+        # Prepare op_a
+        if transa == 'N':
+            op_a = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
+        elif transa == 'T':
+            op_a = _cusparse.CUSPARSE_OPERATION_TRANSPOSE
+        else:  # transa == 'H'
+            if dtype.char in 'fd':
+                op_a = _cusparse.CUSPARSE_OPERATION_TRANSPOSE
+            else:
+                op_a = _cusparse.CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
+        self.op_a = op_a
+        # Prepare op_b
+        self.op_b = self._get_opb(b)
+        
+        # Allocate space for matrix C. Note that it is known cusparseSpSM requires
+        # the output matrix zero initialized.
+        m, _ = a.shape
+        if self.op_b == _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE:
+            _, n = b.shape
+        else:
+            n, _ = b.shape
+        c_shape = m, n
+        self.c_shape = c_shape
+
+        self._perform_analysis(a, b, alpha=alpha)
+
+    def _perform_analysis(self, a, b, alpha=1.0):
+        """Solves a sparse triangular linear system op(a) * x = alpha * op(b).
+
+        Args:
+            a (cupyx.scipy.sparse.csr_matrix or cupyx.scipy.sparse.coo_matrix):
+                Sparse matrix with dimension ``(M, M)``.
+            b (cupy.ndarray): Dense matrix with dimension ``(M, K)``.
+            alpha (float or complex): Coefficient.
+            lower (bool):
+                True: ``a`` is lower triangle matrix.
+                False: ``a`` is upper triangle matrix.
+            unit_diag (bool):
+                True: diagonal part of ``a`` has unit elements.
+                False: diagonal part of ``a`` has non-unit elements.
+            transa (bool or str): True, False, 'N', 'T' or 'H'.
+                'N' or False: op(a) == ``a``.
+                'T' or True: op(a) == ``a.T``.
+                'H': op(a) == ``a.conj().T``.
+        """
+        
+        c = _cupy.zeros(self.c_shape, dtype=a.dtype, order='f')
+
+        # Prepare descriptors and other parameters
+        self.handle = _device.get_cusparse_handle()
+        self.mat_a = SpMatDescriptor.create(a)
+        mat_b = DnMatDescriptor.create(b)
+        mat_c = DnMatDescriptor.create(c)
+        self.spsm_descr = _cusparse.spSM_createDescr()
+        self.alpha = _numpy.array(alpha, dtype=c.dtype).ctypes
+        self.cuda_dtype = _dtype.to_cuda_dtype(c.dtype)
+        self.algo = _cusparse.CUSPARSE_SPSM_ALG_DEFAULT
+
+        try:
+            # Specify Lower|Upper fill mode
+            self.mat_a.set_attribute(_cusparse.CUSPARSE_SPMAT_FILL_MODE, self.fill_mode)
+
+            # Specify Unit|Non-Unit diagonal type
+            self.mat_a.set_attribute(_cusparse.CUSPARSE_SPMAT_DIAG_TYPE, self.diag_type)
+
+            # Allocate the workspace needed by the succeeding phases
+            buff_size = _cusparse.spSM_bufferSize(
+                self.handle, self.op_a, self.op_b, self.alpha.data, self.mat_a.desc, mat_b.desc,
+                mat_c.desc, self.cuda_dtype, self.algo, self.spsm_descr)
+            self.buff = _cupy.empty(buff_size, dtype=_cupy.int8)
+
+            # Perform the analysis phase
+            _cusparse.spSM_analysis(
+                self.handle, self.op_a, self.op_b, self.alpha.data, self.mat_a.desc, mat_b.desc,
+                mat_c.desc, self.cuda_dtype, self.algo, self.spsm_descr, self.buff.data.ptr)
+        except Exception as e:
+            raise RuntimeError('spSM_analysis failed.') from e
+    
+    def _get_opb(self, b):
+        # Prepare op_b
+        if b._f_contiguous:
+            op_b = _cusparse.CUSPARSE_OPERATION_NON_TRANSPOSE
+        elif b._c_contiguous:
+            if _cusparse.get_build_version() < 11701:  # earlier than CUDA 11.6
+                raise ValueError('b must be F-contiguous.')
+            b = b.T
+            op_b = _cusparse.CUSPARSE_OPERATION_TRANSPOSE
+        else:
+            raise ValueError('b must be F-contiguous or C-contiguous.')
+        return op_b
+    @profile
+    def solve(self, b):
+        assert b.dtype == self.dtype
+        assert self._get_opb(b) == self.op_b
+        
+        if b.ndim == 1:
+            is_b_vector = True
+            b = b.reshape(-1, 1)
+        elif b.ndim == 2:
+            is_b_vector = False
+        else:
+            raise ValueError('b.ndim must be 1 or 2')
+        self.is_b_vector = is_b_vector
+        
+        c = _cupy.zeros(self.c_shape, dtype=self.dtype, order='f')
+        mat_b = DnMatDescriptor.create(b)
+        mat_c = DnMatDescriptor.create(c)
+        try:
+            # Executes the solve phase
+            _cusparse.spSM_solve(
+                self.handle, self.op_a, self.op_b, self.alpha.data, self.mat_a.desc, mat_b.desc,
+                mat_c.desc, self.cuda_dtype, self.algo, self.spsm_descr, self.buff.data.ptr)
+        finally:
+            _cupy.cuda.get_current_stream().synchronize()
+        # Reshape back if B was a vector
+        if self.is_b_vector:
+            c = c.reshape(-1)
+        return c
+
+    # def __del__(self):
+    #     # Destroy matrix descriptor
+    #     print("Deleting solver obj...")
+    #     _cusparse.spSM_destroyDescr(self.spsm_descr)
+
+class luLU(SuperLU):
+
+    def __init__(self, A, trans = 'N', permc_spec=None, n_batches: int = 0, diag_pivot_thresh=None, relax=None,
+         panel_size=None, options={}):
+        if not check_availability('spsm'):
+            raise RuntimeError('spsm is not available.')
+        if not scipy_available:
+            raise RuntimeError('scipy is not available')
+        if not cupyx.scipy.sparse.isspmatrix(A):
+            raise TypeError('A must be cupyx.scipy.sparse.spmatrix')
+        if A.shape[0] != A.shape[1]:
+            raise ValueError('A must be a square matrix (A.shape: {})'
+                            .format(A.shape))
+        if A.dtype.char not in 'fdFD':
+            raise TypeError('Invalid dtype (actual: {})'.format(A.dtype))
+
+        a = A.get().tocsc()
+        a_slu = scipy.sparse.linalg.splu(
+            a, permc_spec=permc_spec, diag_pivot_thresh=diag_pivot_thresh,
+            relax=relax, panel_size=panel_size, options=options)
+        super().__init__(a_slu)
+        self.b_dtype = self.L.dtype
+        self._init_solvers(trans=trans, n_batches = n_batches)
+    
+    def _init_solvers(self,trans = 'N', n_batches = 0):
+        if n_batches == 0:
+            b_sample = _cupy.zeros(self.shape[0], dtype=self.b_dtype)
+        else:
+            b_sample = _cupy.zeros((self.shape[0],n_batches), dtype=self.b_dtype, order = 'F')
+        self.trans = trans
+        self.Lsolver = CachedAbSolver(self.L, b_sample, lower=True, transa=self.trans)
+        self.Usolver = CachedAbSolver(self.U, b_sample, lower=False, transa=self.trans)
+        # self.Usolver = CachedAbSolver(self.U.T, b_sample, lower=True, transa="T") #Can improve performance at times
+    @profile
+    def solve(self, rhs, trans='N'):
+        """Solves linear system of equations with one or several right-hand sides.
+
+        Args:
+            rhs (cupy.ndarray): Right-hand side(s) of equation with dimension
+                ``(M)`` or ``(M, K)``.
+            trans (str): 'N', 'T' or 'H'.
+                'N': Solves ``A * x = rhs``.
+                'T': Solves ``A.T * x = rhs``.
+                'H': Solves ``A.conj().T * x = rhs``.
+
+        Returns:
+            cupy.ndarray:
+                Solution vector(s)
+        """  # NOQA
+        from cupyx import cusparse
+        if trans != self.trans:
+            raise AssertionError("Solve function assumes cached configuration. Rebuild cache by calling _init_solvers with desired configuration.")
+        if not isinstance(rhs, _cupy.ndarray):
+            raise TypeError('ojb must be cupy.ndarray')
+        if rhs.ndim not in (1, 2):
+            raise ValueError('rhs.ndim must be 1 or 2 (actual: {})'.
+                             format(rhs.ndim))
+        if rhs.shape[0] != self.shape[0]:
+            raise ValueError('shape mismatch (self.shape: {}, rhs.shape: {})'
+                             .format(self.shape, rhs.shape))
+        if trans not in ('N', 'T', 'H'):
+            raise ValueError('trans must be \'N\', \'T\', or \'H\'')
+
+        x = rhs.astype(self.L.dtype)
+        if trans == 'N':
+            if self.perm_r is not None:
+                if x.ndim == 2 and x._f_contiguous:
+                    x = x.T[:, self._perm_r_rev].T  # want to keep f-order
+                else:
+                    x = x[self._perm_r_rev]
+            x = self.Lsolver.solve(x)
+            x = self.Usolver.solve(x)
+            if self.perm_c is not None:
+                x = x[self.perm_c]
+        else:
+            if self.perm_c is not None:
+                if x.ndim == 2 and x._f_contiguous:
+                    x = x.T[:, self._perm_c_rev].T  # want to keep f-order
+                else:
+                    x = x[self._perm_c_rev]
+            x = self.Usolver.solve(x)
+            x = self.Lsolver.solve(x)
+            if self.perm_r is not None:
+                x = x[self.perm_r]
+
+        if not x._f_contiguous:
+            # For compatibility with SciPy
+            x = x.copy(order='F')
+        return x
+    
\ No newline at end of file

From c8b0295cf815645280194bf76b7041cd0edfd150 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 20 Nov 2025 12:28:38 +0100
Subject: [PATCH 02/28] Update Cached Sparse Solvers for clarity

---
 xobjects/SparseSolvers/CUDA/cuDSSLU.py | 13 ++++++++++++-
 xobjects/SparseSolvers/CUDA/luLU.py    | 26 +++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/xobjects/SparseSolvers/CUDA/cuDSSLU.py b/xobjects/SparseSolvers/CUDA/cuDSSLU.py
index e15f2ae..54a3504 100644
--- a/xobjects/SparseSolvers/CUDA/cuDSSLU.py
+++ b/xobjects/SparseSolvers/CUDA/cuDSSLU.py
@@ -5,7 +5,11 @@
 class DirectSolverSuperLU(nvmath.sparse.advanced.DirectSolver):
     """cuDSS-based direct solver; reuse factors for many RHS (SuperLU-style)."""
 
-    def __init__(self, A, *, n_batches: int = 0,assume_general=True, order_rhs='auto', **kwargs):
+    def __init__(self, A, *, 
+                 n_batches: int = 0,
+                 assume_general=True, 
+                 order_rhs='auto', 
+                 **kwargs):
         # 1) Validate A
         if not isinstance(A, sp.csr_matrix):
             raise TypeError("A must be cupyx.scipy.sparse.csr_matrix")
@@ -20,6 +24,7 @@ def __init__(self, A, *, n_batches: int = 0,assume_general=True, order_rhs='auto
         else:
             b_sample = cp.zeros((A.shape[0],n_batches), dtype=A.dtype, order = 'F')
         self.b_dtype = A.dtype
+        self.b_shape = b_sample.shape
         super().__init__(A, b_sample, **kwargs)
 
         # 3) Optional: configure plan
@@ -53,6 +58,12 @@ def _prepare_rhs(self, b):
 
     def solve(self, b):
         """Solve A x = b using cached factors. Accepts (n,) or (n,k) RHS."""
+        assert b.shape == self.b_shape, (
+            "Cached solver can only accept RHS with the same shape "
+            f"as the initialized value. {self.b_shape}. "
+            "The initialized RHS shape can be changed by initializing "
+            "using a different value for n_batches"
+        )
         b = self._prepare_rhs(b)
         super().reset_operands(b=b)
         return super().solve()
diff --git a/xobjects/SparseSolvers/CUDA/luLU.py b/xobjects/SparseSolvers/CUDA/luLU.py
index 6bd4c8a..cbe5fb2 100644
--- a/xobjects/SparseSolvers/CUDA/luLU.py
+++ b/xobjects/SparseSolvers/CUDA/luLU.py
@@ -217,8 +217,14 @@ def solve(self, b):
 
 class luLU(SuperLU):
 
-    def __init__(self, A, trans = 'N', permc_spec=None, n_batches: int = 0, diag_pivot_thresh=None, relax=None,
-         panel_size=None, options={}):
+    def __init__(self, A, 
+                 trans = 'N', 
+                 permc_spec=None, 
+                 n_batches: int = 0, 
+                 diag_pivot_thresh=None, 
+                 relax=None,
+                 panel_size=None, 
+                 options={}):
         if not check_availability('spsm'):
             raise RuntimeError('spsm is not available.')
         if not scipy_available:
@@ -241,10 +247,13 @@ def __init__(self, A, trans = 'N', permc_spec=None, n_batches: int = 0, diag_piv
     
     def _init_solvers(self,trans = 'N', n_batches = 0):
         if n_batches == 0:
-            b_sample = _cupy.zeros(self.shape[0], dtype=self.b_dtype)
+            b_sample = _cupy.zeros(self.shape[0], 
+                                   dtype=self.b_dtype)
         else:
-            b_sample = _cupy.zeros((self.shape[0],n_batches), dtype=self.b_dtype, order = 'F')
+            b_sample = _cupy.zeros((self.shape[0],n_batches), 
+                                   dtype=self.b_dtype, order = 'F')
         self.trans = trans
+        self.b_shape = b_sample.shape
         self.Lsolver = CachedAbSolver(self.L, b_sample, lower=True, transa=self.trans)
         self.Usolver = CachedAbSolver(self.U, b_sample, lower=False, transa=self.trans)
         # self.Usolver = CachedAbSolver(self.U.T, b_sample, lower=True, transa="T") #Can improve performance at times
@@ -266,7 +275,8 @@ def solve(self, rhs, trans='N'):
         """  # NOQA
         from cupyx import cusparse
         if trans != self.trans:
-            raise AssertionError("Solve function assumes cached configuration. Rebuild cache by calling _init_solvers with desired configuration.")
+            raise AssertionError("Solve function assumes cached configuration. " \
+            "Rebuild cache by calling _init_solvers with desired configuration.")
         if not isinstance(rhs, _cupy.ndarray):
             raise TypeError('ojb must be cupy.ndarray')
         if rhs.ndim not in (1, 2):
@@ -275,6 +285,12 @@ def solve(self, rhs, trans='N'):
         if rhs.shape[0] != self.shape[0]:
             raise ValueError('shape mismatch (self.shape: {}, rhs.shape: {})'
                              .format(self.shape, rhs.shape))
+        assert rhs.shape == self.b_shape, (
+            "Cached solver can only accept RHS with the same shape "
+            f"as the initialized value. {self.b_shape}. "
+            "The initialized RHS shape can be changed by initializing "
+            "using a different value for n_batches"
+        )
         if trans not in ('N', 'T', 'H'):
             raise ValueError('trans must be \'N\', \'T\', or \'H\'')
 

From 2152ecd22df8d0e2f1c2c0608a0196fb5067e806 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 20 Nov 2025 12:29:54 +0100
Subject: [PATCH 03/28] Add context-aware Sparse Solver functionality

---
 xobjects/SparseSolvers/abstract_solver.py | 11 +++
 xobjects/__init__.py                      |  1 +
 xobjects/sparse.py                        | 99 +++++++++++++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 xobjects/SparseSolvers/abstract_solver.py
 create mode 100644 xobjects/sparse.py

diff --git a/xobjects/SparseSolvers/abstract_solver.py b/xobjects/SparseSolvers/abstract_solver.py
new file mode 100644
index 0000000..8a6022c
--- /dev/null
+++ b/xobjects/SparseSolvers/abstract_solver.py
@@ -0,0 +1,11 @@
+from abc import ABC, abstractmethod
+
+class SuperLUlikeSolver(ABC):
+
+    @abstractmethod
+    def __init__(self, A):
+        pass
+
+    @abstractmethod
+    def solve(self,b):
+        return x
\ No newline at end of file
diff --git a/xobjects/__init__.py b/xobjects/__init__.py
index 0d9f709..8f81cec 100644
--- a/xobjects/__init__.py
+++ b/xobjects/__init__.py
@@ -19,6 +19,7 @@
 from .string import String
 from .struct import Struct, Field
 from .ref import Ref, UnionRef
+from .sparse import factorized_sparse_solver
 
 from .context_cpu import ContextCpu
 from .context_pyopencl import ContextPyopencl
diff --git a/xobjects/sparse.py b/xobjects/sparse.py
new file mode 100644
index 0000000..541578d
--- /dev/null
+++ b/xobjects/sparse.py
@@ -0,0 +1,99 @@
+import scipy.sparse
+from typing import Optional, Literal, Union
+from .context import XContext
+from .context_cpu import ContextCpu
+from .context_cupy import ContextCupy
+from .context_pyopencl import ContextPyopencl
+from .SparseSolvers.abstract_solver import SuperLUlikeSolver
+try:
+    import cupyx.scipy.sparse
+    from cupyx import cusparse
+except:
+    pass
+
+def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix, 
+                                      scipy.sparse.csc_matrix], 
+                             n_batches: int = 0,
+                             force_solver: Optional[
+                                            Literal["scipySLU", 
+                                                    "PyKLU",
+                                                    "cuDSS", 
+                                                    "CachedSLU", 
+                                                    "cupySLU"]
+                                                    ] = None,
+                             solverKwargs: dict = None,
+                             context: Union[
+                                      Literal["cpu",
+                                              "cupy",
+                                              "pyopencl"],
+                                      XContext
+                                            ] = "cpu"
+                            ) -> SuperLUlikeSolver:
+    if solverKwargs is None:
+        solverKwargs = {}
+    if context == "cpu" or isinstance(context, ContextCpu):
+        if 'permc_spec' not in solverKwargs:
+            solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
+        if force_solver is None or force_solver == "scipySLU":
+            if A.shape[0]*n_batches < 10**5:
+                import warnings
+                warnings.warn("For small matrices, using PyKLU " 
+                              "can provide improved performance")
+            solver = scipy.sparse.linalg.splu(A.tocsc(),**solverKwargs)
+        elif force_solver == "PyKLU":
+            import PyKLU.klu as PyKLU
+            solver = PyKLU.Klu(A.tocsc())
+        else:
+            raise ValueError("Unrecognized CPU Sparse solver. Available options: "
+                             "scipySLU, PyKLU")
+        
+    elif context == "cupy" or isinstance(context, ContextCupy):
+
+        assert isinstance(A ,cupyx.scipy.sparse.csr_matrix), (
+                "When using ContextCupy, input must be "
+                "cupyx.scipy.sparse.csr_matrix")
+
+        if force_solver is not None and force_solver != "cuDSS":
+            if 'permc_spec' not in solverKwargs:
+                solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
+        if force_solver is None:
+            import warnings
+            try:
+                from .SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
+                solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
+            except (ModuleNotFoundError, RuntimeError) as e:
+                warnings.warn("cuDSS not available. " 
+                              "Falling back to Cached-SuperLU (spsm) "
+                              f"Encountered Error: {e}")
+                if 'permc_spec' not in solverKwargs:
+                    solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
+                try:
+                    if cusparse.check_availability('csrsm2'):
+                        raise RuntimeError("csrsm2 is avaiable. "
+                                           "cupy SuperLU performs better "
+                                           "than Cached-SuperLU (spsm)")
+                    from .SparseSolvers.CUDA.luLU import luLU
+                    solver = luLU(A, n_batches = n_batches, **solverKwargs)
+                except RuntimeError as e:
+                    warnings.warn("Cached-SuperLU (spsm) solver failed. " 
+                                  "Falling back to cupy SuperLU. "
+                                  f"Error encountered: {e}")
+                    solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)
+        elif force_solver == "cuDSS":
+            from .SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
+            solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
+        elif force_solver == "CachedSLU":
+            from .SparseSolvers.CUDA.luLU import luLU
+            solver = luLU(A, n_batches = n_batches, **solverKwargs)
+        elif force_solver == "cupySLU":
+            solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)
+        else:
+            raise ValueError("Unrecognized CUDA Sparse solver. Available options: "
+                             "cuDSS, CachedSLU, cupySLU")
+    elif context == "pyopencl" or isinstance(context, ContextPyopencl):
+        raise NotImplementedError("No sparse solver is currently available "
+                                  "for PyOpenCL context")
+    else:
+        raise ValueError("Invalid context. Available contexts are: " \
+                         "cpu, cupy, pyopencl")
+    return solver
\ No newline at end of file

From 2d54741e3917b229d9a33fbd6430e79aec2c34d6 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Fri, 21 Nov 2025 10:23:38 +0100
Subject: [PATCH 04/28] Refactor sparse module

---
 xobjects/sparse/__init__.py     |  1 +
 xobjects/{ => sparse}/sparse.py | 18 +++++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)
 create mode 100644 xobjects/sparse/__init__.py
 rename xobjects/{ => sparse}/sparse.py (90%)

diff --git a/xobjects/sparse/__init__.py b/xobjects/sparse/__init__.py
new file mode 100644
index 0000000..5f88bc6
--- /dev/null
+++ b/xobjects/sparse/__init__.py
@@ -0,0 +1 @@
+from .sparse import factorized_sparse_solver
\ No newline at end of file
diff --git a/xobjects/sparse.py b/xobjects/sparse/sparse.py
similarity index 90%
rename from xobjects/sparse.py
rename to xobjects/sparse/sparse.py
index 541578d..a7ba025 100644
--- a/xobjects/sparse.py
+++ b/xobjects/sparse/sparse.py
@@ -1,10 +1,10 @@
 import scipy.sparse
 from typing import Optional, Literal, Union
-from .context import XContext
-from .context_cpu import ContextCpu
-from .context_cupy import ContextCupy
-from .context_pyopencl import ContextPyopencl
-from .SparseSolvers.abstract_solver import SuperLUlikeSolver
+from ..context import XContext
+from ..context_cpu import ContextCpu
+from ..context_cupy import ContextCupy
+from ..context_pyopencl import ContextPyopencl
+from ..SparseSolvers.abstract_solver import SuperLUlikeSolver
 try:
     import cupyx.scipy.sparse
     from cupyx import cusparse
@@ -59,7 +59,7 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
         if force_solver is None:
             import warnings
             try:
-                from .SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
+                from ..SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
                 solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
             except (ModuleNotFoundError, RuntimeError) as e:
                 warnings.warn("cuDSS not available. " 
@@ -72,7 +72,7 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                         raise RuntimeError("csrsm2 is avaiable. "
                                            "cupy SuperLU performs better "
                                            "than Cached-SuperLU (spsm)")
-                    from .SparseSolvers.CUDA.luLU import luLU
+                    from ..SparseSolvers.CUDA.luLU import luLU
                     solver = luLU(A, n_batches = n_batches, **solverKwargs)
                 except RuntimeError as e:
                     warnings.warn("Cached-SuperLU (spsm) solver failed. " 
@@ -80,10 +80,10 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                                   f"Error encountered: {e}")
                     solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)
         elif force_solver == "cuDSS":
-            from .SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
+            from ..SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
             solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
         elif force_solver == "CachedSLU":
-            from .SparseSolvers.CUDA.luLU import luLU
+            from ..SparseSolvers.CUDA.luLU import luLU
             solver = luLU(A, n_batches = n_batches, **solverKwargs)
         elif force_solver == "cupySLU":
             solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)

From 0d0ef4f77a75865cb2efbe3b47b878c1e27a7a0f Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Fri, 21 Nov 2025 11:29:49 +0100
Subject: [PATCH 05/28] Package sparse solvers in a nice interface

---
 xobjects/__init__.py                                |  2 +-
 xobjects/sparse/__init__.py                         |  4 +++-
 xobjects/sparse/solvers/CPU/__init__.py             |  2 ++
 xobjects/sparse/solvers/CUDA/__init__.py            | 13 +++++++++++++
 .../cuDSSLU.py => sparse/solvers/CUDA/_cuDSSLU.py}  |  0
 .../CUDA/luLU.py => sparse/solvers/CUDA/_luLU.py}   |  0
 xobjects/sparse/solvers/__init__.py                 |  7 +++++++
 .../solvers/_abstract_solver.py}                    |  0
 xobjects/sparse/sparse.py                           | 10 +++++-----
 9 files changed, 31 insertions(+), 7 deletions(-)
 create mode 100644 xobjects/sparse/solvers/CPU/__init__.py
 create mode 100644 xobjects/sparse/solvers/CUDA/__init__.py
 rename xobjects/{SparseSolvers/CUDA/cuDSSLU.py => sparse/solvers/CUDA/_cuDSSLU.py} (100%)
 rename xobjects/{SparseSolvers/CUDA/luLU.py => sparse/solvers/CUDA/_luLU.py} (100%)
 create mode 100644 xobjects/sparse/solvers/__init__.py
 rename xobjects/{SparseSolvers/abstract_solver.py => sparse/solvers/_abstract_solver.py} (100%)

diff --git a/xobjects/__init__.py b/xobjects/__init__.py
index 8f81cec..8c2f5a7 100644
--- a/xobjects/__init__.py
+++ b/xobjects/__init__.py
@@ -19,7 +19,7 @@
 from .string import String
 from .struct import Struct, Field
 from .ref import Ref, UnionRef
-from .sparse import factorized_sparse_solver
+from . import sparse
 
 from .context_cpu import ContextCpu
 from .context_pyopencl import ContextPyopencl
diff --git a/xobjects/sparse/__init__.py b/xobjects/sparse/__init__.py
index 5f88bc6..b10286c 100644
--- a/xobjects/sparse/__init__.py
+++ b/xobjects/sparse/__init__.py
@@ -1 +1,3 @@
-from .sparse import factorized_sparse_solver
\ No newline at end of file
+from .sparse import factorized_sparse_solver
+from . import solvers
+__all__ = ["factorized_sparse_solver","solvers"]
diff --git a/xobjects/sparse/solvers/CPU/__init__.py b/xobjects/sparse/solvers/CPU/__init__.py
new file mode 100644
index 0000000..3c30baf
--- /dev/null
+++ b/xobjects/sparse/solvers/CPU/__init__.py
@@ -0,0 +1,2 @@
+from scipy.sparse.linalg import splu as scipySuperLU
+__all__ = ["scipySuperLU"]
\ No newline at end of file
diff --git a/xobjects/sparse/solvers/CUDA/__init__.py b/xobjects/sparse/solvers/CUDA/__init__.py
new file mode 100644
index 0000000..20f7537
--- /dev/null
+++ b/xobjects/sparse/solvers/CUDA/__init__.py
@@ -0,0 +1,13 @@
+__all__ = []
+try:
+    from ._cuDSSLU import DirectSolverSuperLU as cuDSSSuperLU
+    __all__.append("cuDSSSuperLU")
+except:
+    pass
+try:
+    from ._luLU import luLU as CachedSuperLU
+    __all__.append("CachedSuperLU")
+    from cupyx.scipy.sparse.linalg import splu as CupySuperLU
+    __all__.append("CupySuperLU")
+except:
+    pass
\ No newline at end of file
diff --git a/xobjects/SparseSolvers/CUDA/cuDSSLU.py b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
similarity index 100%
rename from xobjects/SparseSolvers/CUDA/cuDSSLU.py
rename to xobjects/sparse/solvers/CUDA/_cuDSSLU.py
diff --git a/xobjects/SparseSolvers/CUDA/luLU.py b/xobjects/sparse/solvers/CUDA/_luLU.py
similarity index 100%
rename from xobjects/SparseSolvers/CUDA/luLU.py
rename to xobjects/sparse/solvers/CUDA/_luLU.py
diff --git a/xobjects/sparse/solvers/__init__.py b/xobjects/sparse/solvers/__init__.py
new file mode 100644
index 0000000..c2201f6
--- /dev/null
+++ b/xobjects/sparse/solvers/__init__.py
@@ -0,0 +1,7 @@
+from . import CPU
+__all__ = ["CPU"]
+try:
+    from . import CUDA
+    __all__.append("CUDA")
+except:
+    pass
diff --git a/xobjects/SparseSolvers/abstract_solver.py b/xobjects/sparse/solvers/_abstract_solver.py
similarity index 100%
rename from xobjects/SparseSolvers/abstract_solver.py
rename to xobjects/sparse/solvers/_abstract_solver.py
diff --git a/xobjects/sparse/sparse.py b/xobjects/sparse/sparse.py
index a7ba025..fe3310b 100644
--- a/xobjects/sparse/sparse.py
+++ b/xobjects/sparse/sparse.py
@@ -4,7 +4,7 @@
 from ..context_cpu import ContextCpu
 from ..context_cupy import ContextCupy
 from ..context_pyopencl import ContextPyopencl
-from ..SparseSolvers.abstract_solver import SuperLUlikeSolver
+from .solvers._abstract_solver import SuperLUlikeSolver
 try:
     import cupyx.scipy.sparse
     from cupyx import cusparse
@@ -59,7 +59,7 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
         if force_solver is None:
             import warnings
             try:
-                from ..SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
+                from .solvers.CUDA._cuDSSLU import DirectSolverSuperLU
                 solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
             except (ModuleNotFoundError, RuntimeError) as e:
                 warnings.warn("cuDSS not available. " 
@@ -72,7 +72,7 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                         raise RuntimeError("csrsm2 is avaiable. "
                                            "cupy SuperLU performs better "
                                            "than Cached-SuperLU (spsm)")
-                    from ..SparseSolvers.CUDA.luLU import luLU
+                    from .solvers.CUDA._luLU import luLU
                     solver = luLU(A, n_batches = n_batches, **solverKwargs)
                 except RuntimeError as e:
                     warnings.warn("Cached-SuperLU (spsm) solver failed. " 
@@ -80,10 +80,10 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                                   f"Error encountered: {e}")
                     solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)
         elif force_solver == "cuDSS":
-            from ..SparseSolvers.CUDA.cuDSSLU import DirectSolverSuperLU
+            from .solvers.CUDA._cuDSSLU import DirectSolverSuperLU
             solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
         elif force_solver == "CachedSLU":
-            from ..SparseSolvers.CUDA.luLU import luLU
+            from .solvers.CUDA._luLU import luLU
             solver = luLU(A, n_batches = n_batches, **solverKwargs)
         elif force_solver == "cupySLU":
             solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)

From 92f825855e1842c1d99c3b826e93795f45ae9e04 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Fri, 21 Nov 2025 11:49:54 +0100
Subject: [PATCH 06/28] Add exception for unavailable cupy in
 factorized_sparse_solver

---
 xobjects/sparse/sparse.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/xobjects/sparse/sparse.py b/xobjects/sparse/sparse.py
index fe3310b..aef1801 100644
--- a/xobjects/sparse/sparse.py
+++ b/xobjects/sparse/sparse.py
@@ -8,7 +8,9 @@
 try:
     import cupyx.scipy.sparse
     from cupyx import cusparse
+    _cupy_available = True
 except:
+    _cupy_available = False
     pass
 
 def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix, 
@@ -48,7 +50,9 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                              "scipySLU, PyKLU")
         
     elif context == "cupy" or isinstance(context, ContextCupy):
-
+        if not _cupy_available:
+            raise ModuleNotFoundError("No cupy module found. " \
+                                      "ContextCupy unavailable")
         assert isinstance(A ,cupyx.scipy.sparse.csr_matrix), (
                 "When using ContextCupy, input must be "
                 "cupyx.scipy.sparse.csr_matrix")

From 1b5d6c1f29b2409c338749465025b536f74faff3 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Mon, 24 Nov 2025 11:28:21 +0100
Subject: [PATCH 07/28] Make cuDSS solver more robust

---
 xobjects/sparse/solvers/CUDA/_cuDSSLU.py | 32 ++++++++++++++----------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/xobjects/sparse/solvers/CUDA/_cuDSSLU.py b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
index 54a3504..018ddc2 100644
--- a/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
+++ b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
@@ -1,14 +1,15 @@
 import cupy as cp
 import cupyx.scipy.sparse as sp
 import nvmath
+from typing import Literal, Union
+import warnings
 
 class DirectSolverSuperLU(nvmath.sparse.advanced.DirectSolver):
     """cuDSS-based direct solver; reuse factors for many RHS (SuperLU-style)."""
 
-    def __init__(self, A, *, 
+    def __init__(self, A: sp.csr_matrix, *, 
                  n_batches: int = 0,
-                 assume_general=True, 
-                 order_rhs='auto', 
+                 assume_general: bool = True, 
                  **kwargs):
         # 1) Validate A
         if not isinstance(A, sp.csr_matrix):
@@ -16,13 +17,16 @@ def __init__(self, A, *,
         if A.shape[0] != A.shape[1]:
             raise ValueError("A must be square")
         if A.indices.dtype != cp.int32 or A.indptr.dtype != cp.int32:
-            A = sp.csr_matrix((A.data, A.indices.astype(cp.int32), A.indptr.astype(cp.int32)),
+            A = sp.csr_matrix((A.data, 
+                               A.indices.astype(cp.int32), 
+                               A.indptr.astype(cp.int32)),
                               shape=A.shape)
         # 2) Sample RHS to initialize parent
         if n_batches == 0:
             b_sample = cp.zeros(A.shape[0], dtype=A.dtype)
         else:
-            b_sample = cp.zeros((A.shape[0],n_batches), dtype=A.dtype, order = 'F')
+            b_sample = cp.zeros((A.shape[0],n_batches), 
+                                dtype=A.dtype, order = 'F')
         self.b_dtype = A.dtype
         self.b_shape = b_sample.shape
         super().__init__(A, b_sample, **kwargs)
@@ -35,8 +39,6 @@ def __init__(self, A, *,
                 pc.matrix_type = nvmath.sparse.advanced.DirectSolverMatrixType.GENERAL
             except AttributeError:
                 pass  # older wheels infer type; safe to ignore
-        # Expose a simple RHS layout policy
-        self._order_rhs = order_rhs  # 'F'|'C'|'auto'
 
         # 4) Build & factorize once; warmup solve to build internal caches
         super().plan()
@@ -47,15 +49,19 @@ def __init__(self, A, *,
 
     def _prepare_rhs(self, b):
         # Accept 1-D or 2-D; ensure dtype/device and layout
+        if not isinstance(b, cp.ndarray):
+            raise TypeError('b must be cupy.ndarray')
         if b.dtype != self.b_dtype:
-            raise TypeError(f"RHS dtype {b.dtype} does not match matrix dtype {self.b_dtype}")
+            raise TypeError(f"RHS dtype {b.dtype} does not match "
+                            f"matrix dtype {self.b_dtype}")
         if b.ndim == 1:
             return b  # vector is fine
-        if self._order_rhs == 'auto':
-            # cuDSS prefers column-major for (n,k)
-            return cp.array(b, order='F', copy=False)
-        return cp.array(b, order=self._order_rhs, copy=False)
-
+        if not b.flags.f_contiguous:
+            warnings.warn("cuDSS expects a column-major ('F' ordered) " \
+                          "matrix for the RHS. RHS was autmatically converted "
+                          "by the solver.")
+        return cp.asfortranarray(b)
+       
     def solve(self, b):
         """Solve A x = b using cached factors. Accepts (n,) or (n,k) RHS."""
         assert b.shape == self.b_shape, (

From 6a853de6f12ed93657f98750a09044dd6d24af16 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Mon, 24 Nov 2025 11:29:05 +0100
Subject: [PATCH 08/28] Allow factorized_sparse_solver function to infer
 context from input

---
 xobjects/sparse/sparse.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/xobjects/sparse/sparse.py b/xobjects/sparse/sparse.py
index aef1801..c4cf5ea 100644
--- a/xobjects/sparse/sparse.py
+++ b/xobjects/sparse/sparse.py
@@ -1,4 +1,5 @@
 import scipy.sparse
+from numpy import ndarray as nparray
 from typing import Optional, Literal, Union
 from ..context import XContext
 from ..context_cpu import ContextCpu
@@ -6,6 +7,7 @@
 from ..context_pyopencl import ContextPyopencl
 from .solvers._abstract_solver import SuperLUlikeSolver
 try:
+    from cupy import ndarray as cparray
     import cupyx.scipy.sparse
     from cupyx import cusparse
     _cupy_available = True
@@ -29,11 +31,23 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                                               "cupy",
                                               "pyopencl"],
                                       XContext
-                                            ] = "cpu"
+                                            ] = None
                             ) -> SuperLUlikeSolver:
     if solverKwargs is None:
         solverKwargs = {}
+    if context is None:
+        if isinstance(A, scipy.sparse.spmatrix) or isinstance(A, nparray):
+            context = 'cpu'
+        elif (_cupy_available and 
+              (isinstance(A, cupyx.scipy.sparse.spmatrix) 
+               or isinstance(A, cparray))):
+            context = 'cupy'
+        else:
+            raise TypeError("Unsupported type for A")
     if context == "cpu" or isinstance(context, ContextCpu):
+        assert isinstance(A, scipy.sparse.spmatrix), (
+            "When using CPU context A must be a scipy.sparse matrix"
+            )
         if 'permc_spec' not in solverKwargs:
             solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
         if force_solver is None or force_solver == "scipySLU":

From aa7bd9e7719c45fece6594f3c8dc63539f50f63a Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Mon, 24 Nov 2025 13:03:03 +0100
Subject: [PATCH 09/28] Clean up luLU solver

---
 xobjects/sparse/solvers/CUDA/_luLU.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/xobjects/sparse/solvers/CUDA/_luLU.py b/xobjects/sparse/solvers/CUDA/_luLU.py
index cbe5fb2..4089ace 100644
--- a/xobjects/sparse/solvers/CUDA/_luLU.py
+++ b/xobjects/sparse/solvers/CUDA/_luLU.py
@@ -15,7 +15,6 @@
     scipy_available = True
 except ImportError:
     scipy_available = False
-from line_profiler import profile
 
 class CachedAbSolver:
     
@@ -181,7 +180,7 @@ def _get_opb(self, b):
         else:
             raise ValueError('b must be F-contiguous or C-contiguous.')
         return op_b
-    @profile
+    
     def solve(self, b):
         assert b.dtype == self.dtype
         assert self._get_opb(b) == self.op_b
@@ -257,7 +256,7 @@ def _init_solvers(self,trans = 'N', n_batches = 0):
         self.Lsolver = CachedAbSolver(self.L, b_sample, lower=True, transa=self.trans)
         self.Usolver = CachedAbSolver(self.U, b_sample, lower=False, transa=self.trans)
         # self.Usolver = CachedAbSolver(self.U.T, b_sample, lower=True, transa="T") #Can improve performance at times
-    @profile
+    
     def solve(self, rhs, trans='N'):
         """Solves linear system of equations with one or several right-hand sides.
 

From 72e7b8348f1125a98b05528f03220878cbf08d18 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Mon, 24 Nov 2025 15:08:36 +0100
Subject: [PATCH 10/28] Make fix_random_seed function not override returned
 value

---
 xobjects/test_helpers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xobjects/test_helpers.py b/xobjects/test_helpers.py
index 00d244d..7db897c 100644
--- a/xobjects/test_helpers.py
+++ b/xobjects/test_helpers.py
@@ -98,7 +98,8 @@ def wrapper(*args, **kwargs):
             rng_state = np.random.get_state()
             try:
                 np.random.seed(seed)
-                test_function(*args, **kwargs)
+                #Return value of function instead of None
+                return test_function(*args, **kwargs) 
             finally:
                 np.random.set_state(rng_state)
 

From c5b3441e9901a688788face6092e4b742bea29f9 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 25 Nov 2025 14:54:17 +0100
Subject: [PATCH 11/28] Add tests for sparse solvers

---
 tests/test_sparse_solvers.py | 118 +++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 tests/test_sparse_solvers.py

diff --git a/tests/test_sparse_solvers.py b/tests/test_sparse_solvers.py
new file mode 100644
index 0000000..40779ce
--- /dev/null
+++ b/tests/test_sparse_solvers.py
@@ -0,0 +1,118 @@
+import numpy as np
+import scipy.sparse as sp
+import xobjects as xo
+from xobjects.test_helpers import fix_random_seed
+import pytest
+
+PARAMS = [
+    ("scipySLU",  xo.ContextCpu()),
+    ("PyKLU",     xo.ContextCpu()),
+    ("cuDSS",     xo.ContextCupy()),
+    ("CachedSLU", xo.ContextCupy()),
+    ("cupySLU",   xo.ContextCupy()),
+]
+
+REL_TOL = 1e-4
+ABS_TOL = 1e-8
+SPARSE_SYSTEM_SIZE = 5000 # (n,n) matrix
+NUM_BATCHES = 10
+
+def batch_vectors_as_matrix(vector_list):
+    return np.asfortranarray(np.moveaxis(np.array(vector_list),0,-1))
+
+@fix_random_seed(1337)
+def make_random_sparse_system(n, nbatches, density=0.01):
+    A = sp.random(
+        n, n,
+        density=density,
+        format="csc",
+        random_state=np.random,
+        data_rvs=np.random.standard_normal
+    )
+    # Make it nonsingular & better conditioned:
+    # Add something on the diagonal so pivots aren't tiny/zero
+    A = A + sp.eye(n, format="csc") * 5.0  # tweak factor as you like
+    b_array = []
+    if nbatches == 0:
+        b = np.random.standard_normal(n)
+        b_array.append(b)
+    else:
+        for i in range(nbatches):
+            b = np.cos(2*i/(nbatches-1)*np.pi) * np.random.standard_normal(n)
+            b_array.append(b)
+        b = batch_vectors_as_matrix(b_array)
+    solver = sp.linalg.splu(A)
+    x = solver.solve(b)
+    return (A, b, x, b_array)
+
+@fix_random_seed(1337)
+def make_tridiagonal_system(n, nbatches):
+    main = 2.0 + np.abs(np.random.standard_normal(n))
+    lower = np.random.standard_normal(n-1)
+    upper = np.random.standard_normal(n-1)
+    A = sp.diags(
+        diagonals=[lower, main, upper],
+        offsets=[-1, 0, 1],
+        format="csc"
+    )
+    b_array = []
+    if nbatches == 0:
+        b = np.random.standard_normal(n)
+        b_array.append(b)
+    else:
+        for i in range(nbatches):
+            b = np.cos(2*i/(nbatches-1)*np.pi) * np.random.standard_normal(n)
+            b_array.append(b)
+        b = batch_vectors_as_matrix(b_array)
+    solver = sp.linalg.splu(A)
+    x = solver.solve(b)
+    return (A, b, x, b_array)
+
+random_system = make_random_sparse_system(SPARSE_SYSTEM_SIZE, 0)
+tridiag_system = make_tridiagonal_system(SPARSE_SYSTEM_SIZE, 0)
+
+@pytest.mark.parametrize("test_solver,test_context", PARAMS)
+@pytest.mark.parametrize("sparse_system", [random_system, tridiag_system])
+def test_vector_solve(test_solver, test_context, sparse_system):
+    A_sp, b_sp, x_sp, _ = sparse_system
+    np.testing.assert_allclose(b_sp, A_sp@x_sp, 
+                               rtol = REL_TOL, atol = ABS_TOL) #Verify Scipy result
+    if "Cpu" in str(test_context):
+        A = test_context.splike_lib.sparse.csc_matrix(A_sp)
+    if "Cupy" in str(test_context):
+        A = test_context.splike_lib.sparse.csr_matrix(A_sp)
+    b_test = test_context.nparray_to_context_array(b_sp)
+    b = test_context.nplike_lib.asfortranarray(b_test)
+    solver = xo.sparse.factorized_sparse_solver(A, 
+                                                force_solver = test_solver, 
+                                                context = test_context
+                                                )
+    x = solver.solve(b)
+    
+    xo.assert_allclose(x_sp, x, rtol = REL_TOL, atol = ABS_TOL)
+    xo.assert_allclose(b, A@x, rtol = REL_TOL, atol = ABS_TOL)
+
+random_system = make_random_sparse_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES)
+tridiag_system = make_tridiagonal_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES)
+
+@pytest.mark.parametrize("test_solver,test_context", PARAMS)
+@pytest.mark.parametrize("sparse_system", [random_system, tridiag_system])
+def test_batched_solve(test_solver, test_context, sparse_system):
+    A_sp, b_sp, x_sp, _ = sparse_system
+    np.testing.assert_allclose(b_sp, A_sp@x_sp, 
+                               rtol = REL_TOL, atol = ABS_TOL) #Verify Scipy result
+    if "Cpu" in str(test_context):
+        A = test_context.splike_lib.sparse.csc_matrix(A_sp)
+    if "Cupy" in str(test_context):
+        A = test_context.splike_lib.sparse.csr_matrix(A_sp)
+    b_test = test_context.nparray_to_context_array(b_sp)
+    b = test_context.nplike_lib.asfortranarray(b_test)
+    solver = xo.sparse.factorized_sparse_solver(A, 
+                                                n_batches = NUM_BATCHES,
+                                                force_solver = test_solver, 
+                                                context = test_context
+                                                )
+    x = solver.solve(b)
+    
+    xo.assert_allclose(x_sp, x, rtol = REL_TOL, atol = ABS_TOL)
+    xo.assert_allclose(b, A@x, rtol = REL_TOL, atol = ABS_TOL)
\ No newline at end of file

From 4d455d9dc6e343cd1e896846e6d13a8c9e85f8d9 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 25 Nov 2025 14:55:32 +0100
Subject: [PATCH 12/28] Stop warning when using force_solver

---
 xobjects/sparse/sparse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xobjects/sparse/sparse.py b/xobjects/sparse/sparse.py
index c4cf5ea..e844940 100644
--- a/xobjects/sparse/sparse.py
+++ b/xobjects/sparse/sparse.py
@@ -51,7 +51,7 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
         if 'permc_spec' not in solverKwargs:
             solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
         if force_solver is None or force_solver == "scipySLU":
-            if A.shape[0]*n_batches < 10**5:
+            if A.shape[0]*n_batches < 10**5 and force_solver is None:
                 import warnings
                 warnings.warn("For small matrices, using PyKLU " 
                               "can provide improved performance")

From aae4d2fe871fd351864356bbe17acd2976e550f2 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 25 Nov 2025 14:56:03 +0100
Subject: [PATCH 13/28] Attempt to fix deallocation exception in cuDSS solver

---
 xobjects/sparse/solvers/CUDA/_cuDSSLU.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xobjects/sparse/solvers/CUDA/_cuDSSLU.py b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
index 018ddc2..3ca7bfa 100644
--- a/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
+++ b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
@@ -95,4 +95,7 @@ def close(self):
         try:
             self.__exit__(None, None, None)  # DirectSolver is context-manageable
         except Exception:
-            pass
\ No newline at end of file
+            pass
+        
+    def __dealloc__(self):
+        self.close()
\ No newline at end of file

From 30babfc02cab1b8c10b314eee48e4debbfc40b5e Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 27 Nov 2025 14:44:14 +0100
Subject: [PATCH 14/28] Clean up sparse API

---
 xobjects/sparse/{sparse.py => _sparse.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename xobjects/sparse/{sparse.py => _sparse.py} (100%)

diff --git a/xobjects/sparse/sparse.py b/xobjects/sparse/_sparse.py
similarity index 100%
rename from xobjects/sparse/sparse.py
rename to xobjects/sparse/_sparse.py

From 2c64edbb0300681be4d95ad9ff7f4a3dc42fc88a Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 27 Nov 2025 15:11:00 +0100
Subject: [PATCH 15/28] Refactor sparse package structure

---
 xobjects/sparse/__init__.py              | 2 +-
 xobjects/sparse/_sparse.py               | 2 +-
 xobjects/sparse/solvers/CPU/__init__.py  | 7 ++++++-
 xobjects/sparse/solvers/CUDA/__init__.py | 9 ++++++---
 xobjects/sparse/solvers/__init__.py      | 2 +-
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/xobjects/sparse/__init__.py b/xobjects/sparse/__init__.py
index b10286c..2fa42ef 100644
--- a/xobjects/sparse/__init__.py
+++ b/xobjects/sparse/__init__.py
@@ -1,3 +1,3 @@
-from .sparse import factorized_sparse_solver
+from ._sparse import factorized_sparse_solver
 from . import solvers
 __all__ = ["factorized_sparse_solver","solvers"]
diff --git a/xobjects/sparse/_sparse.py b/xobjects/sparse/_sparse.py
index e844940..14e65b1 100644
--- a/xobjects/sparse/_sparse.py
+++ b/xobjects/sparse/_sparse.py
@@ -57,7 +57,7 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                               "can provide improved performance")
             solver = scipy.sparse.linalg.splu(A.tocsc(),**solverKwargs)
         elif force_solver == "PyKLU":
-            import PyKLU.klu as PyKLU
+            import PyKLU
             solver = PyKLU.Klu(A.tocsc())
         else:
             raise ValueError("Unrecognized CPU Sparse solver. Available options: "
diff --git a/xobjects/sparse/solvers/CPU/__init__.py b/xobjects/sparse/solvers/CPU/__init__.py
index 3c30baf..e6b243d 100644
--- a/xobjects/sparse/solvers/CPU/__init__.py
+++ b/xobjects/sparse/solvers/CPU/__init__.py
@@ -1,2 +1,7 @@
 from scipy.sparse.linalg import splu as scipySuperLU
-__all__ = ["scipySuperLU"]
\ No newline at end of file
+__all__ = ["scipySuperLU"]
+try:
+    from PyKLU import Klu as KLUSuperLU
+    __all__.append("KLUSuperLU")
+except ImportError:
+    pass
\ No newline at end of file
diff --git a/xobjects/sparse/solvers/CUDA/__init__.py b/xobjects/sparse/solvers/CUDA/__init__.py
index 20f7537..d1d8789 100644
--- a/xobjects/sparse/solvers/CUDA/__init__.py
+++ b/xobjects/sparse/solvers/CUDA/__init__.py
@@ -2,12 +2,15 @@
 try:
     from ._cuDSSLU import DirectSolverSuperLU as cuDSSSuperLU
     __all__.append("cuDSSSuperLU")
-except:
+except (ModuleNotFoundError,ImportError):
     pass
 try:
     from ._luLU import luLU as CachedSuperLU
     __all__.append("CachedSuperLU")
     from cupyx.scipy.sparse.linalg import splu as CupySuperLU
     __all__.append("CupySuperLU")
-except:
-    pass
\ No newline at end of file
+except (ModuleNotFoundError,ImportError):
+    pass
+
+if not __all__:
+    raise ImportError("Failed to import any CUDA-based sparse solver")
\ No newline at end of file
diff --git a/xobjects/sparse/solvers/__init__.py b/xobjects/sparse/solvers/__init__.py
index c2201f6..6804061 100644
--- a/xobjects/sparse/solvers/__init__.py
+++ b/xobjects/sparse/solvers/__init__.py
@@ -3,5 +3,5 @@
 try:
     from . import CUDA
     __all__.append("CUDA")
-except:
+except ImportError:
     pass

From c9a6f4a18346061fc9a992d84718ab60fd7f882d Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 27 Nov 2025 16:11:34 +0100
Subject: [PATCH 16/28] Make contexts give error when unavailable

---
 xobjects/context_cupy.py     | 5 +++++
 xobjects/context_pyopencl.py | 7 ++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/xobjects/context_cupy.py b/xobjects/context_cupy.py
index 3838376..651cae5 100644
--- a/xobjects/context_cupy.py
+++ b/xobjects/context_cupy.py
@@ -398,6 +398,11 @@ def __init__(
         default_shared_mem_size_bytes=0,
         device=None,
     ):
+        if not _enabled:
+            raise ModuleNotAvailable(
+                "cupy is not installed. " "ContextCupy is not available!"
+                )
+        
         if device is not None:
             cupy.cuda.Device(device).use()
 
diff --git a/xobjects/context_pyopencl.py b/xobjects/context_pyopencl.py
index c61e21e..583394c 100644
--- a/xobjects/context_pyopencl.py
+++ b/xobjects/context_pyopencl.py
@@ -126,7 +126,12 @@ def __init__(
             ContextPyopencl: context object.
 
         """
-
+        
+        if not _enabled:
+            raise ModuleNotAvailable(
+                "pyopencl is not installed. ContextPyopencl is not available!"
+            )
+        
         super().__init__()
 
         # TODO assume one device only

From 82e5dbec69712efef4e17ce465c828561a6611a7 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 27 Nov 2025 16:19:33 +0100
Subject: [PATCH 17/28] Add ModuleNotAvailableError for unavailable optional
 modules

---
 xobjects/context.py          | 4 ++++
 xobjects/context_cupy.py     | 3 ++-
 xobjects/context_pyopencl.py | 5 +++--
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/xobjects/context.py b/xobjects/context.py
index effd95a..bbd57de 100644
--- a/xobjects/context.py
+++ b/xobjects/context.py
@@ -203,6 +203,10 @@ def __init__(self, message="Module not available"):
     def __getattr__(self, attr):
         raise NameError(self.message)
 
+class ModuleNotAvailableError(ModuleNotFoundError):
+    """Raised when an optional dependency is missing."""
+    pass
+
 
 class XContext(ABC):
     minimum_alignment = 1
diff --git a/xobjects/context_cupy.py b/xobjects/context_cupy.py
index 651cae5..d1b3b87 100644
--- a/xobjects/context_cupy.py
+++ b/xobjects/context_cupy.py
@@ -18,6 +18,7 @@
     classes_from_kernels,
     sort_classes,
     sources_from_classes,
+    ModuleNotAvailableError,
 )
 from .linkedarray import BaseLinkedArray
 from .specialize_source import specialize_source
@@ -399,7 +400,7 @@ def __init__(
         device=None,
     ):
         if not _enabled:
-            raise ModuleNotAvailable(
+            raise ModuleNotAvailableError(
                 "cupy is not installed. " "ContextCupy is not available!"
                 )
         
diff --git a/xobjects/context_pyopencl.py b/xobjects/context_pyopencl.py
index 583394c..196eb17 100644
--- a/xobjects/context_pyopencl.py
+++ b/xobjects/context_pyopencl.py
@@ -18,6 +18,7 @@
     classes_from_kernels,
     sort_classes,
     sources_from_classes,
+    ModuleNotAvailableError,
 )
 from .linkedarray import BaseLinkedArray
 from .specialize_source import specialize_source
@@ -126,9 +127,9 @@ def __init__(
             ContextPyopencl: context object.
 
         """
-        
+
         if not _enabled:
-            raise ModuleNotAvailable(
+            raise ModuleNotAvailableError(
                 "pyopencl is not installed. ContextPyopencl is not available!"
             )
         

From 5bc9f99a7784d3919267106429c44259f767fa31 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 27 Nov 2025 16:30:56 +0100
Subject: [PATCH 18/28] Improve tests for sparse solvers. Now using relative
 residuals

---
 tests/test_sparse_solvers.py     | 84 ++++++++++++++++++++++++--------
 xobjects/sparse/_test_helpers.py | 56 +++++++++++++++++++++
 2 files changed, 119 insertions(+), 21 deletions(-)
 create mode 100644 xobjects/sparse/_test_helpers.py

diff --git a/tests/test_sparse_solvers.py b/tests/test_sparse_solvers.py
index 40779ce..b462b62 100644
--- a/tests/test_sparse_solvers.py
+++ b/tests/test_sparse_solvers.py
@@ -2,20 +2,58 @@
 import scipy.sparse as sp
 import xobjects as xo
 from xobjects.test_helpers import fix_random_seed
+from xobjects.sparse import _test_helpers as sptest
+from xobjects.context import ModuleNotAvailableError
+import warnings
 import pytest
 
-PARAMS = [
-    ("scipySLU",  xo.ContextCpu()),
-    ("PyKLU",     xo.ContextCpu()),
-    ("cuDSS",     xo.ContextCupy()),
-    ("CachedSLU", xo.ContextCupy()),
-    ("cupySLU",   xo.ContextCupy()),
-]
-
-REL_TOL = 1e-4
-ABS_TOL = 1e-8
-SPARSE_SYSTEM_SIZE = 5000 # (n,n) matrix
+'''
+The following tests rely on computing the relative residual of the solution
+The relative residual can be defined as:
+                               || A * x - b ||
+                        η = ---------------------
+                             ||A||*||x|| + ||b||
+
+Typically, the expected value for this quantity is:
+* Ideally: 1e-12 - 1e-14
+* Ill-conditioned systems: 1e-9 - 1e-10
+
+In this module, we evaluate the residual as follows:
+* We compare the residual of the reference solver (scipy) with ABS_TOL
+* We compare the residual of the KLU solver with ABS_TOL
+* We ensure that the residual of the KLU Solver is within TOLERANCE_FACTOR
+  of the reference solver. 
+  
+For reference, the machine precision for FP64 is ~2.2e-16 (PRECISION)
+
+Note: The completely random sparse system is more prone to failing the tests
+due to numerical noise, often requires looser tolerances. Still worth including
+but if testing larger systems, could potentially be omitted.
+'''
+
+cpu_tests = [
+        ("scipySLU",  xo.ContextCpu()),
+        ("PyKLU",     xo.ContextCpu()),
+        ]
+
+cupy_tests = []
+try:
+    cupy_tests = [
+        ("cuDSS",     xo.ContextCupy()),
+        ("CachedSLU", xo.ContextCupy()),
+        ("cupySLU",   xo.ContextCupy()),
+    ]
+except ModuleNotAvailableError:
+    warnings.warn("!!!ContextCupy unavailable. "
+                  "Skipping tests for Cupy Solvers!!!")
+
+PARAMS = cpu_tests + cupy_tests
+
+SPARSE_SYSTEM_SIZE = 2000 # (n,n) matrix
 NUM_BATCHES = 10
+PRECISION = np.finfo(float).eps
+ABS_TOL = 1e-12
+TOLERANCE_FACTOR = 2
 
 def batch_vectors_as_matrix(vector_list):
     return np.asfortranarray(np.moveaxis(np.array(vector_list),0,-1))
@@ -75,8 +113,10 @@ def make_tridiagonal_system(n, nbatches):
 @pytest.mark.parametrize("sparse_system", [random_system, tridiag_system])
 def test_vector_solve(test_solver, test_context, sparse_system):
     A_sp, b_sp, x_sp, _ = sparse_system
-    np.testing.assert_allclose(b_sp, A_sp@x_sp, 
-                               rtol = REL_TOL, atol = ABS_TOL) #Verify Scipy result
+    assert not sptest.issymmetric(A_sp)
+
+    scipy_residual = sptest.rel_residual(A_sp,x_sp,b_sp)
+    
     if "Cpu" in str(test_context):
         A = test_context.splike_lib.sparse.csc_matrix(A_sp)
     if "Cupy" in str(test_context):
@@ -88,9 +128,10 @@ def test_vector_solve(test_solver, test_context, sparse_system):
                                                 context = test_context
                                                 )
     x = solver.solve(b)
-    
-    xo.assert_allclose(x_sp, x, rtol = REL_TOL, atol = ABS_TOL)
-    xo.assert_allclose(b, A@x, rtol = REL_TOL, atol = ABS_TOL)
+
+    solver_residual = sptest.rel_residual(A,x,b)
+    sptest.assert_residual_ok(scipy_residual,solver_residual, 
+                              abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR)
 
 random_system = make_random_sparse_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES)
 tridiag_system = make_tridiagonal_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES)
@@ -99,8 +140,8 @@ def test_vector_solve(test_solver, test_context, sparse_system):
 @pytest.mark.parametrize("sparse_system", [random_system, tridiag_system])
 def test_batched_solve(test_solver, test_context, sparse_system):
     A_sp, b_sp, x_sp, _ = sparse_system
-    np.testing.assert_allclose(b_sp, A_sp@x_sp, 
-                               rtol = REL_TOL, atol = ABS_TOL) #Verify Scipy result
+    assert not sptest.issymmetric(A_sp)
+    scipy_residual = sptest.rel_residual(A_sp,x_sp,b_sp)
     if "Cpu" in str(test_context):
         A = test_context.splike_lib.sparse.csc_matrix(A_sp)
     if "Cupy" in str(test_context):
@@ -113,6 +154,7 @@ def test_batched_solve(test_solver, test_context, sparse_system):
                                                 context = test_context
                                                 )
     x = solver.solve(b)
-    
-    xo.assert_allclose(x_sp, x, rtol = REL_TOL, atol = ABS_TOL)
-    xo.assert_allclose(b, A@x, rtol = REL_TOL, atol = ABS_TOL)
\ No newline at end of file
+
+    solver_residual = sptest.rel_residual(A,x,b)
+    sptest.assert_residual_ok(scipy_residual,solver_residual, 
+                              abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR)
\ No newline at end of file
diff --git a/xobjects/sparse/_test_helpers.py b/xobjects/sparse/_test_helpers.py
new file mode 100644
index 0000000..7b82663
--- /dev/null
+++ b/xobjects/sparse/_test_helpers.py
@@ -0,0 +1,56 @@
+import numpy.linalg as npl
+import scipy.sparse.linalg as scspl
+from scipy.sparse import issparse
+
+def issymmetric(A, tol=0):
+    if A.shape[0] != A.shape[1]:
+        return False
+    diff = A - A.T
+    if tol == 0:
+        return diff.nnz == 0
+    else:
+        # tolerance-based check
+        return abs(diff).max() <= tol
+
+def rel_residual(A,x,b):
+    if hasattr(A, "get"):
+        A = A.get()
+    if hasattr(x, "get"):
+        x = x.get()
+    if hasattr(b, "get"):
+        b = b.get()
+    assert issparse(A), ("A must be a sparse matrix")
+
+    return npl.norm(A@x - b) / (scspl.norm(A)*npl.norm(x) + npl.norm(b))
+
+def assert_residual_ok(res_ref, res_solver,
+                       abs_tol=1e-12,
+                       factor=10):
+    """
+    Check that our solver's residual is both:
+      - absolutely small enough (abs_tol),
+      - not catastrophically worse than the reference (factor * res_ref).
+    """
+    # sanity: reference solver itself should be good
+    assert res_ref < abs_tol, f"Reference residual too large: {res_ref}"
+
+    # absolute bound
+    assert res_solver < abs_tol, (
+        f"Residual {res_solver} exceeds absolute tolerance {abs_tol}"
+    )
+
+    # relative bound vs reference
+    assert res_solver <= factor * res_ref, (
+        f"Residual {res_solver} not within factor {factor} of "
+        f"reference residual {res_ref}"
+    )
+
+# ---- Unused rn, but could be useful for tests ----
+
+def assert_close_to_precision(value, precision):
+    assert value <= precision, (f"Value {value} not within precision {precision}")
+
+def assert_residual_close(reference_residual, residual, tolerance = 10):
+    assert residual <= tolerance*reference_residual, (
+        f"Residual {residual} not within tolerance "
+        f"O({tolerance}) of reference residual {reference_residual}")
\ No newline at end of file

From 2bd4a14296d6f975094c429989ef005e2d3ff523 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 10:33:40 +0100
Subject: [PATCH 19/28] Fix relative residual formula in tests

---
 tests/test_sparse_solvers.py     | 2 +-
 xobjects/sparse/_test_helpers.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_sparse_solvers.py b/tests/test_sparse_solvers.py
index b462b62..e490335 100644
--- a/tests/test_sparse_solvers.py
+++ b/tests/test_sparse_solvers.py
@@ -12,7 +12,7 @@
 The relative residual can be defined as:
                                || A * x - b ||
                         η = ---------------------
-                             ||A||*||x|| + ||b||
+                                    ||b||
 
 Typically, the expected value for this quantity is:
 * Ideally: 1e-12 - 1e-14
diff --git a/xobjects/sparse/_test_helpers.py b/xobjects/sparse/_test_helpers.py
index 7b82663..313aa39 100644
--- a/xobjects/sparse/_test_helpers.py
+++ b/xobjects/sparse/_test_helpers.py
@@ -21,7 +21,7 @@ def rel_residual(A,x,b):
         b = b.get()
     assert issparse(A), ("A must be a sparse matrix")
 
-    return npl.norm(A@x - b) / (scspl.norm(A)*npl.norm(x) + npl.norm(b))
+    return npl.norm(A@x - b) / (npl.norm(b))
 
 def assert_residual_ok(res_ref, res_solver,
                        abs_tol=1e-12,

From 8c121048b6ca6b0fc454378bbdc36d316b9cfdcc Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 11:38:40 +0100
Subject: [PATCH 20/28] Make sparse solver choice verbose and document

---
 xobjects/sparse/_sparse.py | 251 +++++++++++++++++++++++++++++++++----
 1 file changed, 225 insertions(+), 26 deletions(-)

diff --git a/xobjects/sparse/_sparse.py b/xobjects/sparse/_sparse.py
index 14e65b1..dc68340 100644
--- a/xobjects/sparse/_sparse.py
+++ b/xobjects/sparse/_sparse.py
@@ -6,6 +6,8 @@
 from ..context_cupy import ContextCupy
 from ..context_pyopencl import ContextPyopencl
 from .solvers._abstract_solver import SuperLUlikeSolver
+from ..general import _print
+
 try:
     from cupy import ndarray as cparray
     import cupyx.scipy.sparse
@@ -31,11 +33,184 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
                                               "cupy",
                                               "pyopencl"],
                                       XContext
-                                            ] = None
+                                            ] = None,
+                             verbose: bool = False
                             ) -> SuperLUlikeSolver:
+    """
+    Build and return a factorized sparse linear solver on CPU or GPU.
+
+    This function inspects the provided sparse matrix and execution context
+    and returns a *factorized* solver object (SuperLU-like). The solver can
+    then be reused to efficiently solve multiple linear systems with the same
+    matrix `A`.
+
+    The actual backend is chosen automatically based on:
+      * The requested/derived `context` ("cpu", "cupy", "pyopencl"),
+      * The availability of optional libraries (PyKLU, cuDSS, CuPy/CUSPARSE),
+      * And the optional `force_solver` argument.
+
+    On CPU:
+      * Default: try PyKLU, fall back to `scipy.sparse.linalg.splu`.
+      * You may force `"scipySLU"` or `"PyKLU"` explicitly.
+
+    On CUDA/CuPy:
+      * Default: try cuDSS (`DirectSolverSuperLU`), then fall back to:
+          - `cupyx.scipy.sparse.linalg.splu` if CUSPARSE `csrsm2` is available, or
+          - a cached SuperLU-based solver (`luLU`) and finally `cupyx.splu`.
+      * You may force `"cuDSS"`, `"CachedSLU"`, or `"cupySLU"` explicitly.
+
+    PyOpenCL:
+      * Currently not supported and will raise `NotImplementedError`.
+
+    Parameters
+    ----------
+    A : scipy.sparse.spmatrix or cupyx.scipy.sparse.spmatrix
+        Sparse system matrix to factorize.
+
+        The matrix is internally converted to the format expected by the
+        chosen backend:
+
+          * CPU context: converted to CSC (`scipy.sparse.csc_matrix`).
+          * CuPy/GPU context: converted to CSR (`cupyx.scipy.sparse.csr_matrix`).
+
+        For **best performance**, you should pass `A` already in the
+        preferred format to avoid extra conversions:
+
+          * If the context is (or will usually be) **CPU**, provide `A`
+            as a CSC matrix (`A.tocsc()`).
+          * If the context is **GPU/CuPy**, provide `A` as a CSR matrix
+            (`A.tocsr()` or `cupyx.scipy.sparse.csr_matrix`).
+
+        If `context` is `None`, it is still inferred from the type of `A`
+        and the availability of CuPy, e.g.:
+
+          * SciPy sparse or NumPy array → `"cpu"`,
+          * CuPy sparse or CuPy array → `"cupy"`,
+          * otherwise a `TypeError` is raised.
+
+    n_batches : int, optional
+        Controls the expected shape of the right-hand side (RHS) for GPU
+        solvers and hence whether solves are treated as single or batched:
+
+        * If ``n_batches == 0`` (default), the solver is configured for
+          single-RHS solves and expects a vector RHS of shape ``(n,)``.
+        * If ``n_batches > 0``, the solver is configured for batched solves
+          and expects a 2D RHS array of shape ``(n, n_batches)`` (i.e.
+          ``nrhs = n_batches``).
+
+        This argument is primarily used by CUDA-based solvers (e.g. cuDSS and
+        cached SuperLU) to preconfigure internal data structures for batched
+        solves. It has no effect for CPU-based solvers.
+
+    force_solver : {"scipySLU", "PyKLU", "cuDSS", "CachedSLU", "cupySLU"}, optional
+        If provided, forces the use of a specific backend instead of the
+        automatic selection:
+
+        * `"scipySLU"` : Use `scipy.sparse.linalg.splu` (CPU).
+        * `"PyKLU"`    : Use the `PyKLU.Klu` solver (CPU).
+        * `"cuDSS"`    : Use CUDA/cuDSS-based `DirectSolverSuperLU` (GPU).
+        * `"CachedSLU"`: Use CUDA cached SuperLU (`luLU`) (GPU).
+        * `"cupySLU"`  : Use `cupyx.scipy.sparse.linalg.splu` (GPU).
+
+        Using a solver that does not match the current `context` will result
+        in a `ValueError`.
+
+    solverKwargs : dict, optional
+        Extra keyword arguments forwarded to the underlying solver constructor.
+        If `None`, an empty dict is used.
+
+        Some backends make use of `permc_spec` (matrix permutation strategy).
+        When not explicitly provided and appropriate, this function sets
+        `permc_spec="MMD_AT_PLUS_A"` as a sensible default for the matrices that 
+        will typically be encountered in an xobjects workflow.
+
+    context : {"cpu", "cupy", "pyopencl"} or XContext, optional
+        Execution context. Can be either:
+
+        * A string:
+            - `"cpu"`: Use CPU-based solvers (SciPy / PyKLU).
+            - `"cupy"`: Use CuPy/CUDA-based solvers.
+            - `"pyopencl"`: PyOpenCL context (currently unsupported).
+        * A context object instance:
+            - `ContextCpu`
+            - `ContextCupy`
+            - `ContextPyopencl`
+
+        If `None`, the context is inferred from `A` as described above.
+
+    verbose : bool, optional
+        If `True`, prints debug messages describing the solver-selection
+        process, fallbacks, and the final solver that is returned.
+
+    Returns
+    -------
+    SuperLUlikeSolver
+        A factorized solver object compatible with SciPy’s `splu`-like
+        interface (i.e. typically exposing a `solve` method and related
+        accessors). The exact concrete type depends on the backend:
+          * CPU:
+              - `scipy.sparse.linalg.SuperLU` (for `"scipySLU"`),
+              - `PyKLU.Klu` (for `"PyKLU"`).
+          * CUDA/CuPy:
+              - `DirectSolverSuperLU` (cuDSS),
+              - `luLU` (cached SuperLU),
+              - `cupyx.scipy.sparse.linalg.SuperLU` (for `"cupySLU"`).
+
+    Raises
+    ------
+    TypeError
+        If the type of `A` is unsupported when inferring the context.
+
+    AssertionError
+        If `A` does not match the required type for the chosen context
+
+    ModuleNotFoundError
+        If a requested solver backend depends on a module that is not
+        installed (e.g. CuPy, PyKLU, cuDSS), and no fallback is available.
+
+    RuntimeError
+        If a requested GPU solver fails during initialization.
+
+    NotImplementedError
+        If `context` is `"pyopencl"` or `ContextPyopencl`, since no sparse
+        solver is currently implemented for that backend.
+
+    ValueError
+        If an invalid `context` string is provided, or if `force_solver`
+        does not match any known solver for the active context.
+
+    Notes
+    -----
+    - For best performance on CPU, PyKLU is preferred when available.
+    - For CuPy/CUDA, cuDSS is preferred when available, and this function
+      will automatically fall back to other solvers if cuDSS is not present
+      or fails at runtime.
+    - The returned solver is *factorized* and should be reused to solve
+      multiple right-hand sides efficiently.
+
+    Examples
+    --------
+    Factorize a SciPy sparse matrix on CPU with automatic solver selection:
+
+    >>> A = scipy.sparse.random(1000, 1000, density=0.01, format="csr")
+    >>> solver = factorized_sparse_solver(A)
+    >>> x = solver.solve(b)
+
+    Explicitly request the SciPy SuperLU solver on CPU:
+
+    >>> solver = factorized_sparse_solver(A, force_solver="scipySLU")
+
+    Using CuPy and cuDSS (requires CuPy and cuDSS bindings):
+
+    >>> A_gpu = cupyx.scipy.sparse.csr_matrix(A)
+    >>> solver = factorized_sparse_solver(A_gpu, context="cupy")
+    >>> x_gpu = solver.solve(b_gpu)
+    """
     if solverKwargs is None:
         solverKwargs = {}
     if context is None:
+        dbugprint(verbose, "No context provided. " \
+                           "Context will be inferred from matrix")
         if isinstance(A, scipy.sparse.spmatrix) or isinstance(A, nparray):
             context = 'cpu'
         elif (_cupy_available and 
@@ -48,17 +223,28 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
         assert isinstance(A, scipy.sparse.spmatrix), (
             "When using CPU context A must be a scipy.sparse matrix"
             )
+        A = A.tocsc() # CPU Solvers require csc format
         if 'permc_spec' not in solverKwargs:
             solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
-        if force_solver is None or force_solver == "scipySLU":
-            if A.shape[0]*n_batches < 10**5 and force_solver is None:
-                import warnings
-                warnings.warn("For small matrices, using PyKLU " 
-                              "can provide improved performance")
-            solver = scipy.sparse.linalg.splu(A.tocsc(),**solverKwargs)
+        if force_solver is None:
+            dbugprint(verbose, "No solver requested. " \
+                               "Picking best solver for CPU Context")
+            try:
+                dbugprint(verbose, "Attempting to use PyKLU")
+                import PyKLU
+                solver = PyKLU.Klu(A)
+                dbugprint(verbose, "PyKLU succeeded")
+            except (ModuleNotFoundError, RuntimeError) as e:
+                dbugprint(verbose, "PyKLU failed. " \
+                                   "Falling back to scipy.splu \n"
+                                  f"Encountered error: {e}")
+                
+                solver = scipy.sparse.linalg.splu(A,**solverKwargs)
+        elif force_solver == "scipySLU":
+            solver = scipy.sparse.linalg.splu(A,**solverKwargs)
         elif force_solver == "PyKLU":
             import PyKLU
-            solver = PyKLU.Klu(A.tocsc())
+            solver = PyKLU.Klu(A)
         else:
             raise ValueError("Unrecognized CPU Sparse solver. Available options: "
                              "scipySLU, PyKLU")
@@ -67,36 +253,44 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
         if not _cupy_available:
             raise ModuleNotFoundError("No cupy module found. " \
                                       "ContextCupy unavailable")
-        assert isinstance(A ,cupyx.scipy.sparse.csr_matrix), (
+        assert isinstance(A ,cupyx.scipy.sparse.spmatrix), (
                 "When using ContextCupy, input must be "
-                "cupyx.scipy.sparse.csr_matrix")
-
+                "cupyx.scipy.sparse matrix")
+        
+        A = A.tocsr() # GPU solvers require csr format
         if force_solver is not None and force_solver != "cuDSS":
             if 'permc_spec' not in solverKwargs:
                 solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
         if force_solver is None:
+            dbugprint(verbose, "No solver requested. " \
+                               "Picking best solver for Cupy Context")
             import warnings
             try:
+                dbugprint(verbose, "Attempting to use cuDSS Solver")
                 from .solvers.CUDA._cuDSSLU import DirectSolverSuperLU
                 solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
+                dbugprint(verbose, "cuDSS succeeded")
             except (ModuleNotFoundError, RuntimeError) as e:
-                warnings.warn("cuDSS not available. " 
-                              "Falling back to Cached-SuperLU (spsm) "
-                              f"Encountered Error: {e}")
+                dbugprint(verbose, "cuDSS failed. \n"
+                                    f"Encountered Error: {e}")
+                warnings.warn("cuDSS not available. Performance will be degraded")                               
                 if 'permc_spec' not in solverKwargs:
                     solverKwargs = solverKwargs | {"permc_spec":"MMD_AT_PLUS_A"}
-                try:
-                    if cusparse.check_availability('csrsm2'):
-                        raise RuntimeError("csrsm2 is avaiable. "
-                                           "cupy SuperLU performs better "
-                                           "than Cached-SuperLU (spsm)")
-                    from .solvers.CUDA._luLU import luLU
-                    solver = luLU(A, n_batches = n_batches, **solverKwargs)
-                except RuntimeError as e:
-                    warnings.warn("Cached-SuperLU (spsm) solver failed. " 
-                                  "Falling back to cupy SuperLU. "
-                                  f"Error encountered: {e}")
+                if cusparse.check_availability('csrsm2'):
+                    dbugprint(verbose, "csrsm2 available. Using cupyx.splu solver")
                     solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)
+                else:
+                    try:
+                        dbugprint(verbose, "csrms2 unavailable. " \
+                                           "Attempting to use CachedSuperLU (spsm)")
+                        from .solvers.CUDA._luLU import luLU
+                        solver = luLU(A, n_batches = n_batches, **solverKwargs)
+                        dbugprint(verbose, "CachedSuperLU succeeded")
+                    except RuntimeError as e:
+                        dbugprint(verbose, "CachedSuperLU failed. \n"
+                                          f"Encountered error: {e} \n"
+                                           "Falling back to cupyx.splu with spsm")
+                        solver = cupyx.scipy.sparse.linalg.splu(A, **solverKwargs)
         elif force_solver == "cuDSS":
             from .solvers.CUDA._cuDSSLU import DirectSolverSuperLU
             solver = DirectSolverSuperLU(A, n_batches = n_batches, **solverKwargs)
@@ -114,4 +308,9 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
     else:
         raise ValueError("Invalid context. Available contexts are: " \
                          "cpu, cupy, pyopencl")
-    return solver
\ No newline at end of file
+    dbugprint(verbose, "Returning solver: " + str(solver))
+    return solver
+
+def dbugprint(verbose: bool, text: str):
+    if verbose:
+        _print("[xo.sparse] "+text)
\ No newline at end of file

From 79bc3499d44f173d87394a36aec55f335b187bf8 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 11:58:42 +0100
Subject: [PATCH 21/28] Update solver interface so that it is consistent
 accross platforms

---
 xobjects/sparse/solvers/CUDA/__init__.py | 26 ++++++++++++++++--------
 xobjects/sparse/solvers/__init__.py      |  8 ++------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/xobjects/sparse/solvers/CUDA/__init__.py b/xobjects/sparse/solvers/CUDA/__init__.py
index d1d8789..c8d6a4d 100644
--- a/xobjects/sparse/solvers/CUDA/__init__.py
+++ b/xobjects/sparse/solvers/CUDA/__init__.py
@@ -1,16 +1,24 @@
-__all__ = []
+from ....context import ModuleNotAvailableError
 try:
     from ._cuDSSLU import DirectSolverSuperLU as cuDSSSuperLU
-    __all__.append("cuDSSSuperLU")
-except (ModuleNotFoundError,ImportError):
-    pass
+except (ModuleNotFoundError,ImportError) as e:
+    def cuDSSSuperLU(*args, **kwargs):
+        raise ModuleNotAvailableError(
+            "cuDSSSuperLU is not available. Could not import required backend."
+            ) from e
 try:
     from ._luLU import luLU as CachedSuperLU
-    __all__.append("CachedSuperLU")
+except (ModuleNotFoundError,ImportError):
+    def CachedSuperLU(*args, **kwargs):
+        raise ModuleNotAvailableError(
+            "CachedSuperLU is not available. Could not import required backend."
+            ) from e
+try:
     from cupyx.scipy.sparse.linalg import splu as CupySuperLU
-    __all__.append("CupySuperLU")
 except (ModuleNotFoundError,ImportError):
-    pass
+    def CupySuperLU(*args, **kwargs):
+        raise ModuleNotAvailableError(
+            "CupySuperLU is not available. Could not import required backend."
+            ) from e
 
-if not __all__:
-    raise ImportError("Failed to import any CUDA-based sparse solver")
\ No newline at end of file
+__all__ = ["cuDSSSuperLU", "CachedSuperLU", "CupySuperLU"]
\ No newline at end of file
diff --git a/xobjects/sparse/solvers/__init__.py b/xobjects/sparse/solvers/__init__.py
index 6804061..3721b50 100644
--- a/xobjects/sparse/solvers/__init__.py
+++ b/xobjects/sparse/solvers/__init__.py
@@ -1,7 +1,3 @@
 from . import CPU
-__all__ = ["CPU"]
-try:
-    from . import CUDA
-    __all__.append("CUDA")
-except ImportError:
-    pass
+from . import CUDA
+__all__ = ["CPU", "CUDA"]

From 9b2d052c9dc4b2d9b16e1237b78f98d2a46e88d8 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 12:12:56 +0100
Subject: [PATCH 22/28] Update import logic and error raising

---
 xobjects/sparse/solvers/CPU/__init__.py  | 13 +++++++++----
 xobjects/sparse/solvers/CUDA/__init__.py | 16 ++++++++--------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/xobjects/sparse/solvers/CPU/__init__.py b/xobjects/sparse/solvers/CPU/__init__.py
index e6b243d..a5f6275 100644
--- a/xobjects/sparse/solvers/CPU/__init__.py
+++ b/xobjects/sparse/solvers/CPU/__init__.py
@@ -1,7 +1,12 @@
 from scipy.sparse.linalg import splu as scipySuperLU
-__all__ = ["scipySuperLU"]
+from ....context import ModuleNotAvailableError
+
 try:
     from PyKLU import Klu as KLUSuperLU
-    __all__.append("KLUSuperLU")
-except ImportError:
-    pass
\ No newline at end of file
+except (ModuleNotFoundError,ImportError) as e:
+    def KLUSuperLU(*args, _import_err=e, **kwargs):
+        raise ModuleNotAvailableError(
+            "KLUSuperLU is not available. Could not import required backend."
+            ) from _import_err
+
+__all__ = ["scipySuperLU", "KLUSuperLU"]
\ No newline at end of file
diff --git a/xobjects/sparse/solvers/CUDA/__init__.py b/xobjects/sparse/solvers/CUDA/__init__.py
index c8d6a4d..f969a27 100644
--- a/xobjects/sparse/solvers/CUDA/__init__.py
+++ b/xobjects/sparse/solvers/CUDA/__init__.py
@@ -2,23 +2,23 @@
 try:
     from ._cuDSSLU import DirectSolverSuperLU as cuDSSSuperLU
 except (ModuleNotFoundError,ImportError) as e:
-    def cuDSSSuperLU(*args, **kwargs):
+    def cuDSSSuperLU(*args, _import_err=e, **kwargs):
         raise ModuleNotAvailableError(
             "cuDSSSuperLU is not available. Could not import required backend."
-            ) from e
+            ) from _import_err
 try:
     from ._luLU import luLU as CachedSuperLU
-except (ModuleNotFoundError,ImportError):
-    def CachedSuperLU(*args, **kwargs):
+except (ModuleNotFoundError,ImportError) as e:
+    def CachedSuperLU(*args, _import_err=e, **kwargs):
         raise ModuleNotAvailableError(
             "CachedSuperLU is not available. Could not import required backend."
-            ) from e
+            ) from _import_err
 try:
     from cupyx.scipy.sparse.linalg import splu as CupySuperLU
-except (ModuleNotFoundError,ImportError):
-    def CupySuperLU(*args, **kwargs):
+except (ModuleNotFoundError,ImportError) as e:
+    def CupySuperLU(*args, _import_err=e, **kwargs):
         raise ModuleNotAvailableError(
             "CupySuperLU is not available. Could not import required backend."
-            ) from e
+            ) from _import_err
 
 __all__ = ["cuDSSSuperLU", "CachedSuperLU", "CupySuperLU"]
\ No newline at end of file

From c225bd846175af63fa9927e0a9446e976bdca82e Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 12:50:52 +0100
Subject: [PATCH 23/28] Update documentation and naming

---
 xobjects/sparse/_sparse.py               | 16 ++++++++++++++--
 xobjects/sparse/solvers/CPU/__init__.py  | 13 ++++++-------
 xobjects/sparse/solvers/CUDA/__init__.py | 24 +++++++++++++-----------
 3 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/xobjects/sparse/_sparse.py b/xobjects/sparse/_sparse.py
index dc68340..0d7301c 100644
--- a/xobjects/sparse/_sparse.py
+++ b/xobjects/sparse/_sparse.py
@@ -109,7 +109,7 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
         * `"scipySLU"` : Use `scipy.sparse.linalg.splu` (CPU).
         * `"PyKLU"`    : Use the `PyKLU.Klu` solver (CPU).
         * `"cuDSS"`    : Use CUDA/cuDSS-based `DirectSolverSuperLU` (GPU).
-        * `"CachedSLU"`: Use CUDA cached SuperLU (`luLU`) (GPU).
+        * `"CachedSLU"`: Use CUDA cached spsm SuperLU (`luLU`) (GPU).
         * `"cupySLU"`  : Use `cupyx.scipy.sparse.linalg.splu` (GPU).
 
         Using a solver that does not match the current `context` will result
@@ -153,8 +153,20 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
               - `PyKLU.Klu` (for `"PyKLU"`).
           * CUDA/CuPy:
               - `DirectSolverSuperLU` (cuDSS),
-              - `luLU` (cached SuperLU),
+              - `luLU` (cached spsm SuperLU),
               - `cupyx.scipy.sparse.linalg.SuperLU` (for `"cupySLU"`).
+        
+        All returned solver objects implement a `solve(b)` method.
+        For **optimal performance across all backends**, the right-hand
+        side `b` should be passed as a **Fortran-contiguous (column-major)**
+        array:
+
+        * For a single RHS: shape ``(n,)`` or ``(n, 1)`` (Fortran contiguous).
+        * For multiple RHSs: shape ``(n, nrhs)`` with **Fortran layout**.
+
+        If `b` is not Fortran-contiguous, the solver will internally copy or
+        transform it, which can incur extra overhead—especially on CUDA/GPU
+        backends and in batched-solve scenarios.
 
     Raises
     ------
diff --git a/xobjects/sparse/solvers/CPU/__init__.py b/xobjects/sparse/solvers/CPU/__init__.py
index a5f6275..07b2c79 100644
--- a/xobjects/sparse/solvers/CPU/__init__.py
+++ b/xobjects/sparse/solvers/CPU/__init__.py
@@ -1,12 +1,11 @@
-from scipy.sparse.linalg import splu as scipySuperLU
-from ....context import ModuleNotAvailableError
-
+from scipy.sparse.linalg import splu as scipysplu
 try:
-    from PyKLU import Klu as KLUSuperLU
+    from PyKLU import Klu as KLU
 except (ModuleNotFoundError,ImportError) as e:
-    def KLUSuperLU(*args, _import_err=e, **kwargs):
+    def KLU(*args, _import_err=e, **kwargs):
+        from ....context import ModuleNotAvailableError
         raise ModuleNotAvailableError(
-            "KLUSuperLU is not available. Could not import required backend."
+            "KLU is not available. Could not import required backend."
             ) from _import_err
 
-__all__ = ["scipySuperLU", "KLUSuperLU"]
\ No newline at end of file
+__all__ = ["scipysplu", "KLU"]
\ No newline at end of file
diff --git a/xobjects/sparse/solvers/CUDA/__init__.py b/xobjects/sparse/solvers/CUDA/__init__.py
index f969a27..536fe49 100644
--- a/xobjects/sparse/solvers/CUDA/__init__.py
+++ b/xobjects/sparse/solvers/CUDA/__init__.py
@@ -1,24 +1,26 @@
-from ....context import ModuleNotAvailableError
 try:
-    from ._cuDSSLU import DirectSolverSuperLU as cuDSSSuperLU
+    from ._cuDSSLU import DirectSolverSuperLU as cuDSS
 except (ModuleNotFoundError,ImportError) as e:
-    def cuDSSSuperLU(*args, _import_err=e, **kwargs):
+    def cuDSS(*args, _import_err=e, **kwargs):
+        from ....context import ModuleNotAvailableError
         raise ModuleNotAvailableError(
-            "cuDSSSuperLU is not available. Could not import required backend."
+            "cuDSS is not available. Could not import required backend."
             ) from _import_err
 try:
-    from ._luLU import luLU as CachedSuperLU
+    from ._luLU import luLU as cachedSpSM
 except (ModuleNotFoundError,ImportError) as e:
-    def CachedSuperLU(*args, _import_err=e, **kwargs):
+    def cachedSpSM(*args, _import_err=e, **kwargs):
+        from ....context import ModuleNotAvailableError
         raise ModuleNotAvailableError(
-            "CachedSuperLU is not available. Could not import required backend."
+            "cachedSpSM is not available. Could not import required backend."
             ) from _import_err
 try:
-    from cupyx.scipy.sparse.linalg import splu as CupySuperLU
+    from cupyx.scipy.sparse.linalg import splu as cupysplu
 except (ModuleNotFoundError,ImportError) as e:
-    def CupySuperLU(*args, _import_err=e, **kwargs):
+    def cupysplu(*args, _import_err=e, **kwargs):
+        from ....context import ModuleNotAvailableError
         raise ModuleNotAvailableError(
-            "CupySuperLU is not available. Could not import required backend."
+            "cupysplu is not available. Could not import required backend."
             ) from _import_err
 
-__all__ = ["cuDSSSuperLU", "CachedSuperLU", "CupySuperLU"]
\ No newline at end of file
+__all__ = ["cuDSS", "cachedSpSM", "cupysplu"]
\ No newline at end of file

From 031098195c67f9c60e6f9c0fe96010666641437f Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 13:06:51 +0100
Subject: [PATCH 24/28] Add example for sparse solvers

---
 examples/solve_sparse_system.py | 74 +++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 examples/solve_sparse_system.py

diff --git a/examples/solve_sparse_system.py b/examples/solve_sparse_system.py
new file mode 100644
index 0000000..4e02673
--- /dev/null
+++ b/examples/solve_sparse_system.py
@@ -0,0 +1,74 @@
+import xobjects as xo
+import scipy.sparse as sp
+import numpy as np
+from xobjects.sparse._test_helpers import rel_residual
+
+'''
+The goal of this example is to provide a short user guide for the xo.sparse module.
+
+The xo.sparse module can be used to solve sparse linear systems of equations:
+                                    A*x = b
+where A is a sparse matrix.
+
+Currently this module only supports CPU and Cupy contexts. This module contains a 
+variety of solvers for different contexts, with consistent APIs. The intended use
+is to reuse the same LHS for many solves, so the solvers work as follows:
+
+solver(A) # Performs decomposition/factorization
+solver.solve(b) # Solves Ax = b using precomputed factors
+
+For optimal performance accross backends b should be a column-major (F Contiguous)
+array or vector.
+
+The intended interface for this module is:
+
+xo.sparse.factorized_sparse_solver()
+
+The function includes detailed documentation for usage, but in short, it returns
+the best performing solver based on the context and available modules. If the 
+context is not explicitly defined, it is inferred based on the input matrix.
+
+This is how modules that build upon this functionality within xsuite should interact
+with the xo.sparse module, so that cross-platform compatibility is guaranteed.
+
+For development and convenience purposes xo.sparse provides the:
+xo.sparse.solvers module
+
+which provides the following options:
+xo.sparse.solvers.CPU.
+    - scipysplu : Alias for scipy SuperLU 
+    - KLUSuperLU : Alias for PyKLU
+xo.sparse.solvers.CPU.
+    - cuDSS : nvmath.sparse.advanced.DirectSolver Wrapper with a SuperLU-like interface
+    - cachedSpSM : Rewrite of cupy's SuperLU to cache the SpSM analysis step
+                   offering massive speedups compared to cupy splu when the only
+                   available backend is SpSM
+    - cupysplu : Alias for scipy SuperLU 
+'''
+
+# Example: solve small matrix system
+n = 5
+# Create matrix
+main = 2.0 + np.abs(np.random.standard_normal(n))
+lower = np.random.standard_normal(n-1)
+upper = np.random.standard_normal(n-1)
+A = sp.diags(
+    diagonals=[lower, main, upper],
+    offsets=[-1, 0, 1],
+    format="csc"
+)
+
+# Create solver:
+print("Solver selection process: ")
+solver = xo.sparse.factorized_sparse_solver(A, verbose = True)
+
+# Generate random vector to solve:
+b = np.random.standard_normal(n)
+
+# Solve system
+x = solver.solve(b)
+
+# Calculate relative residual to assess solver:
+res = rel_residual(A,x,b)
+print("Relative residual of solution ", res)
+print("Residual should be small (<10^-12)")

From af1f74921b8d49d6ae483677f04fb94b898bc7df Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 13:38:38 +0100
Subject: [PATCH 25/28] Fix file formatting

---
 examples/solve_sparse_system.py             | 9 +++++++--
 tests/test_sparse_solvers.py                | 5 +++++
 xobjects/sparse/_sparse.py                  | 5 +++++
 xobjects/sparse/_test_helpers.py            | 5 +++++
 xobjects/sparse/solvers/CUDA/_cuDSSLU.py    | 5 +++++
 xobjects/sparse/solvers/CUDA/_luLU.py       | 5 +++++
 xobjects/sparse/solvers/_abstract_solver.py | 5 +++++
 7 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/examples/solve_sparse_system.py b/examples/solve_sparse_system.py
index 4e02673..c9a17cb 100644
--- a/examples/solve_sparse_system.py
+++ b/examples/solve_sparse_system.py
@@ -1,3 +1,8 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2021.                   #
+# ########################################### #
+
 import xobjects as xo
 import scipy.sparse as sp
 import numpy as np
@@ -17,7 +22,7 @@
 solver(A) # Performs decomposition/factorization
 solver.solve(b) # Solves Ax = b using precomputed factors
 
-For optimal performance accross backends b should be a column-major (F Contiguous)
+For optimal performance across backends b should be a column-major (F Contiguous)
 array or vector.
 
 The intended interface for this module is:
@@ -34,7 +39,7 @@
 For development and convenience purposes xo.sparse provides the:
 xo.sparse.solvers module
 
-which provides the following options:
+which provides the following:
 xo.sparse.solvers.CPU.
     - scipysplu : Alias for scipy SuperLU 
     - KLUSuperLU : Alias for PyKLU
diff --git a/tests/test_sparse_solvers.py b/tests/test_sparse_solvers.py
index e490335..8fd77e8 100644
--- a/tests/test_sparse_solvers.py
+++ b/tests/test_sparse_solvers.py
@@ -1,3 +1,8 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2021.                   #
+# ########################################### #
+
 import numpy as np
 import scipy.sparse as sp
 import xobjects as xo
diff --git a/xobjects/sparse/_sparse.py b/xobjects/sparse/_sparse.py
index 0d7301c..9091970 100644
--- a/xobjects/sparse/_sparse.py
+++ b/xobjects/sparse/_sparse.py
@@ -1,3 +1,8 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2021.                   #
+# ########################################### #
+
 import scipy.sparse
 from numpy import ndarray as nparray
 from typing import Optional, Literal, Union
diff --git a/xobjects/sparse/_test_helpers.py b/xobjects/sparse/_test_helpers.py
index 313aa39..6b06875 100644
--- a/xobjects/sparse/_test_helpers.py
+++ b/xobjects/sparse/_test_helpers.py
@@ -1,3 +1,8 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2021.                   #
+# ########################################### #
+
 import numpy.linalg as npl
 import scipy.sparse.linalg as scspl
 from scipy.sparse import issparse
diff --git a/xobjects/sparse/solvers/CUDA/_cuDSSLU.py b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
index 3ca7bfa..31ded5e 100644
--- a/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
+++ b/xobjects/sparse/solvers/CUDA/_cuDSSLU.py
@@ -1,3 +1,8 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2021.                   #
+# ########################################### #
+
 import cupy as cp
 import cupyx.scipy.sparse as sp
 import nvmath
diff --git a/xobjects/sparse/solvers/CUDA/_luLU.py b/xobjects/sparse/solvers/CUDA/_luLU.py
index 4089ace..25665ec 100644
--- a/xobjects/sparse/solvers/CUDA/_luLU.py
+++ b/xobjects/sparse/solvers/CUDA/_luLU.py
@@ -1,3 +1,8 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2021.                   #
+# ########################################### #
+
 import cupy as _cupy
 import numpy as _numpy
 import cupyx.scipy.sparse
diff --git a/xobjects/sparse/solvers/_abstract_solver.py b/xobjects/sparse/solvers/_abstract_solver.py
index 8a6022c..9acfe10 100644
--- a/xobjects/sparse/solvers/_abstract_solver.py
+++ b/xobjects/sparse/solvers/_abstract_solver.py
@@ -1,3 +1,8 @@
+# copyright ################################# #
+# This file is part of the Xobjects Package.  #
+# Copyright (c) CERN, 2021.                   #
+# ########################################### #
+
 from abc import ABC, abstractmethod
 
 class SuperLUlikeSolver(ABC):

From 9f392e7887ea797b396d10f750007367b9e61887 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Tue, 2 Dec 2025 14:17:24 +0100
Subject: [PATCH 26/28] Refactor test functions

---
 tests/test_sparse_solvers.py     | 56 +++++++++++++++++++++++------
 xobjects/sparse/__init__.py      |  4 +--
 xobjects/sparse/_sparse.py       | 14 +++++++-
 xobjects/sparse/_test_helpers.py | 61 --------------------------------
 4 files changed, 60 insertions(+), 75 deletions(-)
 delete mode 100644 xobjects/sparse/_test_helpers.py

diff --git a/tests/test_sparse_solvers.py b/tests/test_sparse_solvers.py
index 8fd77e8..eb43633 100644
--- a/tests/test_sparse_solvers.py
+++ b/tests/test_sparse_solvers.py
@@ -7,7 +7,6 @@
 import scipy.sparse as sp
 import xobjects as xo
 from xobjects.test_helpers import fix_random_seed
-from xobjects.sparse import _test_helpers as sptest
 from xobjects.context import ModuleNotAvailableError
 import warnings
 import pytest
@@ -36,6 +35,41 @@
 but if testing larger systems, could potentially be omitted.
 '''
 
+# ---- Helper functions ----
+def issymmetric(A, tol=0):
+    if A.shape[0] != A.shape[1]:
+        return False
+    diff = A - A.T
+    if tol == 0:
+        return diff.nnz == 0
+    else:
+        # tolerance-based check
+        return abs(diff).max() <= tol
+
+
+def assert_residual_ok(res_ref, res_solver,
+                       abs_tol=1e-12,
+                       factor=10):
+    """
+    Check that our solver's residual is both:
+      - absolutely small enough (abs_tol),
+      - not catastrophically worse than the reference (factor * res_ref).
+    """
+    # sanity: reference solver itself should be good
+    assert res_ref < abs_tol, f"Reference residual too large: {res_ref}"
+
+    # absolute bound
+    assert res_solver < abs_tol, (
+        f"Residual {res_solver} exceeds absolute tolerance {abs_tol}"
+    )
+
+    # relative bound vs reference
+    assert res_solver <= factor * res_ref, (
+        f"Residual {res_solver} not within factor {factor} of "
+        f"reference residual {res_ref}"
+    )
+
+# ---- Tests ----
 cpu_tests = [
         ("scipySLU",  xo.ContextCpu()),
         ("PyKLU",     xo.ContextCpu()),
@@ -118,9 +152,9 @@ def make_tridiagonal_system(n, nbatches):
 @pytest.mark.parametrize("sparse_system", [random_system, tridiag_system])
 def test_vector_solve(test_solver, test_context, sparse_system):
     A_sp, b_sp, x_sp, _ = sparse_system
-    assert not sptest.issymmetric(A_sp)
+    assert not issymmetric(A_sp)
 
-    scipy_residual = sptest.rel_residual(A_sp,x_sp,b_sp)
+    scipy_residual = xo.sparse.rel_residual(A_sp,x_sp,b_sp)
     
     if "Cpu" in str(test_context):
         A = test_context.splike_lib.sparse.csc_matrix(A_sp)
@@ -134,9 +168,9 @@ def test_vector_solve(test_solver, test_context, sparse_system):
                                                 )
     x = solver.solve(b)
 
-    solver_residual = sptest.rel_residual(A,x,b)
-    sptest.assert_residual_ok(scipy_residual,solver_residual, 
-                              abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR)
+    solver_residual = xo.sparse.rel_residual(A,x,b)
+    assert_residual_ok(scipy_residual,solver_residual, 
+                        abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR)
 
 random_system = make_random_sparse_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES)
 tridiag_system = make_tridiagonal_system(SPARSE_SYSTEM_SIZE, NUM_BATCHES)
@@ -145,8 +179,8 @@ def test_vector_solve(test_solver, test_context, sparse_system):
 @pytest.mark.parametrize("sparse_system", [random_system, tridiag_system])
 def test_batched_solve(test_solver, test_context, sparse_system):
     A_sp, b_sp, x_sp, _ = sparse_system
-    assert not sptest.issymmetric(A_sp)
-    scipy_residual = sptest.rel_residual(A_sp,x_sp,b_sp)
+    assert not issymmetric(A_sp)
+    scipy_residual = xo.sparse.rel_residual(A_sp,x_sp,b_sp)
     if "Cpu" in str(test_context):
         A = test_context.splike_lib.sparse.csc_matrix(A_sp)
     if "Cupy" in str(test_context):
@@ -160,6 +194,6 @@ def test_batched_solve(test_solver, test_context, sparse_system):
                                                 )
     x = solver.solve(b)
 
-    solver_residual = sptest.rel_residual(A,x,b)
-    sptest.assert_residual_ok(scipy_residual,solver_residual, 
-                              abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR)
\ No newline at end of file
+    solver_residual = xo.sparse.rel_residual(A,x,b)
+    assert_residual_ok(scipy_residual,solver_residual, 
+                        abs_tol = ABS_TOL, factor = TOLERANCE_FACTOR)
\ No newline at end of file
diff --git a/xobjects/sparse/__init__.py b/xobjects/sparse/__init__.py
index 2fa42ef..9a4cca2 100644
--- a/xobjects/sparse/__init__.py
+++ b/xobjects/sparse/__init__.py
@@ -1,3 +1,3 @@
-from ._sparse import factorized_sparse_solver
+from ._sparse import factorized_sparse_solver, rel_residual
 from . import solvers
-__all__ = ["factorized_sparse_solver","solvers"]
+__all__ = ["factorized_sparse_solver","solvers", "rel_residual"]
diff --git a/xobjects/sparse/_sparse.py b/xobjects/sparse/_sparse.py
index 9091970..80f80f6 100644
--- a/xobjects/sparse/_sparse.py
+++ b/xobjects/sparse/_sparse.py
@@ -4,6 +4,7 @@
 # ########################################### #
 
 import scipy.sparse
+import numpy.linalg as npl
 from numpy import ndarray as nparray
 from typing import Optional, Literal, Union
 from ..context import XContext
@@ -330,4 +331,15 @@ def factorized_sparse_solver(A: Union[scipy.sparse.csr_matrix,
 
 def dbugprint(verbose: bool, text: str):
     if verbose:
-        _print("[xo.sparse] "+text)
\ No newline at end of file
+        _print("[xo.sparse] "+text)
+
+def rel_residual(A,x,b):
+    if hasattr(A, "get"):
+        A = A.get()
+    if hasattr(x, "get"):
+        x = x.get()
+    if hasattr(b, "get"):
+        b = b.get()
+    assert scipy.sparse.issparse(A), ("A must be a sparse matrix")
+
+    return npl.norm(A@x - b) / (npl.norm(b))
\ No newline at end of file
diff --git a/xobjects/sparse/_test_helpers.py b/xobjects/sparse/_test_helpers.py
deleted file mode 100644
index 6b06875..0000000
--- a/xobjects/sparse/_test_helpers.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# copyright ################################# #
-# This file is part of the Xobjects Package.  #
-# Copyright (c) CERN, 2021.                   #
-# ########################################### #
-
-import numpy.linalg as npl
-import scipy.sparse.linalg as scspl
-from scipy.sparse import issparse
-
-def issymmetric(A, tol=0):
-    if A.shape[0] != A.shape[1]:
-        return False
-    diff = A - A.T
-    if tol == 0:
-        return diff.nnz == 0
-    else:
-        # tolerance-based check
-        return abs(diff).max() <= tol
-
-def rel_residual(A,x,b):
-    if hasattr(A, "get"):
-        A = A.get()
-    if hasattr(x, "get"):
-        x = x.get()
-    if hasattr(b, "get"):
-        b = b.get()
-    assert issparse(A), ("A must be a sparse matrix")
-
-    return npl.norm(A@x - b) / (npl.norm(b))
-
-def assert_residual_ok(res_ref, res_solver,
-                       abs_tol=1e-12,
-                       factor=10):
-    """
-    Check that our solver's residual is both:
-      - absolutely small enough (abs_tol),
-      - not catastrophically worse than the reference (factor * res_ref).
-    """
-    # sanity: reference solver itself should be good
-    assert res_ref < abs_tol, f"Reference residual too large: {res_ref}"
-
-    # absolute bound
-    assert res_solver < abs_tol, (
-        f"Residual {res_solver} exceeds absolute tolerance {abs_tol}"
-    )
-
-    # relative bound vs reference
-    assert res_solver <= factor * res_ref, (
-        f"Residual {res_solver} not within factor {factor} of "
-        f"reference residual {res_ref}"
-    )
-
-# ---- Unused rn, but could be useful for tests ----
-
-def assert_close_to_precision(value, precision):
-    assert value <= precision, (f"Value {value} not within precision {precision}")
-
-def assert_residual_close(reference_residual, residual, tolerance = 10):
-    assert residual <= tolerance*reference_residual, (
-        f"Residual {residual} not within tolerance "
-        f"O({tolerance}) of reference residual {reference_residual}")
\ No newline at end of file

From 8727a3e73bae8b25bc8c64bdaf565d48cdcdddae Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Thu, 4 Dec 2025 13:02:55 +0100
Subject: [PATCH 27/28] Add optional dependencies for sparse in pyproject.toml

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index d64975b..f0bd6ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,8 @@ Documentation = "https://xsuite.readthedocs.io/"
 
 [project.optional-dependencies]
 tests = ["pytest", "pytest-mock"]
+sparse = ["pyklu"]
+sparseGPU = ["nvmath-python"]
 
 [tool.setuptools.packages.find]
 where = ["."]

From 11813578a2851f1f579bf88e4461f873a3c1dc07 Mon Sep 17 00:00:00 2001
From: Evangelos Katralis <vkatralis@gmail.com>
Date: Fri, 5 Dec 2025 14:47:37 +0100
Subject: [PATCH 28/28] Fix example

---
 examples/solve_sparse_system.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/solve_sparse_system.py b/examples/solve_sparse_system.py
index c9a17cb..57c346a 100644
--- a/examples/solve_sparse_system.py
+++ b/examples/solve_sparse_system.py
@@ -6,7 +6,7 @@
 import xobjects as xo
 import scipy.sparse as sp
 import numpy as np
-from xobjects.sparse._test_helpers import rel_residual
+from xobjects.sparse import rel_residual
 
 '''
 The goal of this example is to provide a short user guide for the xo.sparse module.