initial commit
This commit is contained in:
390
mne/cuda.py
Normal file
390
mne/cuda.py
Normal file
@@ -0,0 +1,390 @@
|
||||
# Authors: The MNE-Python contributors.
|
||||
# License: BSD-3-Clause
|
||||
# Copyright the MNE-Python contributors.
|
||||
|
||||
import numpy as np
|
||||
from scipy.fft import irfft, rfft
|
||||
|
||||
from .utils import (
|
||||
_check_option,
|
||||
_explain_exception,
|
||||
fill_doc,
|
||||
get_config,
|
||||
logger,
|
||||
sizeof_fmt,
|
||||
verbose,
|
||||
warn,
|
||||
)
|
||||
|
||||
# Module-level flag: set to True by init_cuda() once cupy imports cleanly and
# a CUDA device has been initialized. Read by the _setup_cuda_* helpers below.
_cuda_capable = False
|
||||
|
||||
|
||||
def get_cuda_memory(kind="available"):
    """Get the amount of free memory for CUDA operations.

    Parameters
    ----------
    kind : str
        Can be ``"available"`` or ``"total"``.

    Returns
    -------
    memory : str
        The amount of available or total memory as a human-readable string.
    """
    # Validate up front so a bad ``kind`` fails with an informative error
    # instead of a bare KeyError inside the cupy branch below.
    _check_option("kind", kind, ("available", "total"))
    if not _cuda_capable:
        warn("CUDA not enabled, returning zero for memory")
        mem = 0
    else:
        import cupy

        # memGetInfo() returns (free_bytes, total_bytes), hence the mapping.
        mem = cupy.cuda.runtime.memGetInfo()[dict(available=0, total=1)[kind]]
    return sizeof_fmt(mem)
|
||||
|
||||
|
||||
@verbose
def init_cuda(ignore_config=False, verbose=None):
    """Initialize CUDA functionality.

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    the first CUDA setup is performed. If this variable is not
    set, this function can be manually executed.

    Parameters
    ----------
    ignore_config : bool
        If True, ignore the config value MNE_USE_CUDA and force init.
    %(verbose)s
    """
    global _cuda_capable
    if _cuda_capable:
        return
    if not ignore_config and (get_config("MNE_USE_CUDA", "false").lower() != "true"):
        logger.info("CUDA not enabled in config, skipping initialization")
        return
    # Triage possible errors for informative messaging
    _cuda_capable = False
    try:
        import cupy  # noqa
    except ImportError:
        warn("module cupy not found, CUDA not enabled")
        return
    device_id = int(get_config("MNE_CUDA_DEVICE", "0"))
    try:
        # Initialize CUDA
        _set_cuda_device(device_id, verbose)
    except Exception:
        # Fixed message: previously read "so CUDA device could be
        # initialized", a typo for "No CUDA device ...".
        warn(
            "No CUDA device could be initialized, likely a hardware error, "
            f"CUDA not enabled{_explain_exception()}"
        )
        return

    _cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info(f"Enabling CUDA with {get_cuda_memory()} available memory")
|
||||
|
||||
|
||||
@verbose
def set_cuda_device(device_id, verbose=None):
    """Set the CUDA device temporarily for the current session.

    Parameters
    ----------
    device_id : int
        Numeric ID of the CUDA-capable device you want MNE-Python to use.
    %(verbose)s
    """
    # Fast path: CUDA is already up, just switch devices.
    if _cuda_capable:
        _set_cuda_device(device_id, verbose)
        return
    # Not initialized yet but enabled in the config: bring CUDA up first.
    if get_config("MNE_USE_CUDA", "false").lower() == "true":
        init_cuda()
        _set_cuda_device(device_id, verbose)
        return
    # Neither initialized nor configured: tell the user what to do.
    warn(
        "Could not set CUDA device because CUDA is not enabled; either "
        "run mne.cuda.init_cuda() first, or set the MNE_USE_CUDA config "
        'variable to "true".'
    )
|
||||
|
||||
|
||||
@verbose
def _set_cuda_device(device_id, verbose=None):
    """Make ``device_id`` the active CUDA device and log the switch."""
    import cupy

    device = cupy.cuda.Device(device_id)
    device.use()
    logger.info(f"Now using CUDA device {device_id}")
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Repeated FFT multiplication
|
||||
|
||||
|
||||
def _setup_cuda_fft_multiply_repeated(n_jobs, h, n_fft, kind="FFT FIR filtering"):
|
||||
"""Set up repeated CUDA FFT multiplication with a given filter.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_jobs : int | str
|
||||
If ``n_jobs='cuda'``, the function will attempt to set up for CUDA
|
||||
FFT multiplication.
|
||||
h : array
|
||||
The filtering function that will be used repeatedly.
|
||||
n_fft : int
|
||||
The number of points in the FFT.
|
||||
kind : str
|
||||
The kind to report to the user.
|
||||
|
||||
Returns
|
||||
-------
|
||||
n_jobs : int
|
||||
Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
|
||||
original n_jobs is passed.
|
||||
cuda_dict : dict
|
||||
Dictionary with the following CUDA-related variables:
|
||||
use_cuda : bool
|
||||
Whether CUDA should be used.
|
||||
fft_plan : instance of FFTPlan
|
||||
FFT plan to use in calculating the FFT.
|
||||
ifft_plan : instance of FFTPlan
|
||||
FFT plan to use in calculating the IFFT.
|
||||
x_fft : instance of gpuarray
|
||||
Empty allocated GPU space for storing the result of the
|
||||
frequency-domain multiplication.
|
||||
x : instance of gpuarray
|
||||
Empty allocated GPU space for the data to filter.
|
||||
h_fft : array | instance of gpuarray
|
||||
This will either be a gpuarray (if CUDA enabled) or ndarray.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function is designed to be used with fft_multiply_repeated().
|
||||
"""
|
||||
cuda_dict = dict(n_fft=n_fft, rfft=rfft, irfft=irfft, h_fft=rfft(h, n=n_fft))
|
||||
if isinstance(n_jobs, str):
|
||||
_check_option("n_jobs", n_jobs, ("cuda",))
|
||||
n_jobs = 1
|
||||
init_cuda()
|
||||
if _cuda_capable:
|
||||
import cupy
|
||||
|
||||
try:
|
||||
# do the IFFT normalization now so we don't have to later
|
||||
h_fft = cupy.array(cuda_dict["h_fft"])
|
||||
logger.info(f"Using CUDA for {kind}")
|
||||
except Exception as exp:
|
||||
logger.info(
|
||||
"CUDA not used, could not instantiate memory (arrays may be too "
|
||||
f'large: "{exp}"), falling back to n_jobs=None'
|
||||
)
|
||||
cuda_dict.update(h_fft=h_fft, rfft=_cuda_upload_rfft, irfft=_cuda_irfft_get)
|
||||
else:
|
||||
logger.info(
|
||||
"CUDA not used, CUDA could not be initialized, "
|
||||
"falling back to n_jobs=None"
|
||||
)
|
||||
return n_jobs, cuda_dict
|
||||
|
||||
|
||||
def _fft_multiply_repeated(x, cuda_dict):
|
||||
"""Do FFT multiplication by a filter function (possibly using CUDA).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
h_fft : 1-d array or gpuarray
|
||||
The filtering array to apply.
|
||||
x : 1-d array
|
||||
The array to filter.
|
||||
n_fft : int
|
||||
The number of points in the FFT.
|
||||
cuda_dict : dict
|
||||
Dictionary constructed using setup_cuda_multiply_repeated().
|
||||
|
||||
Returns
|
||||
-------
|
||||
x : 1-d array
|
||||
Filtered version of x.
|
||||
"""
|
||||
# do the fourier-domain operations
|
||||
x_fft = cuda_dict["rfft"](x, cuda_dict["n_fft"])
|
||||
x_fft *= cuda_dict["h_fft"]
|
||||
x = cuda_dict["irfft"](x_fft, cuda_dict["n_fft"])
|
||||
return x
|
||||
|
||||
|
||||
###############################################################################
|
||||
# FFT Resampling
|
||||
|
||||
|
||||
def _setup_cuda_fft_resample(n_jobs, W, new_len):
    """Set up CUDA FFT resampling.

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT resampling.
    W : array
        The filtering function to be used during resampling.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.
    new_len : int
        The size of the array following resampling.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following keys:

        use_cuda : bool
            Whether CUDA should be used.
        rfft, irfft : callable
            Forward/inverse FFT functions to use (CUDA-backed helpers when
            CUDA is enabled, otherwise the scipy.fft versions).
        W : array | cupy array
            The folded, half-spectrum window; a cupy array when CUDA is
            enabled, otherwise an ndarray.

    Notes
    -----
    This function is designed to be used with fft_resample().
    """
    cuda_dict = dict(use_cuda=False, rfft=rfft, irfft=irfft)
    # Length of the rfft of a real signal of length len(W).
    rfft_len_x = len(W) // 2 + 1
    # fold the window onto itself (should be symmetric) and truncate:
    # W[k] is averaged with W[-k] (W[::-1][k - 1] == W[len(W) - k]), keeping
    # only the symmetric part so the half-length spectrum suffices.
    W = W.copy()
    W[1:rfft_len_x] = (W[1:rfft_len_x] + W[::-1][: rfft_len_x - 1]) / 2.0
    W = W[:rfft_len_x]
    if isinstance(n_jobs, str):
        _check_option("n_jobs", n_jobs, ("cuda",))
        n_jobs = 1  # the CUDA path runs as a single job
        init_cuda()
        if _cuda_capable:
            try:
                import cupy

                # do the IFFT normalization now so we don't have to later
                W = cupy.array(W)
                logger.info("Using CUDA for FFT resampling")
            except Exception:
                logger.info(
                    "CUDA not used, could not instantiate memory "
                    "(arrays may be too large), falling back to "
                    "n_jobs=None"
                )
            else:
                # Only swap in the CUDA helpers if the upload succeeded.
                cuda_dict.update(
                    use_cuda=True, rfft=_cuda_upload_rfft, irfft=_cuda_irfft_get
                )
        else:
            logger.info(
                "CUDA not used, CUDA could not be initialized, "
                "falling back to n_jobs=None"
            )
    # W is either the folded ndarray or its cupy upload at this point.
    cuda_dict["W"] = W
    return n_jobs, cuda_dict
|
||||
|
||||
|
||||
def _cuda_upload_rfft(x, n, axis=-1):
    """Move ``x`` onto the GPU and return its rfft computed there."""
    import cupy

    gpu_x = cupy.array(x)
    return cupy.fft.rfft(gpu_x, n=n, axis=axis)
|
||||
|
||||
|
||||
def _cuda_irfft_get(x, n, axis=-1):
    """Compute the irfft of ``x`` on the GPU and copy the result to host."""
    import cupy

    gpu_result = cupy.fft.irfft(x, n=n, axis=axis)
    return gpu_result.get()
|
||||
|
||||
|
||||
@fill_doc
def _fft_resample(x, new_len, npads, to_removes, cuda_dict=None, pad="reflect_limited"):
    """Do FFT resampling with a filter function (possibly using CUDA).

    Parameters
    ----------
    x : 1-d array
        The array to resample. Will be converted to float64 if necessary.
    new_len : int
        The size of the output array (before removing padding).
    npads : tuple of int
        Amount of padding to apply to the start and end of the
        signal before resampling.
    to_removes : tuple of int
        Number of samples to remove after resampling.
    cuda_dict : dict
        Dictionary constructed using _setup_cuda_fft_resample(). If None,
        a plain CPU fallback (scipy FFTs, no extra window) is used.
    %(pad_resample)s
        The default is ``'reflect_limited'``.

        .. versionadded:: 0.15

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if cuda_dict is None:
        # CPU fallback with a no-op (scalar) window. The previous default,
        # dict(use_cuda=False), lacked the "rfft"/"irfft"/"W" keys and thus
        # raised KeyError below whenever cuda_dict was left as None.
        cuda_dict = dict(use_cuda=False, rfft=rfft, irfft=irfft, W=1.0)
    # add some padding at beginning and end to make this work a little cleaner
    if x.dtype != np.float64:
        x = x.astype(np.float64)
    x = _smart_pad(x, npads, pad)
    old_len = len(x)
    shorter = new_len < old_len
    use_len = new_len if shorter else old_len
    x_fft = cuda_dict["rfft"](x, None)
    if use_len % 2 == 0:
        nyq = use_len // 2
        # Adjust the (even-length) Nyquist bin to compensate for how
        # rfft/irfft weight it when the spectrum is truncated
        # (downsampling, *2) or zero-padded (upsampling, *0.5).
        x_fft[nyq : nyq + 1] *= 2 if shorter else 0.5
    x_fft *= cuda_dict["W"]
    y = cuda_dict["irfft"](x_fft, new_len)

    # now let's trim it back to the correct size (if there was padding)
    if (to_removes > 0).any():
        y = y[to_removes[0] : y.shape[0] - to_removes[1]]

    return y
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Misc
|
||||
|
||||
|
||||
# this has to go in mne.cuda instead of mne.filter to avoid import errors
|
||||
def _smart_pad(x, n_pad, pad="reflect_limited"):
|
||||
"""Pad vector x."""
|
||||
n_pad = np.asarray(n_pad)
|
||||
assert n_pad.shape == (2,)
|
||||
if (n_pad == 0).all():
|
||||
return x
|
||||
elif (n_pad < 0).any():
|
||||
raise RuntimeError("n_pad must be non-negative")
|
||||
if pad == "reflect_limited":
|
||||
# need to pad with zeros if len(x) <= npad
|
||||
l_z_pad = np.zeros(max(n_pad[0] - len(x) + 1, 0), dtype=x.dtype)
|
||||
r_z_pad = np.zeros(max(n_pad[1] - len(x) + 1, 0), dtype=x.dtype)
|
||||
return np.concatenate(
|
||||
[
|
||||
l_z_pad,
|
||||
2 * x[0] - x[n_pad[0] : 0 : -1],
|
||||
x,
|
||||
2 * x[-1] - x[-2 : -n_pad[1] - 2 : -1],
|
||||
r_z_pad,
|
||||
]
|
||||
)
|
||||
else:
|
||||
return np.pad(x, (tuple(n_pad),), pad)
|
||||
Reference in New Issue
Block a user