initial commit
This commit is contained in:
390
mne/cuda.py
Normal file
390
mne/cuda.py
Normal file
@@ -0,0 +1,390 @@
|
||||
# Authors: The MNE-Python contributors.
|
||||
# License: BSD-3-Clause
|
||||
# Copyright the MNE-Python contributors.
|
||||
|
||||
import numpy as np
|
||||
from scipy.fft import irfft, rfft
|
||||
|
||||
from .utils import (
|
||||
_check_option,
|
||||
_explain_exception,
|
||||
fill_doc,
|
||||
get_config,
|
||||
logger,
|
||||
sizeof_fmt,
|
||||
verbose,
|
||||
warn,
|
||||
)
|
||||
|
||||
# Module-level flag: set to True by init_cuda() once cupy imports cleanly and
# a CUDA device has been initialized. Read by the _setup_cuda_* helpers below.
_cuda_capable = False
|
||||
|
||||
|
||||
def get_cuda_memory(kind="available"):
    """Get the amount of free memory for CUDA operations.

    Parameters
    ----------
    kind : str
        Can be ``"available"`` or ``"total"``.

    Returns
    -------
    memory : str
        The amount of available or total memory as a human-readable string.
    """
    # Validate up front so a bad ``kind`` fails with an informative error
    # instead of a bare KeyError inside the cupy branch below.
    _check_option("kind", kind, ("available", "total"))
    if not _cuda_capable:
        warn("CUDA not enabled, returning zero for memory")
        mem = 0
    else:
        import cupy

        # memGetInfo() returns (free_bytes, total_bytes), hence the mapping.
        mem = cupy.cuda.runtime.memGetInfo()[dict(available=0, total=1)[kind]]
    return sizeof_fmt(mem)
|
||||
|
||||
|
||||
@verbose
def init_cuda(ignore_config=False, verbose=None):
    """Initialize CUDA functionality.

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    the first CUDA setup is performed. If this variable is not
    set, this function can be manually executed.

    Parameters
    ----------
    ignore_config : bool
        If True, ignore the config value MNE_USE_CUDA and force init.
    %(verbose)s
    """
    global _cuda_capable
    if _cuda_capable:
        return
    if not ignore_config and (get_config("MNE_USE_CUDA", "false").lower() != "true"):
        logger.info("CUDA not enabled in config, skipping initialization")
        return
    # Triage possible errors for informative messaging
    _cuda_capable = False
    try:
        import cupy  # noqa
    except ImportError:
        warn("module cupy not found, CUDA not enabled")
        return
    device_id = int(get_config("MNE_CUDA_DEVICE", "0"))
    try:
        # Initialize CUDA
        _set_cuda_device(device_id, verbose)
    except Exception:
        # Fixed message: previously read "so CUDA device could be
        # initialized", a typo for "No CUDA device ...".
        warn(
            "No CUDA device could be initialized, likely a hardware error, "
            f"CUDA not enabled{_explain_exception()}"
        )
        return

    _cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info(f"Enabling CUDA with {get_cuda_memory()} available memory")
|
||||
|
||||
|
||||
@verbose
def set_cuda_device(device_id, verbose=None):
    """Set the CUDA device temporarily for the current session.

    Parameters
    ----------
    device_id : int
        Numeric ID of the CUDA-capable device you want MNE-Python to use.
    %(verbose)s
    """
    # Fast path: CUDA is already up, just switch devices.
    if _cuda_capable:
        _set_cuda_device(device_id, verbose)
        return
    # Not initialized yet but enabled in the config: bring CUDA up first.
    if get_config("MNE_USE_CUDA", "false").lower() == "true":
        init_cuda()
        _set_cuda_device(device_id, verbose)
        return
    # Neither initialized nor configured: tell the user what to do.
    warn(
        "Could not set CUDA device because CUDA is not enabled; either "
        "run mne.cuda.init_cuda() first, or set the MNE_USE_CUDA config "
        'variable to "true".'
    )
|
||||
|
||||
|
||||
@verbose
def _set_cuda_device(device_id, verbose=None):
    """Make ``device_id`` the active CUDA device and log the switch."""
    import cupy

    device = cupy.cuda.Device(device_id)
    device.use()
    logger.info(f"Now using CUDA device {device_id}")
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Repeated FFT multiplication
|
||||
|
||||
|
||||
def _setup_cuda_fft_multiply_repeated(n_jobs, h, n_fft, kind="FFT FIR filtering"):
|
||||
"""Set up repeated CUDA FFT multiplication with a given filter.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_jobs : int | str
|
||||
If ``n_jobs='cuda'``, the function will attempt to set up for CUDA
|
||||
FFT multiplication.
|
||||
h : array
|
||||
The filtering function that will be used repeatedly.
|
||||
n_fft : int
|
||||
The number of points in the FFT.
|
||||
kind : str
|
||||
The kind to report to the user.
|
||||
|
||||
Returns
|
||||
-------
|
||||
n_jobs : int
|
||||
Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
|
||||
original n_jobs is passed.
|
||||
cuda_dict : dict
|
||||
Dictionary with the following CUDA-related variables:
|
||||
use_cuda : bool
|
||||
Whether CUDA should be used.
|
||||
fft_plan : instance of FFTPlan
|
||||
FFT plan to use in calculating the FFT.
|
||||
ifft_plan : instance of FFTPlan
|
||||
FFT plan to use in calculating the IFFT.
|
||||
x_fft : instance of gpuarray
|
||||
Empty allocated GPU space for storing the result of the
|
||||
frequency-domain multiplication.
|
||||
x : instance of gpuarray
|
||||
Empty allocated GPU space for the data to filter.
|
||||
h_fft : array | instance of gpuarray
|
||||
This will either be a gpuarray (if CUDA enabled) or ndarray.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function is designed to be used with fft_multiply_repeated().
|
||||
"""
|
||||
cuda_dict = dict(n_fft=n_fft, rfft=rfft, irfft=irfft, h_fft=rfft(h, n=n_fft))
|
||||
if isinstance(n_jobs, str):
|
||||
_check_option("n_jobs", n_jobs, ("cuda",))
|
||||
n_jobs = 1
|
||||
init_cuda()
|
||||
if _cuda_capable:
|
||||
import cupy
|
||||
|
||||
try:
|
||||
# do the IFFT normalization now so we don't have to later
|
||||
h_fft = cupy.array(cuda_dict["h_fft"])
|
||||
logger.info(f"Using CUDA for {kind}")
|
||||
except Exception as exp:
|
||||
logger.info(
|
||||
"CUDA not used, could not instantiate memory (arrays may be too "
|
||||
f'large: "{exp}"), falling back to n_jobs=None'
|
||||
)
|
||||
cuda_dict.update(h_fft=h_fft, rfft=_cuda_upload_rfft, irfft=_cuda_irfft_get)
|
||||
else:
|
||||
logger.info(
|
||||
"CUDA not used, CUDA could not be initialized, "
|
||||
"falling back to n_jobs=None"
|
||||
)
|
||||
return n_jobs, cuda_dict
|
||||
|
||||
|
||||
def _fft_multiply_repeated(x, cuda_dict):
|
||||
"""Do FFT multiplication by a filter function (possibly using CUDA).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
h_fft : 1-d array or gpuarray
|
||||
The filtering array to apply.
|
||||
x : 1-d array
|
||||
The array to filter.
|
||||
n_fft : int
|
||||
The number of points in the FFT.
|
||||
cuda_dict : dict
|
||||
Dictionary constructed using setup_cuda_multiply_repeated().
|
||||
|
||||
Returns
|
||||
-------
|
||||
x : 1-d array
|
||||
Filtered version of x.
|
||||
"""
|
||||
# do the fourier-domain operations
|
||||
x_fft = cuda_dict["rfft"](x, cuda_dict["n_fft"])
|
||||
x_fft *= cuda_dict["h_fft"]
|
||||
x = cuda_dict["irfft"](x_fft, cuda_dict["n_fft"])
|
||||
return x
|
||||
|
||||
|
||||
###############################################################################
|
||||
# FFT Resampling
|
||||
|
||||
|
||||
def _setup_cuda_fft_resample(n_jobs, W, new_len):
    """Set up CUDA FFT resampling.

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT resampling.
    W : array
        The filtering function to be used during resampling.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.
    new_len : int
        The size of the array following resampling.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following keys:

        use_cuda : bool
            Whether CUDA should be used.
        rfft, irfft : callable
            Forward/inverse FFT functions to use (CUDA-backed helpers when
            CUDA is enabled, otherwise the scipy.fft versions).
        W : array | cupy array
            The folded, half-spectrum window; a cupy array when CUDA is
            enabled, otherwise an ndarray.

    Notes
    -----
    This function is designed to be used with fft_resample().
    """
    cuda_dict = dict(use_cuda=False, rfft=rfft, irfft=irfft)
    # Length of the rfft of a real signal of length len(W).
    rfft_len_x = len(W) // 2 + 1
    # fold the window onto itself (should be symmetric) and truncate:
    # W[k] is averaged with W[-k] (W[::-1][k - 1] == W[len(W) - k]), keeping
    # only the symmetric part so the half-length spectrum suffices.
    W = W.copy()
    W[1:rfft_len_x] = (W[1:rfft_len_x] + W[::-1][: rfft_len_x - 1]) / 2.0
    W = W[:rfft_len_x]
    if isinstance(n_jobs, str):
        _check_option("n_jobs", n_jobs, ("cuda",))
        n_jobs = 1  # the CUDA path runs as a single job
        init_cuda()
        if _cuda_capable:
            try:
                import cupy

                # do the IFFT normalization now so we don't have to later
                W = cupy.array(W)
                logger.info("Using CUDA for FFT resampling")
            except Exception:
                logger.info(
                    "CUDA not used, could not instantiate memory "
                    "(arrays may be too large), falling back to "
                    "n_jobs=None"
                )
            else:
                # Only swap in the CUDA helpers if the upload succeeded.
                cuda_dict.update(
                    use_cuda=True, rfft=_cuda_upload_rfft, irfft=_cuda_irfft_get
                )
        else:
            logger.info(
                "CUDA not used, CUDA could not be initialized, "
                "falling back to n_jobs=None"
            )
    # W is either the folded ndarray or its cupy upload at this point.
    cuda_dict["W"] = W
    return n_jobs, cuda_dict
|
||||
|
||||
|
||||
def _cuda_upload_rfft(x, n, axis=-1):
    """Move ``x`` onto the GPU and return its rfft computed there."""
    import cupy

    gpu_x = cupy.array(x)
    return cupy.fft.rfft(gpu_x, n=n, axis=axis)
|
||||
|
||||
|
||||
def _cuda_irfft_get(x, n, axis=-1):
    """Compute the irfft of ``x`` on the GPU and copy the result to host."""
    import cupy

    gpu_result = cupy.fft.irfft(x, n=n, axis=axis)
    return gpu_result.get()
|
||||
|
||||
|
||||
@fill_doc
def _fft_resample(x, new_len, npads, to_removes, cuda_dict=None, pad="reflect_limited"):
    """Do FFT resampling with a filter function (possibly using CUDA).

    Parameters
    ----------
    x : 1-d array
        The array to resample. Will be converted to float64 if necessary.
    new_len : int
        The size of the output array (before removing padding).
    npads : tuple of int
        Amount of padding to apply to the start and end of the
        signal before resampling.
    to_removes : tuple of int
        Number of samples to remove after resampling.
    cuda_dict : dict
        Dictionary constructed using _setup_cuda_fft_resample(). If None,
        a plain CPU fallback (scipy FFTs, no extra window) is used.
    %(pad_resample)s
        The default is ``'reflect_limited'``.

        .. versionadded:: 0.15

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if cuda_dict is None:
        # CPU fallback with a no-op (scalar) window. The previous default,
        # dict(use_cuda=False), lacked the "rfft"/"irfft"/"W" keys and thus
        # raised KeyError below whenever cuda_dict was left as None.
        cuda_dict = dict(use_cuda=False, rfft=rfft, irfft=irfft, W=1.0)
    # add some padding at beginning and end to make this work a little cleaner
    if x.dtype != np.float64:
        x = x.astype(np.float64)
    x = _smart_pad(x, npads, pad)
    old_len = len(x)
    shorter = new_len < old_len
    use_len = new_len if shorter else old_len
    x_fft = cuda_dict["rfft"](x, None)
    if use_len % 2 == 0:
        nyq = use_len // 2
        # Adjust the (even-length) Nyquist bin to compensate for how
        # rfft/irfft weight it when the spectrum is truncated
        # (downsampling, *2) or zero-padded (upsampling, *0.5).
        x_fft[nyq : nyq + 1] *= 2 if shorter else 0.5
    x_fft *= cuda_dict["W"]
    y = cuda_dict["irfft"](x_fft, new_len)

    # now let's trim it back to the correct size (if there was padding)
    if (to_removes > 0).any():
        y = y[to_removes[0] : y.shape[0] - to_removes[1]]

    return y
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Misc
|
||||
|
||||
|
||||
# this has to go in mne.cuda instead of mne.filter to avoid import errors
|
||||
def _smart_pad(x, n_pad, pad="reflect_limited"):
|
||||
"""Pad vector x."""
|
||||
n_pad = np.asarray(n_pad)
|
||||
assert n_pad.shape == (2,)
|
||||
if (n_pad == 0).all():
|
||||
return x
|
||||
elif (n_pad < 0).any():
|
||||
raise RuntimeError("n_pad must be non-negative")
|
||||
if pad == "reflect_limited":
|
||||
# need to pad with zeros if len(x) <= npad
|
||||
l_z_pad = np.zeros(max(n_pad[0] - len(x) + 1, 0), dtype=x.dtype)
|
||||
r_z_pad = np.zeros(max(n_pad[1] - len(x) + 1, 0), dtype=x.dtype)
|
||||
return np.concatenate(
|
||||
[
|
||||
l_z_pad,
|
||||
2 * x[0] - x[n_pad[0] : 0 : -1],
|
||||
x,
|
||||
2 * x[-1] - x[-2 : -n_pad[1] - 2 : -1],
|
||||
r_z_pad,
|
||||
]
|
||||
)
|
||||
else:
|
||||
return np.pad(x, (tuple(n_pad),), pad)
|
||||
Reference in New Issue
Block a user