initial commit

2025-08-19 09:13:22 -07:00
parent 28464811d6
commit 0977a3e14d
820 changed files with 1003358 additions and 2 deletions

mne/decoding/__init__.py (new file)

@@ -0,0 +1,8 @@
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
"""Decoding and encoding, including machine learning and receptive fields."""
import lazy_loader as lazy
(__getattr__, __dir__, __all__) = lazy.attach_stub(__name__, __file__)
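The stub file listed next (``__init__.pyi``) declares the public names, and ``lazy.attach_stub`` defers the actual submodule imports until one of those names is first accessed. A minimal sketch of the resulting behavior (assuming MNE-Python and ``lazy_loader`` are installed; the ``n_components`` value is just an illustration):

import mne.decoding

# Nothing from mne.decoding.csp has been imported yet; the first attribute
# access below resolves "CSP" through the stub and imports the submodule.
csp = mne.decoding.CSP(n_components=4)
print(type(csp))  # <class 'mne.decoding.csp.CSP'>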

mne/decoding/__init__.pyi (new file)

@@ -0,0 +1,45 @@
__all__ = [
"BaseEstimator",
"CSP",
"EMS",
"FilterEstimator",
"GeneralizingEstimator",
"LinearModel",
"PSDEstimator",
"ReceptiveField",
"SPoC",
"SSD",
"Scaler",
"SlidingEstimator",
"TemporalFilter",
"TimeDelayingRidge",
"TimeFrequency",
"TransformerMixin",
"UnsupervisedSpatialFilter",
"Vectorizer",
"compute_ems",
"cross_val_multiscore",
"get_coef",
]
from .base import (
BaseEstimator,
LinearModel,
TransformerMixin,
cross_val_multiscore,
get_coef,
)
from .csp import CSP, SPoC
from .ems import EMS, compute_ems
from .receptive_field import ReceptiveField
from .search_light import GeneralizingEstimator, SlidingEstimator
from .ssd import SSD
from .time_delaying_ridge import TimeDelayingRidge
from .time_frequency import TimeFrequency
from .transformer import (
FilterEstimator,
PSDEstimator,
Scaler,
TemporalFilter,
UnsupervisedSpatialFilter,
Vectorizer,
)

mne/decoding/base.py (new file)

@@ -0,0 +1,528 @@
"""Base class copy from sklearn.base."""
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
import datetime as dt
import numbers
import numpy as np
from sklearn import model_selection as models
from sklearn.base import ( # noqa: F401
BaseEstimator,
MetaEstimatorMixin,
TransformerMixin,
clone,
is_classifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import check_scoring
from sklearn.model_selection import KFold, StratifiedKFold, check_cv
from sklearn.utils import check_array, indexable
from ..parallel import parallel_func
from ..utils import _pl, logger, verbose, warn
class LinearModel(MetaEstimatorMixin, BaseEstimator):
"""Compute and store patterns from linear models.
The linear model coefficients (filters) are used to extract discriminant
neural sources from the measured data. This class computes the
corresponding patterns of these linear filters to make them more
interpretable :footcite:`HaufeEtAl2014`.
Parameters
----------
model : object | None
A linear model from scikit-learn with a fit method
that updates a ``coef_`` attribute.
If None the model will be LogisticRegression.
Attributes
----------
filters_ : ndarray, shape ([n_targets], n_features)
If fit, the filters used to decompose the data.
patterns_ : ndarray, shape ([n_targets], n_features)
If fit, the patterns used to restore M/EEG signals.
See Also
--------
CSP
mne.preprocessing.ICA
mne.preprocessing.Xdawn
Notes
-----
.. versionadded:: 0.10
References
----------
.. footbibliography::
"""
# TODO: Properly refactor this using
# https://github.com/scikit-learn/scikit-learn/issues/30237#issuecomment-2465572885
_model_attr_wrap = (
"transform",
"predict",
"predict_proba",
"_estimator_type",
"__tags__",
"decision_function",
"score",
"classes_",
)
def __init__(self, model=None):
if model is None:
model = LogisticRegression(solver="liblinear")
self.model = model
def __sklearn_tags__(self):
"""Get sklearn tags."""
from sklearn.utils import get_tags # added in 1.6
return get_tags(self.model)
def __getattr__(self, attr):
"""Wrap to model for some attributes."""
if attr in LinearModel._model_attr_wrap:
return getattr(self.model, attr)
elif attr == "fit_transform" and hasattr(self.model, "fit_transform"):
return self._fit_transform
raise AttributeError(attr)
def _fit_transform(self, X, y):
return self.fit(X, y).transform(X)
def fit(self, X, y, **fit_params):
"""Estimate the coefficients of the linear model.
Save the coefficients in the attribute ``filters_`` and
computes the attribute ``patterns_``.
Parameters
----------
X : array, shape (n_samples, n_features)
The training input samples to estimate the linear coefficients.
y : array, shape (n_samples, [n_targets])
The target values.
**fit_params : dict of string -> object
Parameters to pass to the fit method of the estimator.
Returns
-------
self : instance of LinearModel
Returns the modified instance.
"""
X = check_array(X, input_name="X")
if y is not None:
y = check_array(y, dtype=None, ensure_2d=False, input_name="y")
if y.ndim > 2:
raise ValueError(
f"LinearModel only accepts up to 2-dimensional y, got {y.shape} "
"instead."
)
# fit the Model
self.model.fit(X, y, **fit_params)
# Computes patterns using Haufe's trick: A = Cov_X . W . Precision_Y
inv_Y = 1.0
X = X - X.mean(0, keepdims=True)
if y.ndim == 2 and y.shape[1] != 1:
y = y - y.mean(0, keepdims=True)
inv_Y = np.linalg.pinv(np.cov(y.T))
self.patterns_ = np.cov(X.T).dot(self.filters_.T.dot(inv_Y)).T
return self
@property
def filters_(self):
if hasattr(self.model, "coef_"):
# Standard Linear Model
filters = self.model.coef_
elif hasattr(self.model.best_estimator_, "coef_"):
# Linear Model with GridSearchCV
filters = self.model.best_estimator_.coef_
else:
raise ValueError("model does not have a `coef_` attribute.")
if filters.ndim == 2 and filters.shape[0] == 1:
filters = filters[0]
return filters
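As a rough usage sketch (synthetic data, with shapes chosen only for illustration), fitting a LinearModel exposes both the linear filters and the Haufe-style patterns:

import numpy as np
from mne.decoding import LinearModel

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 32))                       # samples x "channels"
y = (X[:, 0] + 0.5 * rng.standard_normal(100) > 0).astype(int)

lm = LinearModel()                                       # wraps LogisticRegression by default
lm.fit(X, y)
print(lm.filters_.shape, lm.patterns_.shape)             # (32,) (32,)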
def _set_cv(cv, estimator=None, X=None, y=None):
"""Set the default CV depending on whether clf is classifier/regressor."""
# Detect whether classification or regression
if estimator in ["classifier", "regressor"]:
est_is_classifier = estimator == "classifier"
else:
est_is_classifier = is_classifier(estimator)
# Setup CV
if isinstance(cv, int | np.int64):
XFold = StratifiedKFold if est_is_classifier else KFold
cv = XFold(n_splits=cv)
elif isinstance(cv, str):
if not hasattr(models, cv):
raise ValueError("Unknown cross-validation")
cv = getattr(models, cv)
cv = cv()
cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
# Extract train and test set to retrieve them at predict time
cv_splits = [(train, test) for train, test in cv.split(X=np.zeros_like(y), y=y)]
if not np.all([len(train) for train, _ in cv_splits]):
raise ValueError("Some folds do not have any train epochs.")
return cv, cv_splits
def _check_estimator(estimator, get_params=True):
"""Check whether an object has the methods required by sklearn."""
valid_methods = ("predict", "transform", "predict_proba", "decision_function")
if (not hasattr(estimator, "fit")) or (
not any(hasattr(estimator, method) for method in valid_methods)
):
raise ValueError(
"estimator must be a scikit-learn transformer or "
"an estimator with the fit and a predict-like (e.g. "
"predict_proba) or a transform method."
)
if get_params and not hasattr(estimator, "get_params"):
raise ValueError(
"estimator must be a scikit-learn transformer or an "
"estimator with the get_params method that allows "
"cloning."
)
def _get_inverse_funcs(estimator, terminal=True):
"""Retrieve the inverse functions of an pipeline or an estimator."""
inverse_func = list()
estimators = list()
if hasattr(estimator, "steps"):
# if pipeline, retrieve all steps by nesting
for _, est in estimator.steps:
inverse_func.extend(_get_inverse_funcs(est, terminal=False))
estimators.append(est.__class__.__name__)
elif hasattr(estimator, "inverse_transform"):
# if not pipeline attempt to retrieve inverse function
inverse_func.append(estimator.inverse_transform)
estimators.append(estimator.__class__.__name__)
else:
inverse_func.append(False)
estimators.append("Unknown")
# If terminal node, check that the last estimator is a classifier,
# and remove it from the transformers.
if terminal:
last_is_estimator = inverse_func[-1] is False
logger.debug(f" Last estimator is an estimator: {last_is_estimator}")
non_invertible = np.where(
[inv_func is False for inv_func in inverse_func[:-1]]
)[0]
if last_is_estimator and len(non_invertible) == 0:
# keep all inverse transformation and remove last estimation
logger.debug(" Removing inverse transformation from inverse list.")
inverse_func = inverse_func[:-1]
else:
if len(non_invertible):
bad = ", ".join(estimators[ni] for ni in non_invertible)
warn(
f"Cannot inverse transform non-invertible "
f"estimator{_pl(non_invertible)}: {bad}."
)
inverse_func = list()
return inverse_func
@verbose
def get_coef(estimator, attr="filters_", inverse_transform=False, *, verbose=None):
"""Retrieve the coefficients of an estimator ending with a Linear Model.
This is typically useful to retrieve "spatial filters" or "spatial
patterns" of decoding models :footcite:`HaufeEtAl2014`.
Parameters
----------
estimator : object | None
An estimator from scikit-learn.
attr : str
The name of the coefficient attribute to retrieve, typically
``'filters_'`` (default) or ``'patterns_'``.
inverse_transform : bool
If True, returns the coefficients after inverse transforming them with
the transformer steps of the estimator.
%(verbose)s
Returns
-------
coef : array
The coefficients.
References
----------
.. footbibliography::
"""
# Get the coefficients of the last estimator in case of nested pipeline
est = estimator
logger.debug(f"Getting coefficients from estimator: {est.__class__.__name__}")
while hasattr(est, "steps"):
est = est.steps[-1][1]
squeeze_first_dim = False
# If SlidingEstimator, loop across estimators
if hasattr(est, "estimators_"):
coef = list()
for ei, this_est in enumerate(est.estimators_):
if ei == 0:
logger.debug(" Extracting coefficients from SlidingEstimator.")
coef.append(get_coef(this_est, attr, inverse_transform))
coef = np.transpose(coef)
coef = coef[np.newaxis] # fake a sample dimension
squeeze_first_dim = True
elif not hasattr(est, attr):
raise ValueError(f"This estimator does not have a {attr} attribute:\n{est}")
else:
coef = getattr(est, attr)
if coef.ndim == 1:
coef = coef[np.newaxis]
squeeze_first_dim = True
# inverse pattern e.g. to get back physical units
if inverse_transform:
if not hasattr(estimator, "steps") and not hasattr(est, "estimators_"):
raise ValueError(
"inverse_transform can only be applied onto pipeline estimators."
)
# The inverse_transform parameter will call this method on any
# estimator contained in the pipeline, in reverse order.
for inverse_func in _get_inverse_funcs(estimator)[::-1]:
logger.debug(f" Applying inverse transformation: {inverse_func}.")
coef = inverse_func(coef)
if squeeze_first_dim:
logger.debug(" Squeezing first dimension of coefficients.")
coef = coef[0]
return coef
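For example (a hedged sketch on synthetic data), placing LinearModel at the end of a pipeline lets get_coef map the patterns back through the preprocessing steps via their ``inverse_transform``:

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from mne.decoding import LinearModel, get_coef

rng = np.random.default_rng(0)
X = rng.standard_normal((100, 32))
y = (X[:, 0] > 0).astype(int)

clf = make_pipeline(StandardScaler(), LinearModel())
clf.fit(X, y)
# Patterns expressed in the original (unscaled) input space
patterns = get_coef(clf, "patterns_", inverse_transform=True)
print(patterns.shape)  # (32,)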
@verbose
def cross_val_multiscore(
estimator,
X,
y=None,
groups=None,
scoring=None,
cv=None,
n_jobs=None,
verbose=None,
fit_params=None,
pre_dispatch="2*n_jobs",
):
"""Evaluate a score by cross-validation.
Parameters
----------
estimator : instance of sklearn.base.BaseEstimator
The object to use to fit the data.
Must implement the 'fit' method.
X : array-like, shape (n_samples, n_dimensional_features,)
The data to fit. Can be, for example a list, or an array at least 2d.
y : array-like, shape (n_samples, n_targets,)
The target variable to try to predict in the case of
supervised learning.
groups : array-like, with shape (n_samples,)
Group labels for the samples used while splitting the dataset into
train/test set.
scoring : str, callable | None
A string (see model evaluation documentation) or
a scorer callable object / function with signature
``scorer(estimator, X, y)``.
Note that when using an estimator which inherently returns
multidimensional output - in particular, SlidingEstimator
or GeneralizingEstimator - you should set the scorer
there, not here.
cv : int, cross-validation generator | iterable
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross validation,
- integer, to specify the number of folds in a ``(Stratified)KFold``,
- An object to be used as a cross-validation generator.
- An iterable yielding train, test splits.
For integer/None inputs, if the estimator is a classifier and ``y`` is
either binary or multiclass,
:class:`sklearn.model_selection.StratifiedKFold` is used. In all
other cases, :class:`sklearn.model_selection.KFold` is used.
%(n_jobs)s
%(verbose)s
fit_params : dict, optional
Parameters to pass to the fit method of the estimator.
pre_dispatch : int, or str, optional
Controls the number of jobs that get dispatched during parallel
execution. Reducing this number can be useful to avoid an
explosion of memory consumption when more jobs get dispatched
than CPUs can process. This parameter can be:
- None, in which case all the jobs are immediately
created and spawned. Use this for lightweight and
fast-running jobs, to avoid delays due to on-demand
spawning of the jobs
- An int, giving the exact number of total jobs that are
spawned
- A string, giving an expression as a function of n_jobs,
as in '2*n_jobs'
Returns
-------
scores : array of float, shape (n_splits,) | shape (n_splits, n_scores)
Array of scores of the estimator for each run of the cross validation.
"""
# This code is copied from sklearn
X, y, groups = indexable(X, y, groups)
cv = check_cv(cv, y, classifier=is_classifier(estimator))
cv_iter = list(cv.split(X, y, groups))
scorer = check_scoring(estimator, scoring=scoring)
# We clone the estimator to make sure that all the folds are
# independent, and that it is pickle-able.
# Note: this parallelization is implemented using MNE Parallel
parallel, p_func, n_jobs = parallel_func(
_fit_and_score, n_jobs, pre_dispatch=pre_dispatch
)
position = hasattr(estimator, "position")
scores = parallel(
p_func(
estimator=clone(estimator),
X=X,
y=y,
scorer=scorer,
train=train,
test=test,
fit_params=fit_params,
verbose=verbose,
parameters=dict(position=ii % n_jobs) if position else None,
)
for ii, (train, test) in enumerate(cv_iter)
)
return np.array(scores)[:, 0, ...] # flatten over joblib output.
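A minimal sketch of cross_val_multiscore with a plain scikit-learn classifier on synthetic data (in MNE workflows the estimator is often a SlidingEstimator instead):

import numpy as np
from sklearn.linear_model import LogisticRegression
from mne.decoding import cross_val_multiscore

rng = np.random.default_rng(0)
X = rng.standard_normal((60, 10))
y = np.arange(60) % 2

scores = cross_val_multiscore(LogisticRegression(), X, y, cv=5, n_jobs=1)
print(scores.shape, scores.mean())  # (5,) and the mean cross-validated accuracy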
# This verbose is necessary to properly set the verbosity level
# during parallelization
@verbose
def _fit_and_score(
estimator,
X,
y,
scorer,
train,
test,
parameters,
fit_params,
return_train_score=False,
return_parameters=False,
return_n_test_samples=False,
return_times=False,
error_score="raise",
*,
verbose=None,
position=0,
):
"""Fit estimator and compute scores for a given dataset split."""
# This code is adapted from sklearn
from sklearn.model_selection import _validation
from sklearn.utils.metaestimators import _safe_split
from sklearn.utils.validation import _num_samples
# Adjust length of sample weights
fit_params = fit_params if fit_params is not None else {}
fit_params = {
k: _validation._index_param_value(X, v, train) for k, v in fit_params.items()
}
if parameters is not None:
estimator.set_params(**parameters)
start_time = dt.datetime.now()
X_train, y_train = _safe_split(estimator, X, y, train)
X_test, y_test = _safe_split(estimator, X, y, test, train)
try:
if y_train is None:
estimator.fit(X_train, **fit_params)
else:
estimator.fit(X_train, y_train, **fit_params)
except Exception as e:
# Note fit time as time until error
fit_duration = dt.datetime.now() - start_time
score_duration = dt.timedelta(0)
if error_score == "raise":
raise
elif isinstance(error_score, numbers.Number):
test_score = error_score
if return_train_score:
train_score = error_score
warn(
"Classifier fit failed. The score on this train-test partition for "
f"these parameters will be set to {error_score}. Details: \n{e!r}"
)
else:
raise ValueError(
"error_score must be the string 'raise' or a numeric value. (Hint: if "
"using 'raise', please make sure that it has been spelled correctly.)"
)
else:
fit_duration = dt.datetime.now() - start_time
test_score = _score(estimator, X_test, y_test, scorer)
score_duration = dt.datetime.now() - start_time - fit_duration
if return_train_score:
train_score = _score(estimator, X_train, y_train, scorer)
ret = [train_score, test_score] if return_train_score else [test_score]
if return_n_test_samples:
ret.append(_num_samples(X_test))
if return_times:
ret.extend([fit_duration.total_seconds(), score_duration.total_seconds()])
if return_parameters:
ret.append(parameters)
return ret
def _score(estimator, X_test, y_test, scorer):
"""Compute the score of an estimator on a given test set.
This code is the same as sklearn.model_selection._validation._score
but accepts to output arrays instead of floats.
"""
if y_test is None:
score = scorer(estimator, X_test)
else:
score = scorer(estimator, X_test, y_test)
if hasattr(score, "item"):
try:
# e.g. unwrap memmapped scalars
score = score.item()
except ValueError:
# non-scalar?
pass
return score

mne/decoding/csp.py (new file, 1003 lines)

File diff suppressed because it is too large.

mne/decoding/ems.py (new file)

@@ -0,0 +1,221 @@
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
from collections import Counter
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from .._fiff.pick import _picks_to_idx, pick_info, pick_types
from ..parallel import parallel_func
from ..utils import logger, verbose
from .base import _set_cv
class EMS(TransformerMixin, BaseEstimator):
"""Transformer to compute event-matched spatial filters.
This version of EMS :footcite:`SchurgerEtAl2013` operates on the entire
time course. No time
window needs to be specified. The result is a spatial filter at each
time point and a corresponding time course. Intuitively, the result
gives the similarity between the filter at each time point and the
data vector (sensors) at that time point.
.. note:: EMS only works for binary classification.
Attributes
----------
filters_ : ndarray, shape (n_channels, n_times)
The set of spatial filters.
classes_ : ndarray, shape (n_classes,)
The target classes.
References
----------
.. footbibliography::
"""
def __repr__(self): # noqa: D105
if hasattr(self, "filters_"):
return (
f"<EMS: fitted with {len(self.filters_)} filters "
f"on {len(self.classes_)} classes.>"
)
else:
return "<EMS: not fitted.>"
def fit(self, X, y):
"""Fit the spatial filters.
.. note:: EMS is fitted on data normalized by channel type before the
fitting of the spatial filters.
Parameters
----------
X : array, shape (n_epochs, n_channels, n_times)
The training data.
y : array of int, shape (n_epochs)
The target classes.
Returns
-------
self : instance of EMS
Returns self.
"""
classes = np.unique(y)
if len(classes) != 2:
raise ValueError("EMS only works for binary classification.")
self.classes_ = classes
filters = X[y == classes[0]].mean(0) - X[y == classes[1]].mean(0)
filters /= np.linalg.norm(filters, axis=0)[None, :]
self.filters_ = filters
return self
def transform(self, X):
"""Transform the data by the spatial filters.
Parameters
----------
X : array, shape (n_epochs, n_channels, n_times)
The input data.
Returns
-------
X : array, shape (n_epochs, n_times)
The input data transformed by the spatial filters.
"""
Xt = np.sum(X * self.filters_, axis=1)
return Xt
@verbose
def compute_ems(
epochs, conditions=None, picks=None, n_jobs=None, cv=None, verbose=None
):
"""Compute event-matched spatial filter on epochs.
This version of EMS :footcite:`SchurgerEtAl2013` operates on the entire
time course. No time
window needs to be specified. The result is a spatial filter at each
time point and a corresponding time course. Intuitively, the result
gives the similarity between the filter at each time point and the
data vector (sensors) at that time point.
.. note:: EMS only works for binary classification.
.. note:: The present function applies a leave-one-out cross-validation,
following Schurger et al's paper. However, we recommend using
a stratified k-fold cross-validation. Indeed, leave-one-out tends
to overfit and cannot be used to estimate the variance of the
prediction within a given fold.
.. note:: Because of the leave-one-out, this function needs an equal
number of epochs in each of the two conditions.
Parameters
----------
epochs : instance of mne.Epochs
The epochs.
conditions : list of str | None, default None
If a list of strings, strings must match the epochs.event_id's key as
well as the number of conditions supported by the objective_function.
If None keys in epochs.event_id are used.
%(picks_good_data)s
%(n_jobs)s
cv : cross-validation object | str | None, default LeaveOneOut
The cross-validation scheme.
%(verbose)s
Returns
-------
surrogate_trials : ndarray, shape (n_trials // 2, n_times)
The trial surrogates.
mean_spatial_filter : ndarray, shape (n_channels, n_times)
The set of spatial filters.
conditions : ndarray, shape (n_classes,)
The conditions used. Values correspond to original event ids.
References
----------
.. footbibliography::
"""
logger.info("...computing surrogate time series. This can take some time")
# Default to leave-one-out cv
cv = "LeaveOneOut" if cv is None else cv
picks = _picks_to_idx(epochs.info, picks)
if not len(set(Counter(epochs.events[:, 2]).values())) == 1:
raise ValueError(
"The same number of epochs is required by "
"this function. Please consider "
"`epochs.equalize_event_counts`"
)
if conditions is None:
conditions = epochs.event_id.keys()
epochs = epochs.copy()
else:
epochs = epochs[conditions]
epochs.drop_bad()
if len(conditions) != 2:
raise ValueError(
"Currently this function expects exactly 2 "
f"conditions but you gave me {len(conditions)}"
)
ev = epochs.events[:, 2]
# Special care to avoid path dependent mappings and orders
conditions = list(sorted(conditions))
cond_idx = [np.where(ev == epochs.event_id[k])[0] for k in conditions]
info = pick_info(epochs.info, picks)
data = epochs.get_data(picks=picks)
# Scale (z-score) the data by channel type
# XXX the z-scoring is applied outside the CV, which is not standard.
for ch_type in ["mag", "grad", "eeg"]:
if ch_type in epochs:
# FIXME should be applied to all sort of data channels
if ch_type == "eeg":
this_picks = pick_types(info, meg=False, eeg=True)
else:
this_picks = pick_types(info, meg=ch_type, eeg=False)
data[:, this_picks] /= np.std(data[:, this_picks])
# Setup cross-validation. Need to use _set_cv to deal with sklearn
# deprecation of cv objects.
y = epochs.events[:, 2]
_, cv_splits = _set_cv(cv, "classifier", X=y, y=y)
parallel, p_func, n_jobs = parallel_func(_run_ems, n_jobs=n_jobs)
# FIXME this parallelization should be removed.
# 1) it's numpy computation so it's already efficient,
# 2) it duplicates the data in RAM,
# 3) the computation is already super fast.
out = parallel(
p_func(_ems_diff, data, cond_idx, train, test) for train, test in cv_splits
)
surrogate_trials, spatial_filter = zip(*out)
surrogate_trials = np.array(surrogate_trials)
spatial_filter = np.mean(spatial_filter, axis=0)
return surrogate_trials, spatial_filter, epochs.events[:, 2]
def _ems_diff(data0, data1):
"""Compute the default diff objective function."""
return np.mean(data0, axis=0) - np.mean(data1, axis=0)
def _run_ems(objective_function, data, cond_idx, train, test):
"""Run EMS."""
d = objective_function(*(data[np.intersect1d(c, train)] for c in cond_idx))
d /= np.sqrt(np.sum(d**2, axis=0))[None, :]
# compute surrogates
return np.sum(data[test[0]] * d, axis=0), d
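As an illustrative sketch (synthetic epochs array; in practice compute_ems is called on an mne.Epochs object), the EMS transformer yields one spatial filter per time point and one surrogate time course per epoch:

import numpy as np
from mne.decoding import EMS

rng = np.random.default_rng(0)
X = rng.standard_normal((40, 16, 50))    # epochs x channels x times
y = np.repeat([1, 2], 20)                # two conditions

ems = EMS().fit(X, y)
print(ems.filters_.shape)                # (16, 50): one spatial filter per time point
print(ems.transform(X).shape)            # (40, 50): surrogate time course per epoch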

mne/decoding/receptive_field.py (new file)

@@ -0,0 +1,521 @@
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
import numbers
import numpy as np
from scipy.stats import pearsonr
from sklearn.base import (
BaseEstimator,
MetaEstimatorMixin,
clone,
is_regressor,
)
from sklearn.exceptions import NotFittedError
from sklearn.metrics import r2_score
from ..utils import _validate_type, fill_doc, pinv
from .base import _check_estimator, get_coef
from .time_delaying_ridge import TimeDelayingRidge
@fill_doc
class ReceptiveField(MetaEstimatorMixin, BaseEstimator):
"""Fit a receptive field model.
This allows you to fit an encoding model (stimulus to brain) or a decoding
model (brain to stimulus) using time-lagged input features (for example, a
spectro- or spatio-temporal receptive field, or STRF)
:footcite:`TheunissenEtAl2001,WillmoreSmyth2003,CrosseEtAl2016,HoldgrafEtAl2016`.
Parameters
----------
tmin : float
The starting lag, in seconds (or samples if ``sfreq`` == 1).
tmax : float
The ending lag, in seconds (or samples if ``sfreq`` == 1).
Must be >= tmin.
sfreq : float
The sampling frequency used to convert times into samples.
feature_names : array, shape (n_features,) | None
Names for input features to the model. If None, feature names will
be auto-generated from the shape of input data after running `fit`.
estimator : instance of sklearn.base.BaseEstimator | float | None
The model used in fitting inputs and outputs. This can be any
scikit-learn-style model that contains a fit and predict method. If a
float is passed, it will be interpreted as the ``alpha`` parameter
to be passed to a Ridge regression model. If `None`, then a Ridge
regression model with an alpha of 0 will be used.
fit_intercept : bool | None
If True (default), the sample mean is removed before fitting.
If ``estimator`` is a :class:`sklearn.base.BaseEstimator`,
this must be None or match ``estimator.fit_intercept``.
scoring : ['r2', 'corrcoef']
Defines how predictions will be scored. Currently must be one of
'r2' (coefficient of determination) or 'corrcoef' (the correlation
coefficient).
patterns : bool
If True, inverse coefficients will be computed upon fitting using the
covariance matrix of the inputs, and the cross-covariance of the
inputs/outputs, according to :footcite:`HaufeEtAl2014`. Defaults to
False.
n_jobs : int | str
Number of jobs to run in parallel. Can be 'cuda' if CuPy
is installed properly and ``estimator is None``.
.. versionadded:: 0.18
edge_correction : bool
If True (default), correct the autocorrelation coefficients for
non-zero delays for the fact that fewer samples are available.
Disabling this speeds up performance at the cost of accuracy
depending on the relationship between epoch length and model
duration. Only used if ``estimator`` is float or None.
.. versionadded:: 0.18
Attributes
----------
coef_ : array, shape ([n_outputs, ]n_features, n_delays)
The coefficients from the model fit, reshaped for easy visualization.
During :meth:`mne.decoding.ReceptiveField.fit`, if ``y`` has one
dimension (time), the ``n_outputs`` dimension here is omitted.
patterns_ : array, shape ([n_outputs, ]n_features, n_delays)
If fit, the inverted coefficients from the model.
delays_ : array, shape (n_delays,), dtype int
The delays used to fit the model, in indices. To return the delays
in seconds, use ``self.delays_ / self.sfreq``
valid_samples_ : slice
The rows to keep during model fitting after removing rows with
missing values due to time delaying. This can be used to get an
output equivalent to using :func:`numpy.convolve` or
:func:`numpy.correlate` with ``mode='valid'``.
See Also
--------
mne.decoding.TimeDelayingRidge
Notes
-----
For a causal system, the encoding model will have significant
non-zero values only at positive lags. In other words, lags point
backward in time relative to the input, so positive lags correspond
to previous input time samples, while negative lags correspond to
future input time samples.
References
----------
.. footbibliography::
""" # noqa E501
def __init__(
self,
tmin,
tmax,
sfreq,
feature_names=None,
estimator=None,
fit_intercept=None,
scoring="r2",
patterns=False,
n_jobs=None,
edge_correction=True,
):
self.tmin = tmin
self.tmax = tmax
self.sfreq = sfreq
self.feature_names = feature_names
self.estimator = 0.0 if estimator is None else estimator
self.fit_intercept = fit_intercept
self.scoring = scoring
self.patterns = patterns
self.n_jobs = n_jobs
self.edge_correction = edge_correction
def __repr__(self): # noqa: D105
s = f"tmin, tmax : ({self.tmin:.3f}, {self.tmax:.3f}), "
estimator = self.estimator
if not isinstance(estimator, str):
estimator = type(self.estimator)
s += f"estimator : {estimator}, "
if hasattr(self, "coef_"):
if self.feature_names is not None:
feats = self.feature_names
if len(feats) == 1:
s += f"feature: {feats[0]}, "
else:
s += f"features : [{feats[0]}, ..., {feats[-1]}], "
s += "fit: True"
else:
s += "fit: False"
if hasattr(self, "scores_"):
s += f"scored ({self.scoring})"
return f"<ReceptiveField | {s}>"
def _delay_and_reshape(self, X, y=None):
"""Delay and reshape the variables."""
if not isinstance(self.estimator_, TimeDelayingRidge):
# X is now shape (n_times, n_epochs, n_feats, n_delays)
X = _delay_time_series(
X,
self.tmin,
self.tmax,
self.sfreq_,
fill_mean=self.fit_intercept_,
)
X = _reshape_for_est(X)
# Concat times + epochs
if y is not None:
y = y.reshape(-1, y.shape[-1], order="F")
return X, y
def fit(self, X, y):
"""Fit a receptive field model.
Parameters
----------
X : array, shape (n_times[, n_epochs], n_features)
The input features for the model.
y : array, shape (n_times[, n_epochs][, n_outputs])
The output features for the model.
Returns
-------
self : instance
The instance so you can chain operations.
"""
if self.scoring not in _SCORERS.keys():
raise ValueError(
f"scoring must be one of {sorted(_SCORERS.keys())}, got {self.scoring} "
)
self.sfreq_ = float(self.sfreq)
X, y, _, self._y_dim = self._check_dimensions(X, y)
if self.tmin > self.tmax:
raise ValueError(f"tmin ({self.tmin}) must be at most tmax ({self.tmax})")
# Initialize delays
self.delays_ = _times_to_delays(self.tmin, self.tmax, self.sfreq_)
# Define the slice that we should use in the middle
self.valid_samples_ = _delays_to_slice(self.delays_)
if isinstance(self.estimator, numbers.Real):
if self.fit_intercept is None:
self.fit_intercept_ = True
else:
self.fit_intercept_ = self.fit_intercept
estimator = TimeDelayingRidge(
self.tmin,
self.tmax,
self.sfreq_,
alpha=self.estimator,
fit_intercept=self.fit_intercept_,
n_jobs=self.n_jobs,
edge_correction=self.edge_correction,
)
elif is_regressor(self.estimator):
estimator = clone(self.estimator)
if (
self.fit_intercept is not None
and estimator.fit_intercept != self.fit_intercept
):
raise ValueError(
f"Estimator fit_intercept ({estimator.fit_intercept}) != "
f"initialization fit_intercept ({self.fit_intercept}), initialize "
"ReceptiveField with the same fit_intercept value or use "
"fit_intercept=None"
)
self.fit_intercept_ = estimator.fit_intercept
else:
raise ValueError(
"`estimator` must be a float or an instance of `BaseEstimator`, got "
f"type {self.estimator}."
)
self.estimator_ = estimator
del estimator
_check_estimator(self.estimator_)
# Create input features
n_times, n_epochs, n_feats = X.shape
n_outputs = y.shape[-1]
n_delays = len(self.delays_)
# Update feature names if we have none
if (self.feature_names is not None) and (len(self.feature_names) != n_feats):
raise ValueError(
f"n_features in X does not match feature names ({n_feats} != "
f"{len(self.feature_names)})"
)
# Create input features
X, y = self._delay_and_reshape(X, y)
self.estimator_.fit(X, y)
coef = get_coef(self.estimator_, "coef_") # (n_targets, n_features)
shape = [n_feats, n_delays]
if self._y_dim > 1:
shape.insert(0, -1)
self.coef_ = coef.reshape(shape)
# Inverse-transform model weights
if self.patterns:
if isinstance(self.estimator_, TimeDelayingRidge):
cov_ = self.estimator_.cov_ / float(n_times * n_epochs - 1)
y = y.reshape(-1, y.shape[-1], order="F")
else:
X = X - X.mean(0, keepdims=True)
cov_ = np.cov(X.T)
del X
# Inverse output covariance
if y.ndim == 2 and y.shape[1] != 1:
y = y - y.mean(0, keepdims=True)
inv_Y = pinv(np.cov(y.T))
else:
inv_Y = 1.0 / float(n_times * n_epochs - 1)
del y
# Inverse coef according to Haufe's method
# patterns has shape (n_feats * n_delays, n_outputs)
coef = np.reshape(self.coef_, (n_feats * n_delays, n_outputs))
patterns = cov_.dot(coef.dot(inv_Y))
self.patterns_ = patterns.reshape(shape)
return self
def predict(self, X):
"""Generate predictions with a receptive field.
Parameters
----------
X : array, shape (n_times[, n_epochs], n_channels)
The input features for the model.
Returns
-------
y_pred : array, shape (n_times[, n_epochs][, n_outputs])
The output predictions. Note that valid samples (those
unaffected by edge artifacts during the time delaying step) can
be obtained using ``y_pred[rf.valid_samples_]``.
"""
if not hasattr(self, "delays_"):
raise NotFittedError("Estimator has not been fit yet.")
X, _, X_dim = self._check_dimensions(X, None, predict=True)[:3]
del _
# convert to sklearn and back
pred_shape = X.shape[:-1]
if self._y_dim > 1:
pred_shape = pred_shape + (self.coef_.shape[0],)
X, _ = self._delay_and_reshape(X)
y_pred = self.estimator_.predict(X)
y_pred = y_pred.reshape(pred_shape, order="F")
shape = list(y_pred.shape)
if X_dim <= 2:
shape.pop(1) # epochs
extra = 0
else:
extra = 1
shape = shape[: self._y_dim + extra]
y_pred.shape = shape
return y_pred
def score(self, X, y):
"""Score predictions generated with a receptive field.
This calls ``self.predict``, then masks the output of this
and ``y`` with ``self.valid_samples_``. Finally, it passes
this to a :mod:`sklearn.metrics` scorer.
Parameters
----------
X : array, shape (n_times[, n_epochs], n_channels)
The input features for the model.
y : array, shape (n_times[, n_epochs][, n_outputs])
Used for scikit-learn compatibility.
Returns
-------
scores : list of float, shape (n_outputs,)
The scores estimated by the model for each output (e.g. mean
R2 of ``predict(X)``).
"""
# Create our scoring object
scorer_ = _SCORERS[self.scoring]
# Generate predictions, then reshape so we can mask time
X, y = self._check_dimensions(X, y, predict=True)[:2]
n_times, n_epochs, n_outputs = y.shape
y_pred = self.predict(X)
y_pred = y_pred[self.valid_samples_]
y = y[self.valid_samples_]
# Re-vectorize and call scorer
y = y.reshape([-1, n_outputs], order="F")
y_pred = y_pred.reshape([-1, n_outputs], order="F")
assert y.shape == y_pred.shape
scores = scorer_(y, y_pred, multioutput="raw_values")
return scores
def _check_dimensions(self, X, y, predict=False):
_validate_type(X, "array-like", "X")
_validate_type(y, ("array-like", None), "y")
X_dim = X.ndim
y_dim = y.ndim if y is not None else 0
if X_dim == 2:
# Ensure we have a 3D input by adding singleton epochs dimension
X = X[:, np.newaxis, :]
if y is not None:
if y_dim == 1:
y = y[:, np.newaxis, np.newaxis] # epochs, outputs
elif y_dim == 2:
y = y[:, np.newaxis, :] # epochs
else:
raise ValueError(
"y must be shape (n_times[, n_epochs][,n_outputs], got "
f"{y.shape}"
)
elif X.ndim == 3:
if y is not None:
if y.ndim == 2:
y = y[:, :, np.newaxis] # Add an outputs dim
elif y.ndim != 3:
raise ValueError(
"If X has 3 dimensions, y must have 2 or 3 dimensions"
)
else:
raise ValueError(
f"X must be shape (n_times[, n_epochs], n_features), got {X.shape}"
)
if y is not None:
if X.shape[0] != y.shape[0]:
raise ValueError(
f"X and y do not have the same n_times\n{X.shape[0]} != "
f"{y.shape[0]}"
)
if X.shape[1] != y.shape[1]:
raise ValueError(
f"X and y do not have the same n_epochs\n{X.shape[1]} != "
f"{y.shape[1]}"
)
if predict and y.shape[-1] not in (len(self.estimator_.coef_), 1):
raise ValueError(
"Number of outputs does not match estimator coefficients dimensions"
)
return X, y, X_dim, y_dim
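As a hedged sketch on synthetic data (the lag window, ridge alpha, and signal are chosen only for illustration), an encoding model from a three-feature stimulus to a single output channel looks like:

import numpy as np
from mne.decoding import ReceptiveField

rng = np.random.default_rng(0)
sfreq = 100.0
X = rng.standard_normal((1000, 3))                      # times x features (stimulus)
y = np.convolve(X[:, 0], np.hanning(10), mode="same")   # lagged response to feature 0

rf = ReceptiveField(tmin=-0.05, tmax=0.1, sfreq=sfreq, estimator=1.0, scoring="corrcoef")
rf.fit(X, y)
print(rf.coef_.shape)   # (3, 16): n_features x n_delays
print(rf.score(X, y))   # correlation, computed on rf.valid_samples_ only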
def _delay_time_series(X, tmin, tmax, sfreq, fill_mean=False):
"""Return a time-lagged input time series.
Parameters
----------
X : array, shape (n_times[, n_epochs], n_features)
The time series to delay. Must be 2D or 3D.
tmin : int | float
The starting lag.
tmax : int | float
The ending lag.
Must be >= tmin.
sfreq : int | float
The sampling frequency of the series. Defaults to 1.0.
fill_mean : bool
If True, the fill value will be the mean along the time dimension
of the feature, and each cropped and delayed segment of data
will be shifted to have the same mean value (ensuring that mean
subtraction works properly). If False, the fill value will be zero.
Returns
-------
delayed : array, shape(n_times[, n_epochs][, n_features], n_delays)
The delayed data. It has the same shape as X, with an extra dimension
appended to the end.
Examples
--------
>>> tmin, tmax = -0.1, 0.2
>>> sfreq = 10.
>>> x = np.arange(1, 6)
>>> x_del = _delay_time_series(x, tmin, tmax, sfreq)
>>> print(x_del) # doctest:+SKIP
[[2. 1. 0. 0.]
[3. 2. 1. 0.]
[4. 3. 2. 1.]
[5. 4. 3. 2.]
[0. 5. 4. 3.]]
"""
_check_delayer_params(tmin, tmax, sfreq)
delays = _times_to_delays(tmin, tmax, sfreq)
# Iterate through indices and append
delayed = np.zeros(X.shape + (len(delays),))
if fill_mean:
mean_value = X.mean(axis=0)
if X.ndim == 3:
mean_value = np.mean(mean_value, axis=0)
delayed[:] = mean_value[:, np.newaxis]
for ii, ix_delay in enumerate(delays):
# Create zeros to populate w/ delays
if ix_delay < 0:
out = delayed[:ix_delay, ..., ii]
use_X = X[-ix_delay:]
elif ix_delay > 0:
out = delayed[ix_delay:, ..., ii]
use_X = X[:-ix_delay]
else: # == 0
out = delayed[..., ii]
use_X = X
out[:] = use_X
if fill_mean:
out[:] += mean_value - use_X.mean(axis=0)
return delayed
def _times_to_delays(tmin, tmax, sfreq):
"""Convert a tmin/tmax in seconds to delays."""
# Convert seconds to samples
delays = np.arange(int(np.round(tmin * sfreq)), int(np.round(tmax * sfreq) + 1))
return delays
def _delays_to_slice(delays):
"""Find the slice to be taken in order to remove missing values."""
# Positive delays == cut off rows at the beginning
min_delay = None if delays[-1] <= 0 else delays[-1]
# Negative delays == cut off rows at the end
max_delay = None if delays[0] >= 0 else delays[0]
return slice(min_delay, max_delay)
def _check_delayer_params(tmin, tmax, sfreq):
"""Check delayer input parameters. For future custom delay support."""
_validate_type(sfreq, "numeric", "`sfreq`")
for tlim in (tmin, tmax):
_validate_type(tlim, "numeric", "tmin/tmax")
if not tmin <= tmax:
raise ValueError("tmin must be <= tmax")
def _reshape_for_est(X_del):
"""Convert X_del to a sklearn-compatible shape."""
n_times, n_epochs, n_feats, n_delays = X_del.shape
X_del = X_del.reshape(n_times, n_epochs, -1) # concatenate feats
X_del = X_del.reshape(n_times * n_epochs, -1, order="F")
return X_del
# Create a correlation scikit-learn-style scorer
def _corr_score(y_true, y, multioutput=None):
assert multioutput == "raw_values"
for this_y in (y_true, y):
if this_y.ndim != 2:
raise ValueError(
f"inputs must be shape (samples, outputs), got {this_y.shape}"
)
return np.array([pearsonr(y_true[:, ii], y[:, ii])[0] for ii in range(y.shape[-1])])
def _r2_score(y_true, y, multioutput=None):
return r2_score(y_true, y, multioutput=multioutput)
_SCORERS = {"r2": _r2_score, "corrcoef": _corr_score}

mne/decoding/search_light.py (new file)

@@ -0,0 +1,759 @@
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
import logging
import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin, TransformerMixin, clone
from sklearn.metrics import check_scoring
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_array
from ..parallel import parallel_func
from ..utils import ProgressBar, _parse_verbose, array_split_idx, fill_doc, verbose
from .base import _check_estimator
@fill_doc
class SlidingEstimator(MetaEstimatorMixin, TransformerMixin, BaseEstimator):
"""Search Light.
Fit, predict and score a series of models to each subset of the dataset
along the last dimension. Each entry in the last dimension is referred
to as a task.
Parameters
----------
%(base_estimator)s
%(scoring)s
%(n_jobs)s
%(position)s
%(allow_2d)s
%(verbose)s
Attributes
----------
estimators_ : array-like, shape (n_tasks,)
List of fitted scikit-learn estimators (one per task).
"""
@verbose
def __init__(
self,
base_estimator,
scoring=None,
n_jobs=None,
*,
position=0,
allow_2d=False,
verbose=None,
):
_check_estimator(base_estimator)
self.base_estimator = base_estimator
self.n_jobs = n_jobs
self.scoring = scoring
self.position = position
self.allow_2d = allow_2d
self.verbose = verbose
@property
def _estimator_type(self):
return getattr(self.base_estimator, "_estimator_type", None)
def __sklearn_tags__(self):
"""Get sklearn tags."""
from sklearn.utils import get_tags
tags = super().__sklearn_tags__()
sub_tags = get_tags(self.base_estimator)
tags.estimator_type = sub_tags.estimator_type
for kind in ("classifier", "regressor", "transformer"):
if tags.estimator_type == kind:
attr = f"{kind}_tags"
setattr(tags, attr, getattr(sub_tags, attr))
break
return tags
def __repr__(self): # noqa: D105
repr_str = "<" + super().__repr__()
if hasattr(self, "estimators_"):
repr_str = repr_str[:-1]
repr_str += f", fitted with {len(self.estimators_)} estimators"
return repr_str + ">"
def fit(self, X, y, **fit_params):
"""Fit a series of independent estimators to the dataset.
Parameters
----------
X : array, shape (n_samples, nd_features, n_tasks)
The training input samples. For each data slice, a clone estimator
is fitted independently. The feature dimension can be
multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_tasks).
y : array, shape (n_samples,) | (n_samples, n_targets)
The target values.
**fit_params : dict of string -> object
Parameters to pass to the fit method of the estimator.
Returns
-------
self : object
Return self.
"""
X = self._check_Xy(X, y)
parallel, p_func, n_jobs = parallel_func(
_sl_fit, self.n_jobs, max_jobs=X.shape[-1], verbose=False
)
self.estimators_ = list()
self.fit_params_ = fit_params
# For fitting, the parallelization is across estimators.
context = _create_progressbar_context(self, X, "Fitting")
with context as pb:
estimators = parallel(
p_func(self.base_estimator, split, y, pb.subset(pb_idx), **fit_params)
for pb_idx, split in array_split_idx(X, n_jobs, axis=-1)
)
# Each parallel job can have a different number of training estimators
# We can't directly concatenate them because of sklearn's Bagging API
# (see scikit-learn #9720)
self.estimators_ = np.empty(X.shape[-1], dtype=object)
idx = 0
for job_estimators in estimators:
for est in job_estimators:
self.estimators_[idx] = est
idx += 1
return self
def fit_transform(self, X, y, **fit_params):
"""Fit and transform a series of independent estimators to the dataset.
Parameters
----------
X : array, shape (n_samples, nd_features, n_tasks)
The training input samples. For each task, a clone estimator
is fitted independently. The feature dimension can be
multidimensional, e.g.::
X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
y : array, shape (n_samples,) | (n_samples, n_targets)
The target values.
**fit_params : dict of string -> object
Parameters to pass to the fit method of the estimator.
Returns
-------
y_pred : array, shape (n_samples, n_tasks) | (n_samples, n_tasks, n_targets)
The predicted values for each estimator.
""" # noqa: E501
return self.fit(X, y, **fit_params).transform(X)
def _transform(self, X, method):
"""Aux. function to make parallel predictions/transformation."""
X = self._check_Xy(X)
method = _check_method(self.base_estimator, method)
if X.shape[-1] != len(self.estimators_):
raise ValueError("The number of estimators does not match X.shape[-1]")
# For predictions/transforms the parallelization is across the data and
# not across the estimators to avoid memory load.
parallel, p_func, n_jobs = parallel_func(
_sl_transform, self.n_jobs, max_jobs=X.shape[-1], verbose=False
)
X_splits = np.array_split(X, n_jobs, axis=-1)
idx, est_splits = zip(*array_split_idx(self.estimators_, n_jobs))
context = _create_progressbar_context(self, X, "Transforming")
with context as pb:
y_pred = parallel(
p_func(est, x, method, pb.subset(pb_idx))
for pb_idx, est, x in zip(idx, est_splits, X_splits)
)
y_pred = np.concatenate(y_pred, axis=1)
return y_pred
def transform(self, X):
"""Transform each data slice/task with a series of independent estimators.
The number of tasks in X should match the number of tasks/estimators
given at fit time.
Parameters
----------
X : array, shape (n_samples, nd_features, n_tasks)
The input samples. For each data slice/task, the corresponding
estimator makes a transformation of the data, e.g.
``[estimators[ii].transform(X[..., ii]) for ii in range(n_estimators)]``.
The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_tasks).
Returns
-------
Xt : array, shape (n_samples, n_estimators)
The transformed values generated by each estimator.
""" # noqa: E501
return self._transform(X, "transform").astype(X.dtype)
def predict(self, X):
"""Predict each data slice/task with a series of independent estimators.
The number of tasks in X should match the number of tasks/estimators
given at fit time.
Parameters
----------
X : array, shape (n_samples, nd_features, n_tasks)
The input samples. For each data slice, the corresponding estimator
makes the sample predictions, e.g.:
``[estimators[ii].predict(X[..., ii]) for ii in range(n_estimators)]``.
The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_tasks).
Returns
-------
y_pred : array, shape (n_samples, n_estimators) | (n_samples, n_tasks, n_targets)
Predicted values for each estimator/data slice.
""" # noqa: E501
return self._transform(X, "predict")
def predict_proba(self, X):
"""Predict each data slice with a series of independent estimators.
The number of tasks in X should match the number of tasks/estimators
given at fit time.
Parameters
----------
X : array, shape (n_samples, nd_features, n_tasks)
The input samples. For each data slice, the corresponding estimator
makes the sample probabilistic predictions, e.g.:
``[estimators[ii].predict_proba(X[..., ii]) for ii in range(n_estimators)]``.
The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_tasks).
Returns
-------
y_pred : array, shape (n_samples, n_tasks, n_classes)
Predicted probabilities for each estimator/data slice/task.
""" # noqa: E501
return self._transform(X, "predict_proba")
def decision_function(self, X):
"""Estimate distances of each data slice to the hyperplanes.
Parameters
----------
X : array, shape (n_samples, nd_features, n_tasks)
The input samples. For each data slice, the corresponding estimator
outputs the distance to the hyperplane, e.g.:
``[estimators[ii].decision_function(X[..., ii]) for ii in range(n_estimators)]``.
The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_estimators).
Returns
-------
y_pred : array, shape (n_samples, n_estimators, n_classes * (n_classes-1) // 2)
Predicted distances for each estimator/data slice.
Notes
-----
This requires base_estimator to have a ``decision_function`` method.
""" # noqa: E501
return self._transform(X, "decision_function")
def _check_Xy(self, X, y=None):
"""Aux. function to check input data."""
# Once we require sklearn 1.1+ we should do something like:
X = check_array(X, ensure_2d=False, allow_nd=True, input_name="X")
if y is not None:
y = check_array(y, dtype=None, ensure_2d=False, input_name="y")
if len(X) != len(y) or len(y) < 1:
raise ValueError("X and y must have the same length.")
if X.ndim < 3:
err = None
if not self.allow_2d:
err = 3
elif X.ndim < 2:
err = 2
if err:
raise ValueError(f"X must have at least {err} dimensions.")
X = X[..., np.newaxis]
return X
def score(self, X, y):
"""Score each estimator on each task.
The number of tasks in X should match the number of tasks/estimators
given at fit time, i.e. we need
``X.shape[-1] == len(self.estimators_)``.
Parameters
----------
X : array, shape (n_samples, nd_features, n_tasks)
The input samples. For each data slice, the corresponding estimator
scores the prediction, e.g.:
``[estimators[ii].score(X[..., ii], y) for ii in range(n_estimators)]``.
The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_tasks).
y : array, shape (n_samples,) | (n_samples, n_targets)
The target values.
Returns
-------
score : array, shape (n_tasks,)
Score for each estimator/task.
""" # noqa: E501
X = self._check_Xy(X, y)
if X.shape[-1] != len(self.estimators_):
raise ValueError("The number of estimators does not match X.shape[-1]")
scoring = check_scoring(self.base_estimator, self.scoring)
y = _fix_auc(scoring, y)
# For predictions/transforms the parallelization is across the data and
# not across the estimators to avoid memory load.
parallel, p_func, n_jobs = parallel_func(
_sl_score, self.n_jobs, max_jobs=X.shape[-1], verbose=False
)
X_splits = np.array_split(X, n_jobs, axis=-1)
est_splits = np.array_split(self.estimators_, n_jobs)
score = parallel(
p_func(est, scoring, x, y) for (est, x) in zip(est_splits, X_splits)
)
score = np.concatenate(score, axis=0)
return score
@property
def classes_(self):
if not hasattr(self.estimators_[0], "classes_"):
raise AttributeError(
"classes_ attribute available only if base_estimator has it, and "
f"estimator {self.estimators_[0]} does not"
)
return self.estimators_[0].classes_
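As a rough sketch on synthetic data (shapes and scorer chosen only for illustration), one classifier is fitted and scored per time point:

import numpy as np
from sklearn.linear_model import LogisticRegression
from mne.decoding import SlidingEstimator

rng = np.random.default_rng(0)
X = rng.standard_normal((80, 20, 30))    # epochs x channels x times
y = np.arange(80) % 2

sl = SlidingEstimator(LogisticRegression(), scoring="roc_auc", n_jobs=1)
sl.fit(X, y)
print(len(sl.estimators_))    # 30: one fitted classifier per time point
print(sl.score(X, y).shape)   # (30,): one score per time point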
@fill_doc
def _sl_fit(estimator, X, y, pb, **fit_params):
"""Aux. function to fit SlidingEstimator in parallel.
Fit a clone estimator to each slice of data.
Parameters
----------
%(base_estimator)s
X : array, shape (n_samples, nd_features, n_estimators)
The target data. The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
y : array, shape (n_sample, )
The target values.
pb : instance of ProgressBar
The progress bar to update.
fit_params : dict | None
Parameters to pass to the fit method of the estimator.
Returns
-------
estimators_ : list of estimators
The fitted estimators.
"""
estimators_ = list()
for ii in range(X.shape[-1]):
est = clone(estimator)
est.fit(X[..., ii], y, **fit_params)
estimators_.append(est)
pb.update(ii + 1)
return estimators_
def _sl_transform(estimators, X, method, pb):
"""Aux. function to transform SlidingEstimator in parallel.
Applies transform/predict/decision_function etc for each slice of data.
Parameters
----------
estimators : list of estimators
The fitted estimators.
X : array, shape (n_samples, nd_features, n_estimators)
The target data. The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
method : str
The estimator method to use (e.g. 'predict', 'transform').
pb : instance of ProgressBar
The progress bar to update.
Returns
-------
y_pred : array, shape (n_samples, n_estimators, n_classes * (n_classes-1) // 2)
The transformations for each slice of data.
""" # noqa: E501
for ii, est in enumerate(estimators):
transform = getattr(est, method)
_y_pred = transform(X[..., ii])
# Initialize array of predictions on the first transform iteration
if ii == 0:
y_pred = _sl_init_pred(_y_pred, X)
y_pred[:, ii, ...] = _y_pred
pb.update(ii + 1)
return y_pred
def _sl_init_pred(y_pred, X):
"""Aux. function to SlidingEstimator to initialize y_pred."""
n_sample, n_tasks = X.shape[0], X.shape[-1]
y_pred = np.zeros((n_sample, n_tasks) + y_pred.shape[1:], y_pred.dtype)
return y_pred
def _sl_score(estimators, scoring, X, y):
"""Aux. function to score SlidingEstimator in parallel.
Predict and score each slice of data.
Parameters
----------
estimators : list, shape (n_tasks,)
The fitted estimators.
X : array, shape (n_samples, nd_features, n_tasks)
The target data. The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_tasks)
scoring : callable, str or None
If scoring is None (default), the predictions are internally
generated by estimator.score(). Else, we must first get the
predictions to pass them to ad-hoc scorer.
y : array, shape (n_samples,) | (n_samples, n_targets)
The target values.
Returns
-------
score : array, shape (n_tasks,)
The score for each task / slice of data.
"""
n_tasks = X.shape[-1]
score = np.zeros(n_tasks)
for ii, est in enumerate(estimators):
score[ii] = scoring(est, X[..., ii], y)
return score
def _check_method(estimator, method):
"""Check that an estimator has the method attribute.
If method == 'transform' and estimator does not have 'transform', use
'predict' instead.
"""
if method == "transform" and not hasattr(estimator, "transform"):
method = "predict"
if not hasattr(estimator, method):
raise ValueError(f"base_estimator does not have `{method}` method.")
return method
@fill_doc
class GeneralizingEstimator(SlidingEstimator):
"""Generalization Light.
Fit a search-light along the last dimension and use them to apply a
systematic cross-tasks generalization.
Parameters
----------
%(base_estimator)s
%(scoring)s
%(n_jobs)s
%(position)s
%(allow_2d)s
%(verbose)s
"""
def __repr__(self): # noqa: D105
repr_str = super().__repr__()
if hasattr(self, "estimators_"):
repr_str = repr_str[:-1]
repr_str += f", fitted with {len(self.estimators_)} estimators>"
return repr_str
def _transform(self, X, method):
"""Aux. function to make parallel predictions/transformation."""
X = self._check_Xy(X)
method = _check_method(self.base_estimator, method)
parallel, p_func, n_jobs = parallel_func(
_gl_transform, self.n_jobs, max_jobs=X.shape[-1], verbose=False
)
context = _create_progressbar_context(self, X, "Transforming")
with context as pb:
y_pred = parallel(
p_func(self.estimators_, x_split, method, pb.subset(pb_idx))
for pb_idx, x_split in array_split_idx(
X, n_jobs, axis=-1, n_per_split=len(self.estimators_)
)
)
y_pred = np.concatenate(y_pred, axis=2)
return y_pred
def transform(self, X):
"""Transform each data slice with all possible estimators.
Parameters
----------
X : array, shape (n_samples, nd_features, n_slices)
The input samples. For each estimator, the corresponding data slice is
used to make a transformation. The feature dimension can be
multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_estimators).
Returns
-------
Xt : array, shape (n_samples, n_estimators, n_slices)
The transformed values generated by each estimator.
"""
return self._transform(X, "transform")
def predict(self, X):
"""Predict each data slice with all possible estimators.
Parameters
----------
X : array, shape (n_samples, nd_features, n_slices)
The training input samples. For each data slice, a fitted estimator
predicts each slice of the data independently. The feature
dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_estimators).
Returns
-------
y_pred : array, shape (n_samples, n_estimators, n_slices) | (n_samples, n_estimators, n_slices, n_targets)
The predicted values for each estimator.
""" # noqa: E501
return self._transform(X, "predict")
def predict_proba(self, X):
"""Estimate probabilistic estimates of each data slice with all possible estimators.
Parameters
----------
X : array, shape (n_samples, nd_features, n_slices)
The training input samples. For each data slice, a fitted estimator
predicts a slice of the data. The feature dimension can be
multidimensional e.g.
``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.
Returns
-------
y_pred : array, shape (n_samples, n_estimators, n_slices, n_classes)
The predicted values for each estimator.
Notes
-----
This requires ``base_estimator`` to have a ``predict_proba`` method.
""" # noqa: E501
return self._transform(X, "predict_proba")
def decision_function(self, X):
"""Estimate distances of each data slice to all hyperplanes.
Parameters
----------
X : array, shape (n_samples, nd_features, n_slices)
The training input samples. Each estimator outputs the distance to
its hyperplane, e.g.:
``[estimators[ii].decision_function(X[..., ii]) for ii in range(n_estimators)]``.
The feature dimension can be multidimensional e.g.
``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.
Returns
-------
y_pred : array, shape (n_samples, n_estimators, n_slices, n_classes * (n_classes-1) // 2)
The predicted values for each estimator.
Notes
-----
This requires ``base_estimator`` to have a ``decision_function``
method.
""" # noqa: E501
return self._transform(X, "decision_function")
def score(self, X, y):
"""Score each of the estimators on the tested dimensions.
Parameters
----------
X : array, shape (n_samples, nd_features, n_slices)
The input samples. For each data slice, the corresponding estimator
scores the prediction, e.g.:
``[estimators[ii].score(X[..., ii], y) for ii in range(n_slices)]``.
The feature dimension can be multidimensional e.g.
``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.
y : array, shape (n_samples,) | (n_samples, n_targets)
The target values.
Returns
-------
score : array, shape (n_estimators, n_slices)
Score for each estimator / data slice couple.
""" # noqa: E501
X = self._check_Xy(X, y)
# For predictions/transforms the parallelization is across the data and
# not across the estimators to avoid memory load.
parallel, p_func, n_jobs = parallel_func(
_gl_score, self.n_jobs, max_jobs=X.shape[-1], verbose=False
)
scoring = check_scoring(self.base_estimator, self.scoring)
y = _fix_auc(scoring, y)
context = _create_progressbar_context(self, X, "Scoring")
with context as pb:
score = parallel(
p_func(self.estimators_, scoring, x, y, pb.subset(pb_idx))
for pb_idx, x in array_split_idx(
X, n_jobs, axis=-1, n_per_split=len(self.estimators_)
)
)
score = np.concatenate(score, axis=1)
return score
def _gl_transform(estimators, X, method, pb):
"""Transform the dataset.
This will apply each estimator to all slices of the data.
    Parameters
    ----------
    estimators : list of estimators
        The fitted estimators.
    X : array, shape (n_samples, nd_features, n_slices)
        The input samples. Each fitted estimator is applied to every slice
        of the data independently. The feature dimension can be
        multidimensional e.g.
        X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
method : str
The method to call for each estimator.
pb : instance of ProgressBar
The progress bar to update.
Returns
-------
    Xt : array, shape (n_samples, n_estimators, n_slices)
The transformed values generated by each estimator.
"""
n_sample, n_iter = X.shape[0], X.shape[-1]
for ii, est in enumerate(estimators):
# stack generalized data for faster prediction
X_stack = X.transpose(np.r_[0, X.ndim - 1, range(1, X.ndim - 1)])
X_stack = X_stack.reshape(np.r_[n_sample * n_iter, X_stack.shape[2:]])
transform = getattr(est, method)
_y_pred = transform(X_stack)
# unstack generalizations
if _y_pred.ndim == 2:
_y_pred = np.reshape(_y_pred, [n_sample, n_iter, _y_pred.shape[1]])
else:
shape = np.r_[n_sample, n_iter, _y_pred.shape[1:]].astype(int)
_y_pred = np.reshape(_y_pred, shape)
# Initialize array of predictions on the first transform iteration
if ii == 0:
y_pred = _gl_init_pred(_y_pred, X, len(estimators))
y_pred[:, ii, ...] = _y_pred
pb.update((ii + 1) * n_iter)
return y_pred
def _gl_init_pred(y_pred, X, n_train):
"""Aux. function to GeneralizingEstimator to initialize y_pred."""
n_sample, n_iter = X.shape[0], X.shape[-1]
if y_pred.ndim == 3:
y_pred = np.zeros((n_sample, n_train, n_iter, y_pred.shape[-1]), y_pred.dtype)
else:
y_pred = np.zeros((n_sample, n_train, n_iter), y_pred.dtype)
return y_pred
def _gl_score(estimators, scoring, X, y, pb):
"""Score GeneralizingEstimator in parallel.
Predict and score each slice of data.
Parameters
----------
estimators : list of estimators
The fitted estimators.
scoring : callable, string or None
If scoring is None (default), the predictions are internally
generated by estimator.score(). Else, we must first get the
predictions to pass them to ad-hoc scorer.
X : array, shape (n_samples, nd_features, n_slices)
The target data. The feature dimension can be multidimensional e.g.
X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
y : array, shape (n_samples,) | (n_samples, n_targets)
The target values.
pb : instance of ProgressBar
The progress bar to update.
Returns
-------
score : array, shape (n_estimators, n_slices)
The score for each slice of data.
"""
    # FIXME: The level of parallelization may be a bit high and memory
    # consuming. Perhaps it should be lowered to the loop across X slices.
score_shape = [len(estimators), X.shape[-1]]
for jj in range(X.shape[-1]):
for ii, est in enumerate(estimators):
_score = scoring(est, X[..., jj], y)
# Initialize array of predictions on the first score iteration
if (ii == 0) and (jj == 0):
dtype = type(_score)
score = np.zeros(score_shape, dtype)
score[ii, jj, ...] = _score
pb.update(jj * len(estimators) + ii + 1)
return score
def _fix_auc(scoring, y):
# This fixes sklearn's inability to compute roc_auc when y not in [0, 1]
# scikit-learn/scikit-learn#6874
if scoring is not None:
score_func = getattr(scoring, "_score_func", None)
kwargs = getattr(scoring, "_kwargs", {})
if (
getattr(score_func, "__name__", "") == "roc_auc_score"
and kwargs.get("multi_class", "raise") == "raise"
):
if np.ndim(y) != 1 or len(set(y)) != 2:
raise ValueError(
"roc_auc scoring can only be computed for two-class problems."
)
y = LabelEncoder().fit_transform(y)
return y
def _create_progressbar_context(inst, X, message):
"""Create a progress bar taking into account ``inst.verbose``."""
multiply = len(inst.estimators_) if isinstance(inst, GeneralizingEstimator) else 1
n_steps = X.shape[-1] * max(1, multiply)
mesg = f"{message} {inst.__class__.__name__}"
which_tqdm = "off" if not _check_verbose(inst.verbose) else None
context = ProgressBar(
n_steps, mesg=mesg, position=inst.position, which_tqdm=which_tqdm
)
return context
def _check_verbose(verbose):
"""Check if verbose is above or equal 'INFO' level."""
logging_level = _parse_verbose(verbose)
bool_verbose = logging_level <= logging.INFO
return bool_verbose
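A minimal usage sketch for the ``GeneralizingEstimator`` that these helpers serve; the synthetic data, classifier, and scorer below are illustrative choices, not taken from this module:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from mne.decoding import GeneralizingEstimator

rng = np.random.default_rng(0)
X = rng.standard_normal((40, 8, 15))  # (n_epochs, n_channels, n_times)
y = rng.integers(0, 2, 40)  # binary labels
clf = make_pipeline(StandardScaler(), LogisticRegression())
gen = GeneralizingEstimator(clf, scoring="roc_auc", n_jobs=1)
gen.fit(X, y)
scores = gen.score(X, y)  # temporal generalization matrix, (n_estimators, n_slices)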

419
mne/decoding/ssd.py Normal file
View File

@@ -0,0 +1,419 @@
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
import numpy as np
from scipy.linalg import eigh
from sklearn.base import BaseEstimator, TransformerMixin
from .._fiff.pick import _picks_to_idx
from ..cov import Covariance, _regularized_covariance
from ..defaults import _handle_default
from ..filter import filter_data
from ..rank import compute_rank
from ..time_frequency import psd_array_welch
from ..utils import (
_check_option,
_time_mask,
_validate_type,
_verbose_safe_false,
fill_doc,
logger,
)
@fill_doc
class SSD(TransformerMixin, BaseEstimator):
"""
Signal decomposition using the Spatio-Spectral Decomposition (SSD).
SSD seeks to maximize the power at a frequency band of interest while
simultaneously minimizing it at the flanking (surrounding) frequency bins
(considered noise). It extremizes the covariance matrices associated with
signal and noise :footcite:`NikulinEtAl2011`.
SSD can either be used as a dimensionality reduction method or a
denoised low rank factorization method :footcite:`HaufeEtAl2014b`.
Parameters
----------
%(info_not_none)s Must match the input data.
filt_params_signal : dict
Filtering for the frequencies of interest.
filt_params_noise : dict
Filtering for the frequencies of non-interest.
reg : float | str | None (default)
Which covariance estimator to use.
If not None (same as 'empirical'), allow regularization for covariance
estimation. If float, shrinkage is used (0 <= shrinkage <= 1). For str
options, reg will be passed to method :func:`mne.compute_covariance`.
n_components : int | None (default None)
The number of components to extract from the signal.
        If None, as many components as the rank of the data are returned
        (see ``rank``).
picks : array of int | None (default None)
The indices of good channels.
sort_by_spectral_ratio : bool (default True)
If set to True, the components are sorted according to the spectral
ratio.
See Eq. (24) in :footcite:`NikulinEtAl2011`.
return_filtered : bool (default False)
If return_filtered is True, data is bandpassed and projected onto the
SSD components.
n_fft : int (default None)
If sort_by_spectral_ratio is set to True, then the SSD sources will be
sorted according to their spectral ratio which is calculated based on
:func:`mne.time_frequency.psd_array_welch`. The n_fft parameter sets the
length of FFT used.
See :func:`mne.time_frequency.psd_array_welch` for more information.
cov_method_params : dict | None (default None)
As in :class:`mne.decoding.SPoC`
The default is None.
rank : None | dict | info | full
As in :class:`mne.decoding.SPoC`
This controls the rank computation that can be read from the
measurement info or estimated from the data, which determines the
maximum possible number of components.
See Notes of :func:`mne.compute_rank` for details.
We recommend to use 'full' when working with epoched data.
Attributes
----------
filters_ : array, shape (n_channels, n_components)
The spatial filters to be multiplied with the signal.
patterns_ : array, shape (n_components, n_channels)
The patterns for reconstructing the signal from the filtered data.
References
----------
.. footbibliography::
"""
def __init__(
self,
info,
filt_params_signal,
filt_params_noise,
reg=None,
n_components=None,
picks=None,
sort_by_spectral_ratio=True,
return_filtered=False,
n_fft=None,
cov_method_params=None,
rank=None,
):
"""Initialize instance."""
dicts = {"signal": filt_params_signal, "noise": filt_params_noise}
for param, dd in [("l", 0), ("h", 0), ("l", 1), ("h", 1)]:
key = ("signal", "noise")[dd]
if param + "_freq" not in dicts[key]:
raise ValueError(
f"{param + '_freq'} must be defined in filter parameters for {key}"
)
val = dicts[key][param + "_freq"]
if not isinstance(val, int | float):
_validate_type(val, ("numeric",), f"{key} {param}_freq")
# check freq bands
if (
filt_params_noise["l_freq"] > filt_params_signal["l_freq"]
or filt_params_signal["h_freq"] > filt_params_noise["h_freq"]
):
raise ValueError(
"Wrongly specified frequency bands!\n"
"The signal band-pass must be within the noise "
"band-pass!"
)
self.picks = picks
del picks
self.info = info
self.freqs_signal = (filt_params_signal["l_freq"], filt_params_signal["h_freq"])
self.freqs_noise = (filt_params_noise["l_freq"], filt_params_noise["h_freq"])
self.filt_params_signal = filt_params_signal
self.filt_params_noise = filt_params_noise
# check if boolean
        if not isinstance(sort_by_spectral_ratio, bool):
raise ValueError("sort_by_spectral_ratio must be boolean")
self.sort_by_spectral_ratio = sort_by_spectral_ratio
if n_fft is None:
self.n_fft = int(self.info["sfreq"])
else:
self.n_fft = int(n_fft)
# check if boolean
        if not isinstance(return_filtered, bool):
raise ValueError("return_filtered must be boolean")
self.return_filtered = return_filtered
self.reg = reg
self.n_components = n_components
self.rank = rank
self.cov_method_params = cov_method_params
def _check_X(self, X):
"""Check input data."""
_validate_type(X, np.ndarray, "X")
_check_option("X.ndim", X.ndim, (2, 3))
n_chan = X.shape[-2]
if n_chan != self.info["nchan"]:
raise ValueError(
"Info must match the input data."
f"Found {n_chan} channels but expected {self.info['nchan']}."
)
def fit(self, X, y=None):
"""Estimate the SSD decomposition on raw or epoched data.
Parameters
----------
X : array, shape ([n_epochs, ]n_channels, n_times)
The input data from which to estimate the SSD. Either 2D array
obtained from continuous data or 3D array obtained from epoched
data.
y : None
Ignored; exists for compatibility with scikit-learn pipelines.
Returns
-------
self : instance of SSD
Returns the modified instance.
"""
ch_types = self.info.get_channel_types(picks=self.picks, unique=True)
if len(ch_types) > 1:
raise ValueError(
"At this point SSD only supports fitting "
f"single channel types. Your info has {len(ch_types)} types."
)
self.picks_ = _picks_to_idx(self.info, self.picks, none="data", exclude="bads")
self._check_X(X)
X_aux = X[..., self.picks_, :]
X_signal = filter_data(X_aux, self.info["sfreq"], **self.filt_params_signal)
X_noise = filter_data(X_aux, self.info["sfreq"], **self.filt_params_noise)
X_noise -= X_signal
if X.ndim == 3:
X_signal = np.hstack(X_signal)
X_noise = np.hstack(X_noise)
# prevent rank change when computing cov with rank='full'
cov_signal = _regularized_covariance(
X_signal,
reg=self.reg,
method_params=self.cov_method_params,
rank="full",
info=self.info,
)
cov_noise = _regularized_covariance(
X_noise,
reg=self.reg,
method_params=self.cov_method_params,
rank="full",
info=self.info,
)
# project cov to rank subspace
cov_signal, cov_noise, rank_proj = _dimensionality_reduction(
cov_signal, cov_noise, self.info, self.rank
)
eigvals_, eigvects_ = eigh(cov_signal, cov_noise)
# sort in descending order
ix = np.argsort(eigvals_)[::-1]
self.eigvals_ = eigvals_[ix]
# project back to sensor space
self.filters_ = np.matmul(rank_proj, eigvects_[:, ix])
self.patterns_ = np.linalg.pinv(self.filters_)
# We assume that ordering by spectral ratio is more important
        # than the initial ordering. This ordering should also be learned
        # when fitting.
X_ssd = self.filters_.T @ X[..., self.picks_, :]
sorter_spec = Ellipsis
if self.sort_by_spectral_ratio:
_, sorter_spec = self.get_spectral_ratio(ssd_sources=X_ssd)
self.sorter_spec = sorter_spec
logger.info("Done.")
return self
def transform(self, X):
"""Estimate epochs sources given the SSD filters.
Parameters
----------
X : array, shape ([n_epochs, ]n_channels, n_times)
The input data from which to estimate the SSD. Either 2D array
obtained from continuous data or 3D array obtained from epoched
data.
Returns
-------
X_ssd : array, shape ([n_epochs, ]n_components, n_times)
The processed data.
"""
self._check_X(X)
if self.filters_ is None:
raise RuntimeError("No filters available. Please first call fit")
if self.return_filtered:
X_aux = X[..., self.picks_, :]
X = filter_data(X_aux, self.info["sfreq"], **self.filt_params_signal)
X_ssd = self.filters_.T @ X[..., self.picks_, :]
if X.ndim == 2:
X_ssd = X_ssd[self.sorter_spec][: self.n_components]
else:
X_ssd = X_ssd[:, self.sorter_spec, :][:, : self.n_components, :]
return X_ssd
def fit_transform(self, X, y=None, **fit_params):
"""Fit SSD to data, then transform it.
Fits transformer to ``X`` and ``y`` with optional parameters ``fit_params``, and
returns a transformed version of ``X``.
Parameters
----------
X : array, shape ([n_epochs, ]n_channels, n_times)
The input data from which to estimate the SSD. Either 2D array obtained from
continuous data or 3D array obtained from epoched data.
y : None
Ignored; exists for compatibility with scikit-learn pipelines.
**fit_params : dict
Additional fitting parameters passed to the :meth:`mne.decoding.SSD.fit`
method. Not used for this class.
Returns
-------
X_ssd : array, shape ([n_epochs, ]n_components, n_times)
The processed data.
"""
# use parent TransformerMixin method but with custom docstring
return super().fit_transform(X, y=y, **fit_params)
def get_spectral_ratio(self, ssd_sources):
"""Get the spectal signal-to-noise ratio for each spatial filter.
Spectral ratio measure for best n_components selection
See :footcite:`NikulinEtAl2011`, Eq. (24).
Parameters
----------
ssd_sources : array
Data projected to SSD space.
Returns
-------
spec_ratio : array, shape (n_channels)
            Array with the spectral ratio value for each component.
sorter_spec : array, shape (n_channels)
Array of indices for sorting spec_ratio.
References
----------
.. footbibliography::
"""
psd, freqs = psd_array_welch(
ssd_sources, sfreq=self.info["sfreq"], n_fft=self.n_fft
)
sig_idx = _time_mask(freqs, *self.freqs_signal)
noise_idx = _time_mask(freqs, *self.freqs_noise)
if psd.ndim == 3:
mean_sig = psd[:, :, sig_idx].mean(axis=2).mean(axis=0)
mean_noise = psd[:, :, noise_idx].mean(axis=2).mean(axis=0)
spec_ratio = mean_sig / mean_noise
else:
mean_sig = psd[:, sig_idx].mean(axis=1)
mean_noise = psd[:, noise_idx].mean(axis=1)
spec_ratio = mean_sig / mean_noise
sorter_spec = spec_ratio.argsort()[::-1]
return spec_ratio, sorter_spec
def inverse_transform(self):
"""Not implemented yet."""
raise NotImplementedError("inverse_transform is not yet available.")
def apply(self, X):
"""Remove selected components from the signal.
This procedure will reconstruct M/EEG signals from which the dynamics
        described by the excluded components are subtracted
(denoised by low-rank factorization).
See :footcite:`HaufeEtAl2014b` for more information.
.. note:: Unlike in other classes with an apply method,
only NumPy arrays are supported (not instances of MNE objects).
Parameters
----------
X : array, shape ([n_epochs, ]n_channels, n_times)
The input data from which to estimate the SSD. Either 2D array
obtained from continuous data or 3D array obtained from epoched
data.
Returns
-------
X : array, shape ([n_epochs, ]n_channels, n_times)
The processed data.
"""
X_ssd = self.transform(X)
pick_patterns = self.patterns_[self.sorter_spec][: self.n_components].T
X = pick_patterns @ X_ssd
return X
def _dimensionality_reduction(cov_signal, cov_noise, info, rank):
"""Perform dimensionality reduction on the covariance matrices."""
n_channels = cov_signal.shape[0]
# find ranks of covariance matrices
rank_signal = list(
compute_rank(
Covariance(
cov_signal,
info.ch_names,
list(),
list(),
0,
verbose=_verbose_safe_false(),
),
rank,
_handle_default("scalings_cov_rank", None),
info,
).values()
)[0]
rank_noise = list(
compute_rank(
Covariance(
cov_noise,
info.ch_names,
list(),
list(),
0,
verbose=_verbose_safe_false(),
),
rank,
_handle_default("scalings_cov_rank", None),
info,
).values()
)[0]
rank = np.min([rank_signal, rank_noise]) # should be identical
if rank < n_channels:
eigvals, eigvects = eigh(cov_signal)
# sort in descending order
ix = np.argsort(eigvals)[::-1]
eigvals = eigvals[ix]
eigvects = eigvects[:, ix]
# compute rank subspace projection matrix
rank_proj = np.matmul(
eigvects[:, :rank], np.eye(rank) * (eigvals[:rank] ** -0.5)
)
logger.info(
"Projecting covariance of %i channels to %i rank subspace",
n_channels,
rank,
)
else:
rank_proj = np.eye(n_channels)
logger.info("Preserving covariance rank (%i)", rank)
# project covariance matrices to rank subspace
cov_signal = np.matmul(rank_proj.T, np.matmul(cov_signal, rank_proj))
cov_noise = np.matmul(rank_proj.T, np.matmul(cov_noise, rank_proj))
return cov_signal, cov_noise, rank_proj
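A hedged usage sketch for SSD on synthetic continuous data; the Info object, filter parameters, and component count are illustrative choices only:

import numpy as np
import mne
from mne.decoding import SSD

sfreq = 250.0
info = mne.create_info(8, sfreq, "eeg")
rng = np.random.default_rng(0)
times = np.arange(10 * int(sfreq)) / sfreq
X = rng.standard_normal((8, times.size)) + np.sin(2 * np.pi * 10 * times)  # 10 Hz "signal"
ssd = SSD(
    info,
    filt_params_signal=dict(l_freq=9, h_freq=11, l_trans_bandwidth=1, h_trans_bandwidth=1),
    filt_params_noise=dict(l_freq=8, h_freq=12, l_trans_bandwidth=1, h_trans_bandwidth=1),
    n_components=2,
)
X_ssd = ssd.fit_transform(X)  # (n_components, n_times)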

395
mne/decoding/time_delaying_ridge.py Normal file
View File

@@ -0,0 +1,395 @@
"""TimeDelayingRidge class."""
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
import numpy as np
from scipy import linalg
from scipy.signal import fftconvolve
from scipy.sparse.csgraph import laplacian
from sklearn.base import BaseEstimator, RegressorMixin
from ..cuda import _setup_cuda_fft_multiply_repeated
from ..filter import next_fast_len
from ..fixes import jit
from ..utils import ProgressBar, _check_option, _validate_type, logger, warn
def _compute_corrs(
X, y, smin, smax, n_jobs=None, fit_intercept=False, edge_correction=True
):
"""Compute auto- and cross-correlations."""
if fit_intercept:
# We could do this in the Fourier domain, too, but it should
# be a bit cleaner numerically to do it here.
X_offset = np.mean(X, axis=0)
y_offset = np.mean(y, axis=0)
if X.ndim == 3:
X_offset = X_offset.mean(axis=0)
y_offset = np.mean(y_offset, axis=0)
X = X - X_offset
y = y - y_offset
else:
X_offset = y_offset = 0.0
if X.ndim == 2:
assert y.ndim == 2
X = X[:, np.newaxis, :]
y = y[:, np.newaxis, :]
assert X.shape[:2] == y.shape[:2]
len_trf = smax - smin
len_x, n_epochs, n_ch_x = X.shape
len_y, n_epochs_y, n_ch_y = y.shape
assert len_x == len_y
assert n_epochs == n_epochs_y
n_fft = next_fast_len(2 * X.shape[0] - 1)
_, cuda_dict = _setup_cuda_fft_multiply_repeated(
n_jobs, [1.0], n_fft, "correlation calculations"
)
del n_jobs # only used to set as CUDA
# create our Toeplitz indexer
ij = np.empty((len_trf, len_trf), int)
for ii in range(len_trf):
ij[ii, ii:] = np.arange(len_trf - ii)
x = np.arange(n_fft - 1, n_fft - len_trf + ii, -1)
ij[ii + 1 :, ii] = x
x_xt = np.zeros([n_ch_x * len_trf] * 2)
x_y = np.zeros((len_trf, n_ch_x, n_ch_y), order="F")
n = n_epochs * (n_ch_x * (n_ch_x + 1) // 2 + n_ch_x)
logger.info(f"Fitting {n_epochs} epochs, {n_ch_x} channels")
pb = ProgressBar(n, mesg="Sample")
count = 0
pb.update(count)
for ei in range(n_epochs):
this_X = X[:, ei, :]
# XXX maybe this is what we should parallelize over CPUs at some point
X_fft = cuda_dict["rfft"](this_X, n=n_fft, axis=0)
X_fft_conj = X_fft.conj()
y_fft = cuda_dict["rfft"](y[:, ei, :], n=n_fft, axis=0)
for ch0 in range(n_ch_x):
for oi, ch1 in enumerate(range(ch0, n_ch_x)):
this_result = cuda_dict["irfft"](
X_fft[:, ch0] * X_fft_conj[:, ch1], n=n_fft, axis=0
)
# Our autocorrelation structure is a Toeplitz matrix, but
# it's faster to create the Toeplitz ourselves than use
# linalg.toeplitz.
this_result = this_result[ij]
# However, we need to adjust for coeffs that are cut off,
# i.e. the non-zero delays should not have the same AC value
# as the zero-delay ones (because they actually have fewer
# coefficients).
#
# These adjustments also follow a Toeplitz structure, so we
# construct a matrix of what has been left off, compute their
# inner products, and remove them.
if edge_correction:
_edge_correct(this_result, this_X, smax, smin, ch0, ch1)
# Store the results in our output matrix
x_xt[
ch0 * len_trf : (ch0 + 1) * len_trf,
ch1 * len_trf : (ch1 + 1) * len_trf,
] += this_result
if ch0 != ch1:
x_xt[
ch1 * len_trf : (ch1 + 1) * len_trf,
ch0 * len_trf : (ch0 + 1) * len_trf,
] += this_result.T
count += 1
pb.update(count)
# compute the crosscorrelations
cc_temp = cuda_dict["irfft"](
y_fft * X_fft_conj[:, slice(ch0, ch0 + 1)], n=n_fft, axis=0
)
if smin < 0 and smax >= 0:
x_y[:-smin, ch0] += cc_temp[smin:]
x_y[len_trf - smax :, ch0] += cc_temp[:smax]
else:
x_y[:, ch0] += cc_temp[smin:smax]
count += 1
pb.update(count)
x_y = np.reshape(x_y, (n_ch_x * len_trf, n_ch_y), order="F")
return x_xt, x_y, n_ch_x, X_offset, y_offset
@jit()
def _edge_correct(this_result, this_X, smax, smin, ch0, ch1):
if smax > 0:
tail = _toeplitz_dot(this_X[-1:-smax:-1, ch0], this_X[-1:-smax:-1, ch1])
if smin > 0:
tail = tail[smin - 1 :, smin - 1 :]
this_result[max(-smin + 1, 0) :, max(-smin + 1, 0) :] -= tail
if smin < 0:
head = _toeplitz_dot(this_X[:-smin, ch0], this_X[:-smin, ch1])[::-1, ::-1]
if smax < 0:
head = head[:smax, :smax]
this_result[:-smin, :-smin] -= head
@jit()
def _toeplitz_dot(a, b):
"""Create upper triangular Toeplitz matrices & compute the dot product."""
# This is equivalent to:
# a = linalg.toeplitz(a)
# b = linalg.toeplitz(b)
# a[np.triu_indices(len(a), 1)] = 0
# b[np.triu_indices(len(a), 1)] = 0
# out = np.dot(a.T, b)
assert a.shape == b.shape and a.ndim == 1
out = np.outer(a, b)
for ii in range(1, len(a)):
out[ii, ii:] += out[ii - 1, ii - 1 : -1]
out[ii + 1 :, ii] += out[ii:-1, ii - 1]
return out
def _compute_reg_neighbors(n_ch_x, n_delays, reg_type, method="direct", normed=False):
"""Compute regularization parameter from neighbors."""
known_types = ("ridge", "laplacian")
if isinstance(reg_type, str):
reg_type = (reg_type,) * 2
if len(reg_type) != 2:
raise ValueError(f"reg_type must have two elements, got {len(reg_type)}")
for r in reg_type:
if r not in known_types:
raise ValueError(f"reg_type entries must be one of {known_types}, got {r}")
reg_time = reg_type[0] == "laplacian" and n_delays > 1
reg_chs = reg_type[1] == "laplacian" and n_ch_x > 1
if not reg_time and not reg_chs:
return np.eye(n_ch_x * n_delays)
# regularize time
if reg_time:
reg = np.eye(n_delays)
stride = n_delays + 1
reg.flat[1::stride] += -1
reg.flat[n_delays::stride] += -1
reg.flat[n_delays + 1 : -n_delays - 1 : stride] += 1
args = [reg] * n_ch_x
reg = linalg.block_diag(*args)
else:
reg = np.zeros((n_delays * n_ch_x,) * 2)
# regularize features
if reg_chs:
block = n_delays * n_delays
row_offset = block * n_ch_x
stride = n_delays * n_ch_x + 1
reg.flat[n_delays:-row_offset:stride] += -1
reg.flat[n_delays + row_offset :: stride] += 1
reg.flat[row_offset:-n_delays:stride] += -1
reg.flat[: -(n_delays + row_offset) : stride] += 1
assert np.array_equal(reg[::-1, ::-1], reg)
if method == "direct":
if normed:
norm = np.sqrt(np.diag(reg))
reg /= norm
reg /= norm[:, np.newaxis]
return reg
else:
# Use csgraph. Note that our -1's above are really the neighbors!
# If we ever want to allow arbitrary adjacency matrices, this is how
# we'd want to do it.
reg = laplacian(-reg, normed=normed)
return reg
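To make the regularization structure concrete, here is a hedged illustration of the Laplacian penalty returned for a single feature with three delays (output shown up to dtype and formatting):

reg = _compute_reg_neighbors(n_ch_x=1, n_delays=3, reg_type="laplacian")
print(reg)
# [[ 1. -1.  0.]
#  [-1.  2. -1.]
#  [ 0. -1.  1.]]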
def _fit_corrs(x_xt, x_y, n_ch_x, reg_type, alpha, n_ch_in):
"""Fit the model using correlation matrices."""
# do the regularized solving
n_ch_out = x_y.shape[1]
assert x_y.shape[0] % n_ch_x == 0
n_delays = x_y.shape[0] // n_ch_x
reg = _compute_reg_neighbors(n_ch_x, n_delays, reg_type)
mat = x_xt + alpha * reg
# From sklearn
try:
# Note: we must use overwrite_a=False in order to be able to
# use the fall-back solution below in case a LinAlgError
# is raised
w = linalg.solve(mat, x_y, overwrite_a=False, assume_a="pos")
except np.linalg.LinAlgError:
warn(
"Singular matrix in solving dual problem. Using "
"least-squares solution instead."
)
w = linalg.lstsq(mat, x_y, lapack_driver="gelsy")[0]
w = w.T.reshape([n_ch_out, n_ch_in, n_delays])
return w
class TimeDelayingRidge(RegressorMixin, BaseEstimator):
"""Ridge regression of data with time delays.
Parameters
----------
tmin : int | float
The starting lag, in seconds (or samples if ``sfreq`` == 1).
Negative values correspond to times in the past.
tmax : int | float
The ending lag, in seconds (or samples if ``sfreq`` == 1).
Positive values correspond to times in the future.
Must be >= tmin.
sfreq : float
The sampling frequency used to convert times into samples.
alpha : float
The ridge (or laplacian) regularization factor.
reg_type : str | list
Can be ``"ridge"`` (default) or ``"laplacian"``.
Can also be a 2-element list specifying how to regularize in time
and across adjacent features.
fit_intercept : bool
If True (default), the sample mean is removed before fitting.
n_jobs : int | str
The number of jobs to use. Can be an int (default 1) or ``'cuda'``.
.. versionadded:: 0.18
edge_correction : bool
If True (default), correct the autocorrelation coefficients for
non-zero delays for the fact that fewer samples are available.
Disabling this speeds up performance at the cost of accuracy
depending on the relationship between epoch length and model
duration. Only used if ``estimator`` is float or None.
.. versionadded:: 0.18
See Also
--------
mne.decoding.ReceptiveField
Notes
-----
This class is meant to be used with :class:`mne.decoding.ReceptiveField`
by only implicitly doing the time delaying. For reasonable receptive
field and input signal sizes, it should be more CPU and memory
efficient by using frequency-domain methods (FFTs) to compute the
auto- and cross-correlations.
"""
_estimator_type = "regressor"
def __init__(
self,
tmin,
tmax,
sfreq,
alpha=0.0,
reg_type="ridge",
fit_intercept=True,
n_jobs=None,
edge_correction=True,
):
self.tmin = tmin
self.tmax = tmax
self.sfreq = sfreq
self.alpha = alpha
self.reg_type = reg_type
self.fit_intercept = fit_intercept
self.edge_correction = edge_correction
self.n_jobs = n_jobs
@property
def _smin(self):
return int(round(self.tmin_ * self.sfreq_))
@property
def _smax(self):
return int(round(self.tmax_ * self.sfreq_)) + 1
def fit(self, X, y):
"""Estimate the coefficients of the linear model.
Parameters
----------
X : array, shape (n_samples[, n_epochs], n_features)
The training input samples to estimate the linear coefficients.
y : array, shape (n_samples[, n_epochs], n_outputs)
The target values.
Returns
-------
self : instance of TimeDelayingRidge
Returns the modified instance.
"""
_validate_type(X, "array-like", "X")
_validate_type(y, "array-like", "y")
self.tmin_ = float(self.tmin)
self.tmax_ = float(self.tmax)
self.sfreq_ = float(self.sfreq)
self.alpha_ = float(self.alpha)
if self.tmin_ > self.tmax_:
raise ValueError(f"tmin must be <= tmax, got {self.tmin_} and {self.tmax_}")
X = np.asarray(X, dtype=float)
y = np.asarray(y, dtype=float)
if X.ndim == 3:
assert y.ndim == 3
assert X.shape[:2] == y.shape[:2]
else:
if X.ndim == 1:
X = X[:, np.newaxis]
if y.ndim == 1:
y = y[:, np.newaxis]
assert X.ndim == 2
assert y.ndim == 2
_check_option("y.shape[0]", y.shape[0], (X.shape[0],))
# These are split into two functions because it's possible that we
# might want to allow people to do them separately (e.g., to test
# different regularization parameters).
self.cov_, x_y_, n_ch_x, X_offset, y_offset = _compute_corrs(
X,
y,
self._smin,
self._smax,
self.n_jobs,
self.fit_intercept,
self.edge_correction,
)
self.coef_ = _fit_corrs(
self.cov_, x_y_, n_ch_x, self.reg_type, self.alpha_, n_ch_x
)
# This is the sklearn formula from LinearModel (will be 0. for no fit)
if self.fit_intercept:
self.intercept_ = y_offset - np.dot(X_offset, self.coef_.sum(-1).T)
else:
self.intercept_ = 0.0
return self
def predict(self, X):
"""Predict the output.
Parameters
----------
X : array, shape (n_samples[, n_epochs], n_features)
The data.
Returns
-------
X : ndarray
The predicted response.
"""
if X.ndim == 2:
X = X[:, np.newaxis, :]
singleton = True
else:
singleton = False
out = np.zeros(X.shape[:2] + (self.coef_.shape[0],))
smin = self._smin
offset = max(smin, 0)
for ei in range(X.shape[1]):
for oi in range(self.coef_.shape[0]):
for fi in range(self.coef_.shape[1]):
temp = fftconvolve(X[:, ei, fi], self.coef_[oi, fi])
temp = temp[max(-smin, 0) :][: len(out) - offset]
out[offset : len(temp) + offset, ei, oi] += temp
out += self.intercept_
if singleton:
out = out[:, 0, :]
return out
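A hedged end-to-end sketch: fit a time-delayed ridge model on a toy stimulus/response pair (sampling rate, lags, and regularization strength are illustrative):

import numpy as np
from mne.decoding import TimeDelayingRidge

rng = np.random.default_rng(0)
stim = rng.standard_normal((1000, 1))  # (n_samples, n_features)
resp = np.convolve(stim[:, 0], np.hanning(11), mode="same")[:, np.newaxis]
tdr = TimeDelayingRidge(tmin=-0.05, tmax=0.05, sfreq=100.0, alpha=1.0)
tdr.fit(stim, resp)
pred = tdr.predict(stim)  # same shape as resp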

168
mne/decoding/time_frequency.py Normal file
View File

@@ -0,0 +1,168 @@
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from ..time_frequency.tfr import _compute_tfr
from ..utils import _check_option, fill_doc, verbose
@fill_doc
class TimeFrequency(TransformerMixin, BaseEstimator):
"""Time frequency transformer.
    Time-frequency transform of time series along the last axis.
Parameters
----------
freqs : array-like of float, shape (n_freqs,)
The frequencies.
sfreq : float | int, default 1.0
Sampling frequency of the data.
method : 'multitaper' | 'morlet', default 'morlet'
The time-frequency method. 'morlet' convolves a Morlet wavelet.
'multitaper' uses Morlet wavelets windowed with multiple DPSS
multitapers.
n_cycles : float | array of float, default 7.0
Number of cycles in the Morlet wavelet. Fixed number
or one per frequency.
time_bandwidth : float, default None
If None and method=multitaper, will be set to 4.0 (3 tapers).
Time x (Full) Bandwidth product. Only applies if
method == 'multitaper'. The number of good tapers (low-bias) is
chosen automatically based on this to equal floor(time_bandwidth - 1).
use_fft : bool, default True
Use the FFT for convolutions or not.
decim : int | slice, default 1
To reduce memory usage, decimation factor after time-frequency
decomposition.
If `int`, returns tfr[..., ::decim].
If `slice`, returns tfr[..., decim].
.. note:: Decimation may create aliasing artifacts, yet decimation
is done after the convolutions.
output : str, default 'complex'
* 'complex' : single trial complex.
* 'power' : single trial power.
* 'phase' : single trial phase.
%(n_jobs)s
The number of epochs to process at the same time. The parallelization
is implemented across channels.
%(verbose)s
See Also
--------
mne.time_frequency.tfr_morlet
mne.time_frequency.tfr_multitaper
"""
@verbose
def __init__(
self,
freqs,
sfreq=1.0,
method="morlet",
n_cycles=7.0,
time_bandwidth=None,
use_fft=True,
decim=1,
output="complex",
n_jobs=1,
verbose=None,
):
"""Init TimeFrequency transformer."""
# Check non-average output
output = _check_option("output", output, ["complex", "power", "phase"])
self.freqs = freqs
self.sfreq = sfreq
self.method = method
self.n_cycles = n_cycles
self.time_bandwidth = time_bandwidth
self.use_fft = use_fft
self.decim = decim
# Check that output is not an average metric (e.g. ITC)
self.output = output
self.n_jobs = n_jobs
self.verbose = verbose
def fit_transform(self, X, y=None):
"""Time-frequency transform of times series along the last axis.
Parameters
----------
X : array, shape (n_samples, n_channels, n_times)
The training data samples. The channel dimension can be zero- or
1-dimensional.
y : None
For scikit-learn compatibility purposes.
Returns
-------
Xt : array, shape (n_samples, n_channels, n_freqs, n_times)
The time-frequency transform of the data, where n_channels can be
zero- or 1-dimensional.
"""
return self.fit(X, y).transform(X)
def fit(self, X, y=None): # noqa: D401
"""Do nothing (for scikit-learn compatibility purposes).
Parameters
----------
X : array, shape (n_samples, n_channels, n_times)
The training data.
y : array | None
The target values.
Returns
-------
self : object
Return self.
"""
return self
def transform(self, X):
"""Time-frequency transform of times series along the last axis.
Parameters
----------
X : array, shape (n_samples, n_channels, n_times)
The training data samples. The channel dimension can be zero- or
1-dimensional.
Returns
-------
Xt : array, shape (n_samples, n_channels, n_freqs, n_times)
The time-frequency transform of the data, where n_channels can be
zero- or 1-dimensional.
"""
# Ensure 3-dimensional X
shape = X.shape[1:-1]
if not shape:
X = X[:, np.newaxis, :]
# Compute time-frequency
Xt = _compute_tfr(
X,
freqs=self.freqs,
sfreq=self.sfreq,
method=self.method,
n_cycles=self.n_cycles,
zero_mean=True,
time_bandwidth=self.time_bandwidth,
use_fft=self.use_fft,
decim=self.decim,
output=self.output,
n_jobs=self.n_jobs,
verbose=self.verbose,
)
# Back to original shape
if not shape:
Xt = Xt[:, 0, :]
return Xt
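A hedged usage sketch: Morlet power for a small batch of epochs (shapes, frequencies, and cycle count are illustrative):

import numpy as np
from mne.decoding import TimeFrequency

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 3, 200))  # (n_epochs, n_channels, n_times)
tf = TimeFrequency(freqs=np.array([8.0, 10.0, 12.0]), sfreq=100.0, n_cycles=3, output="power")
Xt = tf.fit_transform(X)  # (5, 3, 3, 200) -> (n_epochs, n_channels, n_freqs, n_times)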

920
mne/decoding/transformer.py Normal file
View File

@@ -0,0 +1,920 @@
# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from .._fiff.pick import (
_pick_data_channels,
_picks_by_type,
_picks_to_idx,
pick_info,
pick_types,
)
from ..cov import _check_scalings_user
from ..filter import filter_data
from ..time_frequency import psd_array_multitaper
from ..utils import _check_option, _validate_type, fill_doc, verbose
class _ConstantScaler:
"""Scale channel types using constant values."""
def __init__(self, info, scalings, do_scaling=True):
self._scalings = scalings
self._info = info
self._do_scaling = do_scaling
def fit(self, X, y=None):
scalings = _check_scalings_user(self._scalings)
picks_by_type = _picks_by_type(
pick_info(self._info, _pick_data_channels(self._info, exclude=()))
)
std = np.ones(sum(len(p[1]) for p in picks_by_type))
if X.shape[1] != len(std):
raise ValueError(
f"info had {len(std)} data channels but X has {len(X)} channels"
)
if self._do_scaling: # this is silly, but necessary for completeness
for kind, picks in picks_by_type:
std[picks] = 1.0 / scalings[kind]
self.std_ = std
self.mean_ = np.zeros_like(std)
return self
def transform(self, X):
return X / self.std_
def inverse_transform(self, X, y=None):
return X * self.std_
def fit_transform(self, X, y=None):
return self.fit(X, y).transform(X)
def _sklearn_reshape_apply(func, return_result, X, *args, **kwargs):
"""Reshape epochs and apply function."""
if not isinstance(X, np.ndarray):
raise ValueError(f"data should be an np.ndarray, got {type(X)}.")
orig_shape = X.shape
X = np.reshape(X.transpose(0, 2, 1), (-1, orig_shape[1]))
X = func(X, *args, **kwargs)
if return_result:
X.shape = (orig_shape[0], orig_shape[2], orig_shape[1])
X = X.transpose(0, 2, 1)
return X
@fill_doc
class Scaler(TransformerMixin, BaseEstimator):
"""Standardize channel data.
This class scales data for each channel. It differs from scikit-learn
classes (e.g., :class:`sklearn.preprocessing.StandardScaler`) in that
it scales each *channel* by estimating μ and σ using data from all
time points and epochs, as opposed to standardizing each *feature*
    (i.e., each time point for each channel) by estimating μ and σ
using data from all epochs.
Parameters
----------
%(info)s Only necessary if ``scalings`` is a dict or None.
scalings : dict, str, default None
Scaling method to be applied to data channel wise.
* if scalings is None (default), scales mag by 1e15, grad by 1e13,
and eeg by 1e6.
* if scalings is :class:`dict`, keys are channel types and values
are scale factors.
* if ``scalings=='median'``,
:class:`sklearn.preprocessing.RobustScaler`
is used (requires sklearn version 0.17+).
* if ``scalings=='mean'``,
:class:`sklearn.preprocessing.StandardScaler`
is used.
with_mean : bool, default True
If True, center the data using mean (or median) before scaling.
Ignored for channel-type scaling.
with_std : bool, default True
If True, scale the data to unit variance (``scalings='mean'``),
        quantile range (``scalings='median'``), or using channel type
        if ``scalings`` is a dict or None.
"""
def __init__(self, info=None, scalings=None, with_mean=True, with_std=True):
self.info = info
self.with_mean = with_mean
self.with_std = with_std
self.scalings = scalings
if not (scalings is None or isinstance(scalings, dict | str)):
raise ValueError(
f"scalings type should be dict, str, or None, got {type(scalings)}"
)
if isinstance(scalings, str):
_check_option("scalings", scalings, ["mean", "median"])
if scalings is None or isinstance(scalings, dict):
if info is None:
raise ValueError(
f'Need to specify "info" if scalings is {type(scalings)}'
)
self._scaler = _ConstantScaler(info, scalings, self.with_std)
elif scalings == "mean":
from sklearn.preprocessing import StandardScaler
self._scaler = StandardScaler(
with_mean=self.with_mean, with_std=self.with_std
)
else: # scalings == 'median':
from sklearn.preprocessing import RobustScaler
self._scaler = RobustScaler(
with_centering=self.with_mean, with_scaling=self.with_std
)
def fit(self, epochs_data, y=None):
"""Standardize data across channels.
Parameters
----------
epochs_data : array, shape (n_epochs, n_channels, n_times)
The data to concatenate channels.
y : array, shape (n_epochs,)
The label for each epoch.
Returns
-------
self : instance of Scaler
The modified instance.
"""
_validate_type(epochs_data, np.ndarray, "epochs_data")
if epochs_data.ndim == 2:
epochs_data = epochs_data[..., np.newaxis]
assert epochs_data.ndim == 3, epochs_data.shape
_sklearn_reshape_apply(self._scaler.fit, False, epochs_data, y=y)
return self
def transform(self, epochs_data):
"""Standardize data across channels.
Parameters
----------
epochs_data : array, shape (n_epochs, n_channels[, n_times])
The data.
Returns
-------
X : array, shape (n_epochs, n_channels, n_times)
The data concatenated over channels.
Notes
-----
This function makes a copy of the data before the operations and the
memory usage may be large with big data.
"""
_validate_type(epochs_data, np.ndarray, "epochs_data")
if epochs_data.ndim == 2: # can happen with SlidingEstimator
if self.info is not None:
assert len(self.info["ch_names"]) == epochs_data.shape[1]
epochs_data = epochs_data[..., np.newaxis]
assert epochs_data.ndim == 3, epochs_data.shape
return _sklearn_reshape_apply(self._scaler.transform, True, epochs_data)
def fit_transform(self, epochs_data, y=None):
"""Fit to data, then transform it.
Fits transformer to epochs_data and y and returns a transformed version
of epochs_data.
Parameters
----------
epochs_data : array, shape (n_epochs, n_channels, n_times)
The data.
y : None | array, shape (n_epochs,)
The label for each epoch.
Defaults to None.
Returns
-------
X : array, shape (n_epochs, n_channels, n_times)
The data concatenated over channels.
Notes
-----
This function makes a copy of the data before the operations and the
memory usage may be large with big data.
"""
return self.fit(epochs_data, y).transform(epochs_data)
def inverse_transform(self, epochs_data):
"""Invert standardization of data across channels.
Parameters
----------
epochs_data : array, shape ([n_epochs, ]n_channels, n_times)
The data.
Returns
-------
X : array, shape (n_epochs, n_channels, n_times)
The data concatenated over channels.
Notes
-----
This function makes a copy of the data before the operations and the
memory usage may be large with big data.
"""
squeeze = False
# Can happen with CSP
if epochs_data.ndim == 2:
squeeze = True
epochs_data = epochs_data[..., np.newaxis]
assert epochs_data.ndim == 3, epochs_data.shape
out = _sklearn_reshape_apply(self._scaler.inverse_transform, True, epochs_data)
if squeeze:
out = out[..., 0]
return out
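A hedged usage sketch: channel-wise standardization with ``scalings="mean"`` (no Info needed for that option); the data below are synthetic:

import numpy as np
from mne.decoding import Scaler

rng = np.random.default_rng(0)
epochs_data = rng.standard_normal((10, 4, 50)) * 1e-6  # (n_epochs, n_channels, n_times)
scaler = Scaler(scalings="mean")
X = scaler.fit_transform(epochs_data)  # each channel ~ zero mean, unit variance
back = scaler.inverse_transform(X)  # back to the original scale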
class Vectorizer(TransformerMixin):
"""Transform n-dimensional array into 2D array of n_samples by n_features.
This class reshapes an n-dimensional array into an n_samples * n_features
array, usable by the estimators and transformers of scikit-learn.
Attributes
----------
features_shape_ : tuple
Stores the original shape of data.
Examples
--------
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.pipeline import make_pipeline
>>> from sklearn.preprocessing import StandardScaler
>>> clf = make_pipeline(Vectorizer(), StandardScaler(), LogisticRegression())
"""
def fit(self, X, y=None):
"""Store the shape of the features of X.
Parameters
----------
X : array-like
The data to fit. Can be, for example a list, or an array of at
least 2d. The first dimension must be of length n_samples, where
samples are the independent samples used by the estimator
(e.g. n_epochs for epoched data).
y : None | array, shape (n_samples,)
Used for scikit-learn compatibility.
Returns
-------
self : instance of Vectorizer
Return the modified instance.
"""
X = np.asarray(X)
self.features_shape_ = X.shape[1:]
return self
def transform(self, X):
"""Convert given array into two dimensions.
Parameters
----------
X : array-like
The data to fit. Can be, for example a list, or an array of at
least 2d. The first dimension must be of length n_samples, where
samples are the independent samples used by the estimator
(e.g. n_epochs for epoched data).
Returns
-------
X : array, shape (n_samples, n_features)
The transformed data.
"""
X = np.asarray(X)
if X.shape[1:] != self.features_shape_:
raise ValueError("Shape of X used in fit and transform must be same")
return X.reshape(len(X), -1)
def fit_transform(self, X, y=None):
"""Fit the data, then transform in one step.
Parameters
----------
X : array-like
The data to fit. Can be, for example a list, or an array of at
least 2d. The first dimension must be of length n_samples, where
samples are the independent samples used by the estimator
(e.g. n_epochs for epoched data).
y : None | array, shape (n_samples,)
Used for scikit-learn compatibility.
Returns
-------
X : array, shape (n_samples, -1)
The transformed data.
"""
return self.fit(X).transform(X)
def inverse_transform(self, X):
"""Transform 2D data back to its original feature shape.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Data to be transformed back to original shape.
Returns
-------
X : array
The data transformed into shape as used in fit. The first
dimension is of length n_samples.
"""
X = np.asarray(X)
if X.ndim not in (2, 3):
raise ValueError(
f"X should be of 2 or 3 dimensions but has shape {X.shape}"
)
return X.reshape(X.shape[:-1] + self.features_shape_)
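A small hedged illustration of the shape round-trip performed by Vectorizer on a synthetic 4D array:

import numpy as np
from mne.decoding import Vectorizer

X = np.zeros((6, 4, 10, 2))  # (n_samples, ...feature dims)
vec = Vectorizer()
X2d = vec.fit_transform(X)  # (6, 80)
X_back = vec.inverse_transform(X2d)  # (6, 4, 10, 2) again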
@fill_doc
class PSDEstimator(TransformerMixin):
"""Compute power spectral density (PSD) using a multi-taper method.
Parameters
----------
sfreq : float
The sampling frequency.
fmin : float
The lower frequency of interest.
fmax : float
The upper frequency of interest.
bandwidth : float
The bandwidth of the multi taper windowing function in Hz.
adaptive : bool
Use adaptive weights to combine the tapered spectra into PSD
(slow, use n_jobs >> 1 to speed up computation).
low_bias : bool
Only use tapers with more than 90%% spectral concentration within
bandwidth.
n_jobs : int
Number of parallel jobs to use (only used if adaptive=True).
%(normalization)s
%(verbose)s
See Also
--------
mne.time_frequency.psd_array_multitaper
mne.io.Raw.compute_psd
mne.Epochs.compute_psd
mne.Evoked.compute_psd
"""
@verbose
def __init__(
self,
sfreq=2 * np.pi,
fmin=0,
fmax=np.inf,
bandwidth=None,
adaptive=False,
low_bias=True,
n_jobs=None,
normalization="length",
*,
verbose=None,
):
self.sfreq = sfreq
self.fmin = fmin
self.fmax = fmax
self.bandwidth = bandwidth
self.adaptive = adaptive
self.low_bias = low_bias
self.n_jobs = n_jobs
self.normalization = normalization
def fit(self, epochs_data, y):
"""Compute power spectral density (PSD) using a multi-taper method.
Parameters
----------
epochs_data : array, shape (n_epochs, n_channels, n_times)
The data.
y : array, shape (n_epochs,)
The label for each epoch.
Returns
-------
self : instance of PSDEstimator
The modified instance.
"""
if not isinstance(epochs_data, np.ndarray):
raise ValueError(
f"epochs_data should be of type ndarray (got {type(epochs_data)})."
)
return self
def transform(self, epochs_data):
"""Compute power spectral density (PSD) using a multi-taper method.
Parameters
----------
epochs_data : array, shape (n_epochs, n_channels, n_times)
The data.
Returns
-------
psd : array, shape (n_signals, n_freqs) or (n_freqs,)
The computed PSD.
"""
if not isinstance(epochs_data, np.ndarray):
raise ValueError(
f"epochs_data should be of type ndarray (got {type(epochs_data)})."
)
psd, _ = psd_array_multitaper(
epochs_data,
sfreq=self.sfreq,
fmin=self.fmin,
fmax=self.fmax,
bandwidth=self.bandwidth,
adaptive=self.adaptive,
low_bias=self.low_bias,
normalization=self.normalization,
n_jobs=self.n_jobs,
)
return psd
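A hedged usage sketch: multitaper PSD features from synthetic epoched data (sampling rate and band edges are illustrative):

import numpy as np
from mne.decoding import PSDEstimator

rng = np.random.default_rng(0)
epochs_data = rng.standard_normal((10, 4, 256))  # (n_epochs, n_channels, n_times)
psd = PSDEstimator(sfreq=128.0, fmin=1.0, fmax=40.0)
features = psd.transform(epochs_data)  # (n_epochs, n_channels, n_freqs)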
@fill_doc
class FilterEstimator(TransformerMixin):
"""Estimator to filter RtEpochs.
Applies a zero-phase low-pass, high-pass, band-pass, or band-stop
filter to the channels selected by "picks".
l_freq and h_freq are the frequencies below which and above which,
respectively, to filter out of the data. Thus the uses are:
- l_freq < h_freq: band-pass filter
- l_freq > h_freq: band-stop filter
- l_freq is not None, h_freq is None: low-pass filter
- l_freq is None, h_freq is not None: high-pass filter
If n_jobs > 1, more memory is required as "len(picks) * n_times"
additional time points need to be temporarily stored in memory.
Parameters
----------
%(info_not_none)s
%(l_freq)s
%(h_freq)s
%(picks_good_data)s
%(filter_length)s
%(l_trans_bandwidth)s
%(h_trans_bandwidth)s
n_jobs : int | str
Number of jobs to run in parallel.
Can be 'cuda' if ``cupy`` is installed properly and method='fir'.
method : str
'fir' will use overlap-add FIR filtering, 'iir' will use IIR filtering.
iir_params : dict | None
Dictionary of parameters to use for IIR filtering.
See mne.filter.construct_iir_filter for details. If iir_params
is None and method="iir", 4th order Butterworth will be used.
%(fir_design)s
%(verbose)s
See Also
--------
TemporalFilter
Notes
-----
This is primarily meant for use in realtime applications.
In general it is not recommended in a normal processing pipeline as it may result
in edge artifacts. Use with caution.
"""
def __init__(
self,
info,
l_freq,
h_freq,
picks=None,
filter_length="auto",
l_trans_bandwidth="auto",
h_trans_bandwidth="auto",
n_jobs=None,
method="fir",
iir_params=None,
fir_design="firwin",
*,
verbose=None,
):
self.info = info
self.l_freq = l_freq
self.h_freq = h_freq
self.picks = _picks_to_idx(info, picks)
self.filter_length = filter_length
self.l_trans_bandwidth = l_trans_bandwidth
self.h_trans_bandwidth = h_trans_bandwidth
self.n_jobs = n_jobs
self.method = method
self.iir_params = iir_params
self.fir_design = fir_design
def fit(self, epochs_data, y):
"""Filter data.
Parameters
----------
epochs_data : array, shape (n_epochs, n_channels, n_times)
The data.
y : array, shape (n_epochs,)
The label for each epoch.
Returns
-------
self : instance of FilterEstimator
The modified instance.
"""
if not isinstance(epochs_data, np.ndarray):
raise ValueError(
f"epochs_data should be of type ndarray (got {type(epochs_data)})."
)
if self.picks is None:
self.picks = pick_types(
self.info, meg=True, eeg=True, ref_meg=False, exclude=[]
)
if self.l_freq == 0:
self.l_freq = None
if self.h_freq is not None and self.h_freq > (self.info["sfreq"] / 2.0):
self.h_freq = None
if self.l_freq is not None and not isinstance(self.l_freq, float):
self.l_freq = float(self.l_freq)
if self.h_freq is not None and not isinstance(self.h_freq, float):
self.h_freq = float(self.h_freq)
if self.info["lowpass"] is None or (
self.h_freq is not None
and (self.l_freq is None or self.l_freq < self.h_freq)
and self.h_freq < self.info["lowpass"]
):
with self.info._unlock():
self.info["lowpass"] = self.h_freq
if self.info["highpass"] is None or (
self.l_freq is not None
and (self.h_freq is None or self.l_freq < self.h_freq)
and self.l_freq > self.info["highpass"]
):
with self.info._unlock():
self.info["highpass"] = self.l_freq
return self
def transform(self, epochs_data):
"""Filter data.
Parameters
----------
epochs_data : array, shape (n_epochs, n_channels, n_times)
The data.
Returns
-------
X : array, shape (n_epochs, n_channels, n_times)
The data after filtering.
"""
if not isinstance(epochs_data, np.ndarray):
raise ValueError(
f"epochs_data should be of type ndarray (got {type(epochs_data)})."
)
epochs_data = np.atleast_3d(epochs_data)
return filter_data(
epochs_data,
self.info["sfreq"],
self.l_freq,
self.h_freq,
self.picks,
self.filter_length,
self.l_trans_bandwidth,
self.h_trans_bandwidth,
method=self.method,
iir_params=self.iir_params,
n_jobs=self.n_jobs,
copy=False,
fir_design=self.fir_design,
verbose=False,
)
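A hedged usage sketch (synthetic Info and data; band edges are illustrative), mainly relevant for realtime pipelines as noted above:

import numpy as np
import mne
from mne.decoding import FilterEstimator

info = mne.create_info(4, 100.0, "eeg")
rng = np.random.default_rng(0)
epochs_data = rng.standard_normal((10, 4, 500))  # (n_epochs, n_channels, n_times)
filt = FilterEstimator(info, l_freq=1.0, h_freq=30.0)
X = filt.fit(epochs_data, y=np.zeros(10)).transform(epochs_data)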
class UnsupervisedSpatialFilter(TransformerMixin, BaseEstimator):
"""Use unsupervised spatial filtering across time and samples.
Parameters
----------
estimator : instance of sklearn.base.BaseEstimator
Estimator using some decomposition algorithm.
average : bool, default False
If True, the estimator is fitted on the average across samples
(e.g. epochs).
"""
def __init__(self, estimator, average=False):
# XXX: Use _check_estimator #3381
for attr in ("fit", "transform", "fit_transform"):
if not hasattr(estimator, attr):
raise ValueError(
"estimator must be a scikit-learn "
f"transformer, missing {attr} method"
)
if not isinstance(average, bool):
raise ValueError(
f"average parameter must be of bool type, got {type(bool)} instead"
)
self.estimator = estimator
self.average = average
def fit(self, X, y=None):
"""Fit the spatial filters.
Parameters
----------
X : array, shape (n_epochs, n_channels, n_times)
The data to be filtered.
y : None | array, shape (n_samples,)
Used for scikit-learn compatibility.
Returns
-------
self : instance of UnsupervisedSpatialFilter
Return the modified instance.
"""
if self.average:
X = np.mean(X, axis=0).T
else:
n_epochs, n_channels, n_times = X.shape
# trial as time samples
X = np.transpose(X, (1, 0, 2)).reshape((n_channels, n_epochs * n_times)).T
self.estimator.fit(X)
return self
def fit_transform(self, X, y=None):
"""Transform the data to its filtered components after fitting.
Parameters
----------
X : array, shape (n_epochs, n_channels, n_times)
The data to be filtered.
y : None | array, shape (n_samples,)
Used for scikit-learn compatibility.
Returns
-------
X : array, shape (n_epochs, n_channels, n_times)
The transformed data.
"""
return self.fit(X).transform(X)
def transform(self, X):
"""Transform the data to its spatial filters.
Parameters
----------
X : array, shape (n_epochs, n_channels, n_times)
The data to be filtered.
Returns
-------
X : array, shape (n_epochs, n_channels, n_times)
The transformed data.
"""
return self._apply_method(X, "transform")
def inverse_transform(self, X):
"""Inverse transform the data to its original space.
Parameters
----------
X : array, shape (n_epochs, n_components, n_times)
The data to be inverted.
Returns
-------
X : array, shape (n_epochs, n_channels, n_times)
The transformed data.
"""
return self._apply_method(X, "inverse_transform")
def _apply_method(self, X, method):
"""Vectorize time samples as trials, apply method and reshape back.
Parameters
----------
X : array, shape (n_epochs, n_dims, n_times)
The data to be inverted.
Returns
-------
X : array, shape (n_epochs, n_dims, n_times)
The transformed data.
"""
n_epochs, n_channels, n_times = X.shape
# trial as time samples
X = np.transpose(X, [1, 0, 2])
X = np.reshape(X, [n_channels, n_epochs * n_times]).T
# apply method
method = getattr(self.estimator, method)
X = method(X)
# put it back to n_epochs, n_dimensions
X = np.reshape(X.T, [-1, n_epochs, n_times]).transpose([1, 0, 2])
return X
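A hedged usage sketch: PCA as an unsupervised spatial filter over concatenated time samples (component count and shapes are illustrative):

import numpy as np
from sklearn.decomposition import PCA
from mne.decoding import UnsupervisedSpatialFilter

rng = np.random.default_rng(0)
X = rng.standard_normal((10, 8, 100))  # (n_epochs, n_channels, n_times)
usf = UnsupervisedSpatialFilter(PCA(n_components=4))
X_pca = usf.fit_transform(X)  # (10, 4, 100)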
@fill_doc
class TemporalFilter(TransformerMixin):
"""Estimator to filter data array along the last dimension.
Applies a zero-phase low-pass, high-pass, band-pass, or band-stop
filter to the channels.
l_freq and h_freq are the frequencies below which and above which,
respectively, to filter out of the data. Thus the uses are:
- l_freq < h_freq: band-pass filter
- l_freq > h_freq: band-stop filter
- l_freq is not None, h_freq is None: low-pass filter
- l_freq is None, h_freq is not None: high-pass filter
See :func:`mne.filter.filter_data`.
Parameters
----------
l_freq : float | None
Low cut-off frequency in Hz. If None the data are only low-passed.
h_freq : float | None
High cut-off frequency in Hz. If None the data are only
high-passed.
sfreq : float, default 1.0
Sampling frequency in Hz.
filter_length : str | int, default 'auto'
Length of the FIR filter to use (if applicable):
* int: specified length in samples.
* 'auto' (default in 0.14): the filter length is chosen based
on the size of the transition regions (7 times the reciprocal
of the shortest transition band).
* str: (default in 0.13 is "10s") a human-readable time in
units of "s" or "ms" (e.g., "10s" or "5500ms") will be
converted to that number of samples if ``phase="zero"``, or
the shortest power-of-two length at least that duration for
``phase="zero-double"``.
l_trans_bandwidth : float | str
Width of the transition band at the low cut-off frequency in Hz
(high pass or cutoff 1 in bandpass). Can be "auto"
(default in 0.14) to use a multiple of ``l_freq``::
min(max(l_freq * 0.25, 2), l_freq)
Only used for ``method='fir'``.
h_trans_bandwidth : float | str
Width of the transition band at the high cut-off frequency in Hz
(low pass or cutoff 2 in bandpass). Can be "auto"
(default in 0.14) to use a multiple of ``h_freq``::
min(max(h_freq * 0.25, 2.), info['sfreq'] / 2. - h_freq)
Only used for ``method='fir'``.
n_jobs : int | str, default 1
Number of jobs to run in parallel.
Can be 'cuda' if ``cupy`` is installed properly and method='fir'.
method : str, default 'fir'
'fir' will use overlap-add FIR filtering, 'iir' will use IIR
forward-backward filtering (via filtfilt).
iir_params : dict | None, default None
Dictionary of parameters to use for IIR filtering.
See mne.filter.construct_iir_filter for details. If iir_params
is None and method="iir", 4th order Butterworth will be used.
fir_window : str, default 'hamming'
The window to use in FIR design, can be "hamming", "hann",
or "blackman".
fir_design : str
Can be "firwin" (default) to use :func:`scipy.signal.firwin`,
or "firwin2" to use :func:`scipy.signal.firwin2`. "firwin" uses
a time-domain design technique that generally gives improved
attenuation using fewer samples than "firwin2".
.. versionadded:: 0.15
%(verbose)s
See Also
--------
FilterEstimator
Vectorizer
mne.filter.filter_data
"""
@verbose
def __init__(
self,
l_freq=None,
h_freq=None,
sfreq=1.0,
filter_length="auto",
l_trans_bandwidth="auto",
h_trans_bandwidth="auto",
n_jobs=None,
method="fir",
iir_params=None,
fir_window="hamming",
fir_design="firwin",
*,
verbose=None,
):
self.l_freq = l_freq
self.h_freq = h_freq
self.sfreq = sfreq
self.filter_length = filter_length
self.l_trans_bandwidth = l_trans_bandwidth
self.h_trans_bandwidth = h_trans_bandwidth
self.n_jobs = n_jobs
self.method = method
self.iir_params = iir_params
self.fir_window = fir_window
self.fir_design = fir_design
        # n_jobs may be an int, None, or "cuda"; reject anything else
        if (
            self.n_jobs is not None
            and not isinstance(self.n_jobs, int)
            and self.n_jobs != "cuda"
        ):
            raise ValueError(
                f'n_jobs must be int or "cuda", got {type(self.n_jobs)} instead.'
            )
def fit(self, X, y=None):
"""Do nothing (for scikit-learn compatibility purposes).
Parameters
----------
        X : array, shape (n_epochs, n_channels, n_times) or shape (n_channels, n_times)
The data to be filtered over the last dimension. The channels
dimension can be zero when passing a 2D array.
y : None
Not used, for scikit-learn compatibility issues.
Returns
-------
self : instance of TemporalFilter
The modified instance.
""" # noqa: E501
return self
def transform(self, X):
"""Filter data along the last dimension.
Parameters
----------
X : array, shape (n_epochs, n_channels, n_times) or shape (n_channels, n_times)
The data to be filtered over the last dimension. The channels
dimension can be zero when passing a 2D array.
Returns
-------
X : array
The data after filtering.
""" # noqa: E501
X = np.atleast_2d(X)
if X.ndim > 3:
raise ValueError(
"Array must be of at max 3 dimensions instead "
f"got {X.ndim} dimensional matrix"
)
shape = X.shape
X = X.reshape(-1, shape[-1])
X = filter_data(
X,
self.sfreq,
self.l_freq,
self.h_freq,
filter_length=self.filter_length,
l_trans_bandwidth=self.l_trans_bandwidth,
h_trans_bandwidth=self.h_trans_bandwidth,
n_jobs=self.n_jobs,
method=self.method,
iir_params=self.iir_params,
copy=False,
fir_window=self.fir_window,
fir_design=self.fir_design,
)
return X.reshape(shape)
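A hedged usage sketch: band-pass filtering a 3D array along the last axis (sampling rate and band edges are illustrative):

import numpy as np
from mne.decoding import TemporalFilter

rng = np.random.default_rng(0)
X = rng.standard_normal((5, 4, 1000))  # (n_epochs, n_channels, n_times)
filt = TemporalFilter(l_freq=1.0, h_freq=30.0, sfreq=100.0)
X_filt = filt.fit_transform(X)  # same shape, band-passed 1-30 Hz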