initial commit
This commit is contained in:
307
mne/datasets/_fetch.py
Normal file
307
mne/datasets/_fetch.py
Normal file
@@ -0,0 +1,307 @@
|
||||
# Authors: The MNE-Python contributors.
|
||||
# License: BSD-3-Clause
|
||||
# Copyright the MNE-Python contributors.
|
||||
|
||||
from __future__ import annotations # only needed for Python ≤ 3.9
|
||||
|
||||
import os
|
||||
import os.path as op
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from shutil import rmtree
|
||||
|
||||
from .. import __version__ as mne_version
|
||||
from ..fixes import _compare_version
|
||||
from ..utils import _safe_input, logger, warn
|
||||
from .config import (
|
||||
MISC_VERSIONED,
|
||||
RELEASES,
|
||||
TESTING_VERSIONED,
|
||||
_bst_license_text,
|
||||
)
|
||||
from .utils import (
|
||||
_dataset_version,
|
||||
_do_path_update,
|
||||
_downloader_params,
|
||||
_get_path,
|
||||
_log_time_size,
|
||||
)
|
||||
|
||||
# Sentinel monkeypatched by the test suite: when the dataset name is "fake",
# this value is used as the wanted remote version instead of RELEASES.
# None means "no override".
_FAKE_VERSION = None  # used for monkeypatching while testing versioning
|
||||
|
||||
|
||||
def fetch_dataset(
    dataset_params,
    processor=None,
    path=None,
    force_update=False,
    update_path=True,
    download=True,
    check_version=False,
    return_version=False,
    accept=False,
    auth=None,
    token=None,
) -> Path | tuple[Path, str]:
    """Fetch an MNE-compatible dataset using pooch.

    Parameters
    ----------
    dataset_params : list of dict | dict
        The dataset name(s) and corresponding parameters to download the
        dataset(s). The dataset parameters that contains the following keys:
        ``archive_name``, ``url``, ``folder_name``, ``hash``,
        ``config_key`` (optional). See Notes.
    processor : None | "unzip" | "untar" | instance of pooch.Unzip | instance of pooch.Untar
        What to do after downloading the file. ``"unzip"`` and ``"untar"`` will
        decompress the downloaded file in place; for custom extraction (e.g.,
        only extracting certain files from the archive) pass an instance of
        ``pooch.Unzip`` or ``pooch.Untar``. If ``None`` (the
        default), the files are left as-is.
    path : None | str
        Directory in which to put the dataset. If ``None``, the dataset
        location is determined by first checking whether
        ``dataset_params['config_key']`` is defined, and if so, whether that
        config key exists in the MNE-Python config file. If so, the configured
        path is used; if not, the location is set to the value of the
        ``MNE_DATA`` config key (if it exists), or ``~/mne_data`` otherwise.
    force_update : bool
        Force update of the dataset even if a local copy exists.
        Default is False.
    update_path : bool | None
        If True (default), set the mne-python config to the given
        path. If None, the user is prompted.
    download : bool
        If False and the dataset has not been downloaded yet, it will not be
        downloaded and the path will be returned as ``''`` (empty string). This
        is mostly used for testing purposes and can be safely ignored by most
        users.
    check_version : bool
        Whether to check the version of the dataset or not. Each version
        of the dataset is stored in the root with a ``version.txt`` file.
    return_version : bool
        Whether or not to return the version of the dataset or not.
        Defaults to False.
    accept : bool
        Some MNE-supplied datasets require acceptance of an additional license.
        Default is ``False``.
    auth : tuple | None
        Optional authentication tuple containing the username and
        password/token, passed to ``pooch.HTTPDownloader`` (e.g.,
        ``auth=('foo', 012345)``).
    token : str | None
        Optional authentication token passed to ``pooch.HTTPDownloader``.

    Returns
    -------
    data_path : instance of Path
        The path to the fetched dataset.
    version : str
        Only returned if ``return_version`` is True.

    See Also
    --------
    mne.get_config
    mne.set_config
    mne.datasets.has_dataset

    Notes
    -----
    The ``dataset_params`` argument must contain the following keys:

    - ``archive_name``: The name of the (possibly compressed) file to download
    - ``url``: URL from which the file can be downloaded
    - ``folder_name``: the subfolder within the ``MNE_DATA`` folder in which to
      save and uncompress (if needed) the file(s)
    - ``hash``: the cryptographic hash type of the file followed by a colon and
      then the hash value (examples: "sha256:19uheid...", "md5:upodh2io...")
    - ``config_key`` (optional): key passed to :func:`mne.set_config` to store
      the on-disk location of the downloaded dataset (e.g.,
      ``"MNE_DATASETS_EEGBCI_PATH"``). This will only work for the provided
      datasets listed :ref:`here <datasets>`; do not use for user-defined
      datasets.

    An example would look like::

        {'dataset_name': 'sample',
         'archive_name': 'MNE-sample-data-processed.tar.gz',
         'hash': 'md5:12b75d1cb7df9dfb4ad73ed82f61094f',
         'url': 'https://osf.io/86qa2/download?version=5',
         'folder_name': 'MNE-sample-data',
         'config_key': 'MNE_DATASETS_SAMPLE_PATH'}

    For datasets where a single (possibly compressed) file must be downloaded,
    pass a single :class:`dict` as ``dataset_params``. For datasets where
    multiple files must be downloaded and (optionally) uncompressed separately,
    pass a list of dicts.
    """  # noqa E501
    # Imported lazily so that merely importing mne.datasets does not require
    # pooch to be installed.
    import pooch

    t0 = time.time()

    if auth is not None:
        if len(auth) != 2:
            raise RuntimeError(
                "auth should be a 2-tuple consisting "
                "of a username and password/token."
            )

    # processor to uncompress files
    if processor == "untar":
        processor = pooch.Untar(extract_dir=path)
    elif processor == "unzip":
        processor = pooch.Unzip(extract_dir=path)

    # Normalize to a list so single- and multi-file datasets share one code path.
    if isinstance(dataset_params, dict):
        dataset_params = [dataset_params]

    # extract configuration parameters
    # NOTE(review): config_key and folder_name are taken from the FIRST dict
    # only — multi-file datasets are assumed to share one folder/config key.
    names = [params["dataset_name"] for params in dataset_params]
    name = names[0]
    dataset_dict = dataset_params[0]
    config_key = dataset_dict.get("config_key", None)
    folder_name = dataset_dict["folder_name"]

    # get download path for specific dataset
    path = _get_path(path=path, key=config_key, name=name)

    # get the actual path to each dataset folder name
    final_path = op.join(path, folder_name)

    # handle BrainStorm datasets with nested folders for datasets
    if name.startswith("bst_"):
        final_path = op.join(final_path, name)

    final_path = Path(final_path)

    # additional condition: check for version.txt and parse it
    # check if testing or misc data is outdated; if so, redownload it
    want_version = RELEASES.get(name, None)
    want_version = _FAKE_VERSION if name == "fake" else want_version

    # get the version of the dataset and then check if the version is outdated
    data_version = _dataset_version(final_path, name)
    outdated = want_version is not None and _compare_version(
        want_version, ">", data_version
    )

    if outdated:
        logger.info(
            f"Dataset {name} version {data_version} out of date, "
            f"latest version is {want_version}"
        )
    # Sentinel returned (instead of a real path) when we decline to download.
    empty = Path("")

    # return empty string if outdated dataset and we don't want to download
    if (not force_update) and outdated and not download:
        logger.info(
            "Dataset out of date but force_update=False and download=False, "
            "returning empty data_path"
        )
        return (empty, data_version) if return_version else empty

    # reasons to bail early (hf_sef has separate code for this):
    if (not force_update) and (not outdated) and (not name.startswith("hf_sef_")):
        # ...if target folder exists (otherwise pooch downloads every
        # time because we don't save the archive files after unpacking, so
        # pooch can't check its checksum)
        if op.isdir(final_path):
            if config_key is not None:
                _do_path_update(path, update_path, config_key, name)
            return (final_path, data_version) if return_version else final_path
        # ...if download=False (useful for debugging)
        elif not download:
            return (empty, data_version) if return_version else empty
        # ...if user didn't accept the license
        elif name.startswith("bst_"):
            if accept or "--accept-brainstorm-license" in sys.argv:
                answer = "y"
            else:
                # If they don't have stdin, just accept the license
                # https://github.com/mne-tools/mne-python/issues/8513#issuecomment-726823724  # noqa: E501
                answer = _safe_input(f"{_bst_license_text}Agree (y/[n])? ", use="y")
            if answer.lower() != "y":
                raise RuntimeError("You must agree to the license to use this dataset")
    # downloader & processors
    download_params = _downloader_params(auth=auth, token=token)
    if name == "fake":
        # Keep test output quiet for the synthetic "fake" dataset.
        download_params["progressbar"] = False
    downloader = pooch.HTTPDownloader(**download_params)

    # make mappings from archive names to urls and to checksums
    urls = dict()
    registry = dict()
    for idx, this_name in enumerate(names):
        this_dataset = dataset_params[idx]
        archive_name = this_dataset["archive_name"]
        dataset_url = this_dataset["url"]
        dataset_hash = this_dataset["hash"]
        urls[archive_name] = dataset_url
        registry[archive_name] = dataset_hash

    # create the download manager
    # When a processor will unpack into final_path, the archive itself is
    # staged in the parent directory (`path`) instead.
    use_path = final_path if processor is None else Path(path)
    fetcher = pooch.create(
        path=str(use_path),
        base_url="",  # Full URLs are given in the `urls` dict.
        version=None,  # Data versioning is decoupled from MNE-Python version.
        urls=urls,
        registry=registry,
        retry_if_failed=2,  # 2 retries = 3 total attempts
    )

    # use our logger level for pooch's logger too
    pooch.get_logger().setLevel(logger.getEffectiveLevel())
    # Total bytes downloaded, for the timing/size log line at the end.
    sz = 0

    for idx in range(len(names)):
        # fetch and unpack the data
        archive_name = dataset_params[idx]["archive_name"]
        try:
            fetcher.fetch(
                fname=archive_name, downloader=downloader, processor=processor
            )
        except ValueError as err:
            err = str(err)
            # pooch raises ValueError on checksum mismatch; re-raise with a
            # hint about force_update, everything else propagates unchanged.
            if "hash of downloaded file" in str(err):
                raise ValueError(
                    f"{err} Consider using force_update=True to force "
                    "the dataset to be downloaded again."
                ) from None
            else:
                raise
        fname = use_path / archive_name
        sz += fname.stat().st_size
        # after unpacking, remove the archive file
        if processor is not None:
            fname.unlink()

    # remove version number from "misc" and "testing" datasets folder names
    # (the archives unpack into versioned folder names; rename to the stable
    # folder, replacing any previous copy)
    if name == "misc":
        rmtree(final_path, ignore_errors=True)
        os.replace(op.join(path, MISC_VERSIONED), final_path)
    elif name == "testing":
        rmtree(final_path, ignore_errors=True)
        os.replace(op.join(path, TESTING_VERSIONED), final_path)

    # maybe update the config
    if config_key is not None:
        old_name = "brainstorm" if name.startswith("bst_") else name
        _do_path_update(path, update_path, config_key, old_name)

    # compare the version of the dataset and mne
    # NOTE(review): this reads the version from `path` (the parent dir), while
    # the earlier check used `final_path` — confirm this asymmetry is intended.
    data_version = _dataset_version(path, name)
    # 0.7 < 0.7.git should be False, therefore strip
    # NOTE(review): str.strip(".git") strips the character set {., g, i, t}
    # from both ends, not the ".git" suffix — works for typical version
    # strings but is fragile; removesuffix would state the intent exactly.
    if check_version and (
        _compare_version(data_version, "<", mne_version.strip(".git"))
    ):
        # OK to `nosec` because it's false positive (misidentified as SQL)
        warn(
            f"The {name} dataset (version {data_version}) is older than "
            f"mne-python (version {mne_version}). If the examples fail, "
            f"you may need to update the {name} dataset by using "
            f"mne.datasets.{name}.data_path(force_update=True)"  # nosec B608
        )
    _log_time_size(t0, sz)
    return (final_path, data_version) if return_version else final_path
|
||||
Reference in New Issue
Block a user