#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import boto3
import botocore
import os
import os.path as op
import json
from sklearn.utils import shuffle
import numpy as np
import glob
import nibabel as nib
def setup_credentials():
    """
    Set up AWS credentials from access keys into a credentials file.

    The HCP access key ID and secret access key are prompted for
    interactively; this function takes no arguments.

    Notes
    -----
    This function will open/create a file '~/.aws/credentials', which
    will then include a section:

        [hcp]
        aws_access_key_id = XXXXXXXXXXXXXXXX
        aws_secret_access_key = XXXXXXXXXXXXXXXX

    The keys are credentials that you can get from HCP
    (see https://wiki.humanconnectome.org/display/PublicData/How+To+Connect+to+Connectome+Data+via+AWS)
    """
    access_key = input('Enter your HCP ACCESS KEY ID : ')
    secret_access_key = input('Enter your HCP SECRET ACCESS KEY : ')
    aws_dir = op.join(op.expanduser('~'), '.aws')
    os.makedirs(aws_dir, exist_ok=True)
    cred_path = op.join(aws_dir, 'credentials')
    # Open in append mode so existing credentials are preserved, then
    # rewind to look for an existing [hcp] section
    with open(cred_path, 'a+') as cred_file:
        cred_file.seek(0)
        if '[hcp]' in cred_file.read():
            update = input(
                "You have 'hcp' credentials set up already! "
                "Do you wish to update? [y/n] \n")
            if update != 'y':
                return
            print("Appending a new [hcp] section; please remove the "
                  "old one from {} by hand.".format(cred_path))
        cred_file.writelines(['[hcp]\n',
                              'aws_access_key_id = ' + access_key + '\n',
                              'aws_secret_access_key = '
                              + secret_access_key + '\n'])
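

def _example_check_credentials():
    """
    Sketch: confirm that the 'hcp' profile written by `setup_credentials`
    is visible to boto3. `boto3.Session` raises
    `botocore.exceptions.ProfileNotFound` if the section is missing.
    """
    session = boto3.Session(profile_name='hcp')
    print(session.get_credentials().access_key)
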
def explain_HCP():
    """
    Lists all the files in a folder for a subject and explains what
    they are.

    Returns
    -------
    A dictionary mapping each filename to a description of its function.

    Notes
    -----
    This function prints out a directory structure tree for an HCP subject
    with a simple description of what each file is.
    """
    pass
def get_subjects(get_all=True, get_random=1):
    """
    Fetch a list of HCP subjects.

    Parameters
    ----------
    get_all : bool
        If True, return all available subjects
    get_random : int
        Number of subjects to sample at random (used when `get_all` is
        False)

    Returns
    -------
    A list of UIDs for subjects in HCP

    Notes
    -----
    ...
    """
    pass
def get_structural_data(subject_list, scan_type, preprocessed=True,
                        MNISpace=True, out_dir='.'):
"""
Gets structural data for a list of subjects, and stores
them in BIDS-like format in the specified output directory
Parameters
----------
subject_list : list
List of subjects to get data for
scan_type: list
List of types of structural scans to get
preprocessed : bool
Gets preprocessed data
MNISpace : bool
Gets data registered in MNI Space
out_dir : str
Path to output directory
Notes
-----
Local filenames are changed to match our expected conventions.
.. [1] Gorgolewski et al. (2016). The brain imaging data structure, a
format for organizing and describing outputs of neuroimaging experiments.
Scientific Data, 3:: 160044. DOI: 10.1038/sdata.2016.44.
"""
    # The session must be configured before the resource is created, so
    # that the 'hcp' profile's credentials are actually used
    boto3.setup_default_session(profile_name='hcp')
    s3 = boto3.resource('s3')
bucket = s3.Bucket('hcp-openaccess-temp')
root_dir = op.join(out_dir, 'hcp')
os.makedirs(root_dir, exist_ok=True)
if preprocessed and MNISpace:
for subject in subject_list:
subj_anat = op.join(root_dir, 'sub-{}'.format(subject), 'anat')
os.makedirs(subj_anat, exist_ok=True)
src_folder = op.join('HCP_1200', subject, 'MNINonLinear')
for scan in scan_type:
src_file = op.join(src_folder,
'{}_restore_brain.nii.gz'.format(scan))
dst_file = op.join(subj_anat,
'sub-{}_{}.nii.gz'.format(subject, scan))
try:
bucket.download_file(src_file, dst_file)
except botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] == "404":
print("{} does not exist.".format(src_file))
else:
raise
dataset_description = {
"BIDSVersion": "1.0.0",
"Name": "HCP",
"Acknowledgements": """Data were provided by the Human Connectome Project,
WU-Minn Consortium (Principal Investigators: David Van Essen and Kamil
Ugurbil; 1U54MH091657) funded by the 16 NIH Institutes and Centers that
support the NIH Blueprint for Neuroscience Research; and by the McDonnell
Center for Systems Neuroscience at Washington University.""",
"Subjects": subject_list}
with open(op.join(root_dir, 'dataset_description.json'), 'w') as outfile:
json.dump(dataset_description, outfile)
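

def _example_get_structural():
    """
    Sketch of a typical call; the subject ID and output path are
    placeholders, and valid 'hcp' credentials are assumed. This downloads
    MNI-space T1w/T2w volumes into <out_dir>/hcp/sub-<subject>/anat/.
    """
    get_structural_data(['100206'], ['T1w', 'T2w'], out_dir='/tmp/data')
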
def get_resting_data(subject_list,
                     scan_run=["rfMRI_REST1_LR", "rfMRI_REST2_LR",
                               "rfMRI_REST1_RL", "rfMRI_REST2_RL"],
                     preprocessed=True,
                     MNISpace=True, out_dir='.'):
"""
Gets resting data for runs for a list of subjects, and stores
them in BIDS-like format in the specified output directory
Parameters
----------
subject_list : list
List of subjects to get data for
scan_run: list
List of types of structural scans to get
preprocessed : bool
Gets preprocessed data
MNISpace : bool
Gets data registered in MNI Space
out_dir : str
Path to output directory
Notes
-----
Local filenames are changed to match our expected conventions.
.. [1] Gorgolewski et al. (2016). The brain imaging data structure, a
format for organizing and describing outputs of neuroimaging experiments.
Scientific Data, 3:: 160044. DOI: 10.1038/sdata.2016.44.
"""
    # As above, configure the session before creating the resource
    boto3.setup_default_session(profile_name='hcp')
    s3 = boto3.resource('s3')
bucket = s3.Bucket('hcp-openaccess-temp')
root_dir = op.join(out_dir, 'hcp')
os.makedirs(root_dir, exist_ok=True)
if preprocessed and MNISpace:
for subject in subject_list:
subj_func = op.join(root_dir, 'sub-{}'.format(subject), 'func')
os.makedirs(subj_func, exist_ok=True)
src_folder = op.join('HCP_1200', subject, 'MNINonLinear',
'Results')
for scan in scan_run:
                src_file = op.join(src_folder, scan,
                                   '{}_Atlas_MSMAll.dtseries.nii'.format(scan))
                # The source is a CIFTI dense time series, so the local
                # copy keeps the .dtseries.nii extension (it is not a
                # gzipped NIfTI volume)
                dst_file = op.join(
                    subj_func,
                    'sub-{}_task-{}_run-01_bold.dtseries.nii'.format(
                        subject, scan))
try:
bucket.download_file(src_file, dst_file)
except botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] == "404":
print("{} does not exist.".format(src_file))
else:
raise
dataset_description = {
"BIDSVersion": "1.0.0",
"Name": "HCP",
"Acknowledgements": """Data were provided by the Human Connectome Project WU-Minn Consortium (Principal Investigators: David Van Essen and Kamil Ugurbil 1U54MH091657) funded by the 16 NIH Institutes and Centers that support the NIH Blueprint for Neuroscience Research; and by the McDonnell Center for Systems Neuroscience at Washington University.""",
"Subjects": subject_list}
with open(op.join(root_dir, 'dataset_description.json'), 'w') as outfile:
json.dump(dataset_description, outfile)
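

def _example_get_resting():
    """
    Sketch: download a single resting-state run for one subject. The
    subject ID and output path are placeholders, and valid 'hcp'
    credentials are assumed.
    """
    get_resting_data(['100206'], scan_run=['rfMRI_REST1_LR'],
                     out_dir='/tmp/data')
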
def train_test_split(root: str,
                     scan_type: list,
                     split_folds: tuple = (.7, .2, .1),
                     convert_to_npy: bool = False) -> None:
"""
splits an hcp dataset into train, test, val and converts the .nii.gz
files to .npy for easier processing checks shape to ensure t1 and t2
are same dim
Parameters
----------
root: str
root directory where raw files are stored
split_folds: tuple(float, float, float)
What fraction to divide data in for train, test, val
scan_type: list
What scans to divive into train - test splits
"""
    # One folder per subject is expected directly under `root`
    subject_list = [d for d in os.listdir(root)
                    if op.isdir(op.join(root, d))]
    shuffled_list = shuffle(subject_list, random_state=42)
    n_subjects = len(shuffled_list)
    # Cumulative boundaries for the three splits (e.g. 70% / 20% / 10%)
    train_end = int(n_subjects * split_folds[0])
    test_end = train_end + int(n_subjects * split_folds[1])
    data_splits = [shuffled_list[:train_end],
                   shuffled_list[train_end:test_end],
                   shuffled_list[test_end:]]
    split_names = ['train', 'test', 'val']
    for num, split in enumerate(split_names):
        for subject in data_splits[num]:
            os.makedirs(op.join(root, split, subject), exist_ok=True)
            for scan in scan_type:
                fname = '{}_restore_brain.nii.gz'.format(scan)
                os.rename(op.join(root, subject, fname),
                          op.join(root, split, subject, fname))
            if convert_to_npy:
                # Assumes 'T1w' and 'T2w' were among scan_type, so these
                # files were just moved into the split folder
                t1_np = np.array(nib.load(op.join(
                    root, split, subject,
                    'T1w_restore_brain.nii.gz')).dataobj)
                t2_np = np.array(nib.load(op.join(
                    root, split, subject,
                    'T2w_restore_brain.nii.gz')).dataobj)
                # Crop all volumes to a common field of view
                t1_np = t1_np[2:-2, 27:-28, 40:-45]
                t2_np = t2_np[2:-2, 27:-28, 40:-45]
                assert t1_np.shape == t2_np.shape
                np.save(op.join(root, split, subject, 'T1w.npy'), t1_np)
                np.save(op.join(root, split, subject, 'T2w.npy'), t2_np)
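

def _example_split():
    """
    Sketch: split a directory that holds one folder per subject (each
    containing '<scan>_restore_brain.nii.gz' files) into train/test/val.
    The root path is a placeholder.
    """
    train_test_split('/tmp/data/raw', scan_type=['T1w', 'T2w'],
                     split_folds=(.7, .2, .1), convert_to_npy=True)
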
def fetch_hcp_diffusion(subjects, out_dir='.'):
"""
Fetch HCP diffusion data and arrange it in a manner that resembles the
BIDS[1]_ specification.
Parameters
----------
subjects: list
Each item is an integer, identifying one of the HCP subjects
Returns
-------
dict with remote and local names of these files.
Notes
-----
To use this function, please setup credentials either manually,
or by running `setup_credentials()`
Local filenames are changed to match our expected conventions.
.. [1] Gorgolewski et al. (2016). The brain imaging data structure,
a format for organizing and describing outputs of neuroimaging
experiments. Scientific Data, 3:: 160044. DOI: 10.1038/sdata.2016.44.
"""
boto3.setup_default_session(profile_name='hcp')
s3 = boto3.resource('s3')
bucket = s3.Bucket('hcp-openaccess-temp')
base_dir = op.join(out_dir, "hcp")
    os.makedirs(base_dir, exist_ok=True)
data_files = {}
for subject in subjects:
# We make a single session folder per subject for this case, because
# AFQ api expects session structure:
sub_dir = op.join(base_dir, 'sub-%s' % subject)
sess_dir = op.join(sub_dir, "sess-01")
        # makedirs creates the subject and session folders as needed
        os.makedirs(op.join(sess_dir, 'dwi'), exist_ok=True)
        os.makedirs(op.join(sess_dir, 'anat'), exist_ok=True)
data_files[op.join(sess_dir, 'dwi', 'sub-%s_dwi.bval' % subject)] =\
'HCP/%s/T1w/Diffusion/bvals' % subject
data_files[op.join(sess_dir, 'dwi', 'sub-%s_dwi.bvec' % subject)] =\
'HCP/%s/T1w/Diffusion/bvecs' % subject
data_files[op.join(sess_dir, 'dwi', 'sub-%s_dwi.nii.gz' % subject)] =\
'HCP/%s/T1w/Diffusion/data.nii.gz' % subject
data_files[op.join(sess_dir, 'anat', 'sub-%s_T1w.nii.gz' % subject)] =\
'HCP/%s/T1w/T1w_acpc_dc.nii.gz' % subject
data_files[op.join(sess_dir, 'anat',
'sub-%s_aparc+aseg.nii.gz' % subject)] =\
'HCP/%s/T1w/aparc+aseg.nii.gz' % subject
    for local_path, remote_name in data_files.items():
        if not op.exists(local_path):
            bucket.download_file(remote_name, local_path)
# Create the BIDS dataset description file text
dataset_description = {
"BIDSVersion": "1.0.0",
"Name": "HCP",
"Acknowledgements": """Data were provided by the Human Connectome
Project, WU-Minn Consortium (Principal Investigators: David Van
Essen and Kamil Ugurbil; 1U54MH091657) funded by the 16 NIH Institutes
and Centers that support the NIH Blueprint for Neuroscience Research;
and by the McDonnell Center for Systems Neuroscience at Washington University.""", # noqa
"Subjects": subjects}
with open(op.join(base_dir, 'dataset_description.json'), 'w') as outfile:
json.dump(dataset_description, outfile)
return data_files
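

def _example_fetch_diffusion():
    """
    Sketch: fetch diffusion data for one subject (the integer ID and
    output path are placeholders) and inspect the local-to-remote file
    mapping that is returned.
    """
    data_files = fetch_hcp_diffusion([100206], out_dir='/tmp/data')
    for local_path, remote_name in data_files.items():
        print(local_path, '<-', remote_name)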