Source code for easyhcp.hcpscraper

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import boto3
import botocore.exceptions
import os
import os.path as op
import json
from sklearn.utils import shuffle
import numpy as np
import glob
import nibabel as nib


def setup_credentials():
    """
    Set up AWS credentials from access keys into a credentials file.

    Prompts for two values:

    access_key : str
        AWS_ACCESS_KEY_ID=XXXXXXXXXXXXXXXX
    secret_access_key : str
        AWS_SECRET_ACCESS_KEY=XXXXXXXXXXXXXXXX

    Notes
    -----
    This function will open/create a file '~/.aws/credentials', which
    will then include a section:

        [hcp]
        aws_access_key_id = XXXXXXXXXXXXXXXX
        aws_secret_access_key = XXXXXXXXXXXXXXXX

    The keys are credentials that you can get from HCP (see
    https://wiki.humanconnectome.org/display/PublicData/How+To+Connect+to+Connectome+Data+via+AWS).
    """
    access_key = input('Enter your HCP ACCESS KEY ID : ')
    secret_access_key = input('Enter your HCP SECRET ACCESS KEY : ')
    aws_dir = op.join(op.expanduser('~'), '.aws')
    os.makedirs(aws_dir, exist_ok=True)
    cred_path = op.join(aws_dir, 'credentials')
    # Read any existing credentials first, so we can detect an [hcp]
    # section without truncating the file.
    contents = ''
    if op.exists(cred_path):
        with open(cred_path) as f:
            contents = f.read()
    if '[hcp]' in contents:
        update = input("You have 'hcp' credentials set up already! "
                       "Do you wish to update? [y/n] \n")
        if update != 'y':
            return
        # Keep everything before the existing [hcp] section; the section
        # itself is rewritten below.
        contents = contents[:contents.index('[hcp]')]
    with open(cred_path, 'w') as cred_file:
        cred_file.write(contents)
        cred_file.writelines([
            '[hcp]\n',
            'aws_access_key_id = ' + access_key + '\n',
            'aws_secret_access_key = ' + secret_access_key + '\n'])
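
# Example usage (a sketch; the keys shown are placeholders, and the
# function prompts for them interactively):
#
#     >>> setup_credentials()
#     Enter your HCP ACCESS KEY ID : AKIAXXXXXXXXXXXXXXXX
#     Enter your HCP SECRET ACCESS KEY : XXXXXXXXXXXXXXXXXXXXXXXX
#
# Afterwards, boto3 can pick up the profile via
# boto3.setup_default_session(profile_name='hcp').
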
def explain_HCP():
    """
    Lists all the files in a folder for a subject and explains what
    they are.

    Returns
    -------
    A dictionary with a filename and function

    Notes
    -----
    This function prints out a directory structure tree for an HCP
    subject with a simple description of what each file is.
    """
    pass


def get_subjects(get_all=True, get_random=1):
    """
    Fetch a list of HCP subjects.

    Parameters
    ----------
    get_all : bool
        Gets a list of all the subjects
    get_random : int
        Gets a random subset of subjects

    Returns
    -------
    A list of UIDs for subjects in HCP

    Notes
    -----
    ...
    """
    pass
def get_structural_data(subject_list, scan_type, preprocessed=True,
                        MNISpace=True, out_dir='.'):
    """
    Gets structural data for a list of subjects and stores it in
    BIDS-like format [1]_ in the specified output directory.

    Parameters
    ----------
    subject_list : list
        List of subjects to get data for
    scan_type : list
        List of types of structural scans to get
    preprocessed : bool
        Gets preprocessed data
    MNISpace : bool
        Gets data registered in MNI space
    out_dir : str
        Path to output directory

    Notes
    -----
    Local filenames are changed to match our expected conventions.

    .. [1] Gorgolewski et al. (2016). The brain imaging data structure,
       a format for organizing and describing outputs of neuroimaging
       experiments. Scientific Data, 3: 160044. DOI: 10.1038/sdata.2016.44.
    """
    # Set up the 'hcp' profile before creating the resource, so that the
    # HCP credentials are actually used for the connection.
    boto3.setup_default_session(profile_name='hcp')
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('hcp-openaccess-temp')
    root_dir = op.join(out_dir, 'hcp')
    os.makedirs(root_dir, exist_ok=True)
    if preprocessed and MNISpace:
        for subject in subject_list:
            subj_anat = op.join(root_dir, 'sub-{}'.format(subject), 'anat')
            os.makedirs(subj_anat, exist_ok=True)
            src_folder = op.join('HCP_1200', subject, 'MNINonLinear')
            for scan in scan_type:
                src_file = op.join(
                    src_folder, '{}_restore_brain.nii.gz'.format(scan))
                dst_file = op.join(
                    subj_anat, 'sub-{}_{}.nii.gz'.format(subject, scan))
                try:
                    bucket.download_file(src_file, dst_file)
                except botocore.exceptions.ClientError as e:
                    if e.response['Error']['Code'] == "404":
                        print("{} does not exist.".format(src_file))
                    else:
                        raise
    dataset_description = {
        "BIDSVersion": "1.0.0",
        "Name": "HCP",
        "Acknowledgements": """Data were provided by the Human Connectome Project, WU-Minn Consortium (Principal Investigators: David Van Essen and Kamil Ugurbil; 1U54MH091657) funded by the 16 NIH Institutes and Centers that support the NIH Blueprint for Neuroscience Research; and by the McDonnell Center for Systems Neuroscience at Washington University.""",  # noqa
        "Subjects": subject_list}
    with open(op.join(root_dir, 'dataset_description.json'), 'w') as outfile:
        json.dump(dataset_description, outfile)
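
# Example usage (a sketch; the subject ID is a placeholder):
#
#     >>> get_structural_data(['100307'], scan_type=['T1w', 'T2w'],
#     ...                     out_dir='/tmp')
#
# This would download, e.g.,
# HCP_1200/100307/MNINonLinear/T1w_restore_brain.nii.gz to
# /tmp/hcp/sub-100307/anat/sub-100307_T1w.nii.gz.
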
def get_resting_data(subject_list,
                     scan_run=["rfMRI_REST1_LR", "rfMRI_REST2_LR",
                               "rfMRI_REST1_RL", "rfMRI_REST2_RL"],
                     preprocessed=True, MNISpace=True, out_dir='.'):
    """
    Gets resting-state data runs for a list of subjects and stores them
    in BIDS-like format [1]_ in the specified output directory.

    Parameters
    ----------
    subject_list : list
        List of subjects to get data for
    scan_run : list
        List of resting-state runs to get
    preprocessed : bool
        Gets preprocessed data
    MNISpace : bool
        Gets data registered in MNI space
    out_dir : str
        Path to output directory

    Notes
    -----
    Local filenames are changed to match our expected conventions.

    .. [1] Gorgolewski et al. (2016). The brain imaging data structure,
       a format for organizing and describing outputs of neuroimaging
       experiments. Scientific Data, 3: 160044. DOI: 10.1038/sdata.2016.44.
    """
    # Set up the 'hcp' profile before creating the resource, so that the
    # HCP credentials are actually used for the connection.
    boto3.setup_default_session(profile_name='hcp')
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('hcp-openaccess-temp')
    root_dir = op.join(out_dir, 'hcp')
    os.makedirs(root_dir, exist_ok=True)
    if preprocessed and MNISpace:
        for subject in subject_list:
            subj_func = op.join(root_dir, 'sub-{}'.format(subject), 'func')
            os.makedirs(subj_func, exist_ok=True)
            src_folder = op.join('HCP_1200', subject, 'MNINonLinear',
                                 'Results')
            for scan in scan_run:
                src_file = op.join(
                    src_folder, scan,
                    '{}_Atlas_MSMAll.dtseries.nii'.format(scan))
                # The source file is a CIFTI dense time series, so the
                # local copy keeps the .dtseries.nii extension.
                dst_file = op.join(
                    subj_func,
                    'sub-{}_task-{}_run-01_bold.dtseries.nii'.format(
                        subject, scan))
                try:
                    bucket.download_file(src_file, dst_file)
                except botocore.exceptions.ClientError as e:
                    if e.response['Error']['Code'] == "404":
                        print("{} does not exist.".format(src_file))
                    else:
                        raise
    dataset_description = {
        "BIDSVersion": "1.0.0",
        "Name": "HCP",
        "Acknowledgements": """Data were provided by the Human Connectome Project, WU-Minn Consortium (Principal Investigators: David Van Essen and Kamil Ugurbil; 1U54MH091657) funded by the 16 NIH Institutes and Centers that support the NIH Blueprint for Neuroscience Research; and by the McDonnell Center for Systems Neuroscience at Washington University.""",  # noqa
        "Subjects": subject_list}
    with open(op.join(root_dir, 'dataset_description.json'), 'w') as outfile:
        json.dump(dataset_description, outfile)
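
# Example usage (a sketch; the subject ID is a placeholder):
#
#     >>> get_resting_data(['100307'], scan_run=['rfMRI_REST1_LR'],
#     ...                  out_dir='/tmp')
#
# This downloads the MSMAll-aligned dense time series for the run to
# /tmp/hcp/sub-100307/func/sub-100307_task-rfMRI_REST1_LR_run-01_bold.dtseries.nii.
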
def train_test_split(root: str, split_folds: tuple, scan_type: list,
                     convert_to_npy: bool = False) -> None:
    """
    Splits an HCP dataset into train, test, and val folds, and
    optionally converts the .nii.gz files to .npy for easier
    processing, checking shapes to ensure that T1 and T2 have the same
    dimensions.

    Parameters
    ----------
    root : str
        Root directory where raw files are stored
    split_folds : tuple(float, float, float)
        What fraction of the data to put in train, test, and val
        (e.g., (.7, .2, .1))
    scan_type : list
        What scans to divide into train/test/val splits
    convert_to_npy : bool
        Whether to also save cropped .npy versions of the scans
    """
    # Each entry under root is assumed to be one subject's folder.
    os.chdir(root)
    subject_list = glob.glob('*')
    shuffled_list = shuffle(subject_list, random_state=42)
    n_subjects = len(shuffled_list)
    # Cumulative integer boundaries for the three folds.
    n_train = int(np.floor(n_subjects * split_folds[0]))
    n_test = int(np.floor(n_subjects * split_folds[1]))
    train_list = shuffled_list[:n_train]
    test_list = shuffled_list[n_train:n_train + n_test]
    val_list = shuffled_list[n_train + n_test:]
    data_splits = [train_list, test_list, val_list]
    split_names = ['train', 'test', 'val']
    for num, split in enumerate(split_names):
        split_list = data_splits[num]
        for subject in split_list:
            os.makedirs(op.join(root, split, subject), exist_ok=True)
            for scan in scan_type:
                name = '{}_restore_brain.nii.gz'.format(scan)
                os.rename(op.join(root, subject, name),
                          op.join(root, split, subject, name))
            if convert_to_npy:
                # Assumes scan_type includes 'T1w' and 'T2w' (the HCP
                # naming used above); the .npy filenames are our own
                # convention.
                t1_np = np.array(nib.load(op.join(
                    root, split, subject,
                    'T1w_restore_brain.nii.gz')).dataobj)
                t2_np = np.array(nib.load(op.join(
                    root, split, subject,
                    'T2w_restore_brain.nii.gz')).dataobj)
                # Crop both volumes to a common field of view.
                t1_np = t1_np[2:-2, 27:-28, 40:-45]
                t2_np = t2_np[2:-2, 27:-28, 40:-45]
                assert t1_np.shape == t2_np.shape
                np.save(op.join(root, split, subject, 'T1w.npy'), t1_np)
                np.save(op.join(root, split, subject, 'T2w.npy'), t2_np)
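
# Example usage (a sketch; assumes root contains one folder per subject,
# each holding files like T1w_restore_brain.nii.gz):
#
#     >>> train_test_split('/data/hcp_raw', (.7, .2, .1),
#     ...                  scan_type=['T1w', 'T2w'],
#     ...                  convert_to_npy=True)
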
def fetch_hcp_diffusion(subjects, out_dir='.'):
    """
    Fetch HCP diffusion data and arrange it in a manner that resembles
    the BIDS [1]_ specification.

    Parameters
    ----------
    subjects : list
        Each item is an integer, identifying one of the HCP subjects
    out_dir : str
        Path to output directory

    Returns
    -------
    dict with remote and local names of these files.

    Notes
    -----
    To use this function, please set up credentials, either manually or
    by running `setup_credentials()`.

    Local filenames are changed to match our expected conventions.

    .. [1] Gorgolewski et al. (2016). The brain imaging data structure,
       a format for organizing and describing outputs of neuroimaging
       experiments. Scientific Data, 3: 160044. DOI: 10.1038/sdata.2016.44.
    """
    boto3.setup_default_session(profile_name='hcp')
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('hcp-openaccess-temp')
    base_dir = op.join(out_dir, "hcp")
    os.makedirs(base_dir, exist_ok=True)
    data_files = {}
    for subject in subjects:
        # We make a single session folder per subject for this case,
        # because the AFQ api expects a session structure:
        sub_dir = op.join(base_dir, 'sub-%s' % subject)
        sess_dir = op.join(sub_dir, "sess-01")
        os.makedirs(op.join(sess_dir, 'dwi'), exist_ok=True)
        os.makedirs(op.join(sess_dir, 'anat'), exist_ok=True)
        data_files[op.join(sess_dir, 'dwi', 'sub-%s_dwi.bval' % subject)] = \
            'HCP/%s/T1w/Diffusion/bvals' % subject
        data_files[op.join(sess_dir, 'dwi', 'sub-%s_dwi.bvec' % subject)] = \
            'HCP/%s/T1w/Diffusion/bvecs' % subject
        data_files[op.join(sess_dir, 'dwi', 'sub-%s_dwi.nii.gz' % subject)] = \
            'HCP/%s/T1w/Diffusion/data.nii.gz' % subject
        data_files[op.join(sess_dir, 'anat', 'sub-%s_T1w.nii.gz' % subject)] = \
            'HCP/%s/T1w/T1w_acpc_dc.nii.gz' % subject
        data_files[op.join(sess_dir, 'anat',
                           'sub-%s_aparc+aseg.nii.gz' % subject)] = \
            'HCP/%s/T1w/aparc+aseg.nii.gz' % subject
    for k in data_files.keys():
        if not op.exists(k):
            bucket.download_file(data_files[k], k)
    # Create the BIDS dataset description file text
    dataset_description = {
        "BIDSVersion": "1.0.0",
        "Name": "HCP",
        "Acknowledgements": """Data were provided by the Human Connectome Project, WU-Minn Consortium (Principal Investigators: David Van Essen and Kamil Ugurbil; 1U54MH091657) funded by the 16 NIH Institutes and Centers that support the NIH Blueprint for Neuroscience Research; and by the McDonnell Center for Systems Neuroscience at Washington University.""",  # noqa
        "Subjects": subjects}
    with open(op.join(base_dir, 'dataset_description.json'), 'w') as outfile:
        json.dump(dataset_description, outfile)
    return data_files
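
# Example usage (a sketch; the subject ID is a placeholder, and credentials
# must already be configured, e.g. via setup_credentials()):
#
#     >>> files = fetch_hcp_diffusion([100307], out_dir='/tmp')
#
# The returned dict maps each local filename (e.g.
# /tmp/hcp/sub-100307/sess-01/dwi/sub-100307_dwi.nii.gz) to the S3 key it
# was fetched from (e.g. HCP/100307/T1w/Diffusion/data.nii.gz).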