# 01_download_data.py

################################################################################
## Download competition data for the
## ENCODE-DREAM in vivo Transcription Factor Binding Site Prediction Challenge
################################################################################

import shutil
import synapseclient
from synapseclient import Project, Folder, File
import sys
import os

# Open a Synapse client session; `syn` is the handle used for every request
# made by this script.
syn = synapseclient.Synapse()

# If you haven't set up a .synapseConfig file, you'll have to supply credentials
syn.login()

# Single-argument call form: prints the same text under Python 2 and Python 3.
print("Make sure you've accepted the terms of use before running this script!")

# Destination directory for the downloaded files. Pass it as the first
# command-line argument, or set the DREAM_DATA environment variable; the
# argument wins when both are given. Raises KeyError if neither is provided.
# A trailing "/" is appended so sub-paths can be concatenated directly onto it.
if len(sys.argv) > 1:
    datadir = sys.argv[1] + "/"
else:
    datadir = os.environ["DREAM_DATA"] + "/"

# -------------------------------------------------------------------------------
# All Challenge Data available for download:
# (* indicates data from the 'Essential Data Collection' - see https://www.synapse.org/#!Synapse:syn6131484/wiki/402033 )
# -------------------------------------------------------------------------------
# * ChIPseq fold_change_signal = syn6181334
# * ChIPseq labels = syn6181335
# * ChIPseq peaks conservative = syn6181337
# * ChIPseq peaks relaxed = syn6181338
# DNASE bams = syn6176232
# * DNASE fold_coverage_wiggles = syn6176233
# * DNASE peaks conservative = syn6176235
# * DNASE peaks relaxed = syn6176236
# * RNAseq = syn6176231
# * annotations = syn6184307
# -------------------------------------------------------------------------------

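# As written, this script downloads every Synapse folder listed in `folders`
# below. That is not exactly the starred Essential Data Collection: the peaks
# folders listed above are not included, while DNASE bams and several ChIPseq
# folders that do not appear in the list above are.
# MODIFY THIS DICT TO INCLUDE THE SYNAPSE IDS OF DATA TYPES YOU WANT TO DOWNLOAD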
# Maps each Synapse folder ID to the sub-directory (relative to the data
# directory) that receives that folder's files.
folders = {
    # ChIP-seq data
    'syn7413983': 'ChIPseq/labels',
    'syn6181334': 'ChIPseq/fold_change_signal',
    # DNase data
    'syn6176233': 'essential_training_data/DNASE/fold_coverage_wiggles',
    'syn6176232': 'essential_training_data/DNASE/bams',
    # RNA-seq data
    'syn6176231': 'RNAseq/',
    # Annotations
    'syn6184307': 'annotations/',
    # ChIP-seq data for the within-celltype phase
    'syn8077511': 'ChIPseq/within_cell/',
    # ChIP-seq data (Post-challenge release)
    'syn8442975': 'ChIPseq/heldout_celltypes/',
    'syn8441154': 'ChIPseq/heldout_chr/',
}


# Download every file of each configured Synapse folder into that folder's
# sub-directory of `datadir`, then end the session.
# NOTE(review): `Synapse.query` is kept exactly as the script had it; newer
# synapseclient releases reportedly replace it with `Synapse.getChildren` --
# confirm against the installed client version.
try:
    for folder in folders:
        # Directory that receives this folder's files.
        destination = datadir + folders[folder]

        # Get folder (the entity is only used for the progress message)
        folder_ = syn.get(folder)
        print('Downloading contents of %s folder (%s)\n' % (folder_.name, folder_.id,))

        # Query for child entities
        query_results = syn.query('select id,name from file where parentId=="%s"' % folder)

        if not os.path.exists(destination):
            os.makedirs(destination)

        # Download all data files
        for entity in query_results['results']:
            # This one file entity is deliberately skipped.
            # NOTE(review): the reason is not recorded in this script -- confirm.
            if 'syn7444261' == entity['file.id']:
                continue
            # Two spaces after the colon reproduce the original two-item print.
            print('\tDownloading file:  %s' % (entity['file.name'],))
            data_file = syn.get(entity['file.id'])
            shutil.copy2(data_file.path, destination)
            # Delete the copy at the path Synapse downloaded to now that it has
            # been copied to the destination (presumably the client's cache --
            # the original carried a commented-out `syn.cache.remove` here).
            # Best-effort: a failure is reported but does not stop the run.
            try:
                os.remove(data_file.path)
            except OSError as err:
                sys.stderr.write('\tWarning: could not remove %s: %s\n' % (data_file.path, err))
        print('Downloading '+folder+' complete!')
finally:
    # Always end the session, even if a download fails part-way through.
    syn.logout()
	################################################################################
	## Download competition data for the
	## ENCODE-DREAM in vivo Transcription Factor Binding Site Prediction Challenge
	################################################################################

	import shutil
	import synapseclient
	from synapseclient import Project, Folder, File
	import sys
	import os

	# Open a Synapse client session; `syn` is the handle used for every request
	# made by this script.
	syn = synapseclient.Synapse()

	# If you haven't set up a .synapseConfig file, you'll have to supply credentials
	syn.login()

	# Single-argument call form: prints the same text under Python 2 and Python 3.
	print("Make sure you've accepted the terms of use before running this script!")

	# Destination directory for the downloaded files. Pass it as the first
	# command-line argument, or set the DREAM_DATA environment variable; the
	# argument wins when both are given. Raises KeyError if neither is provided.
	# A trailing "/" is appended so sub-paths can be concatenated directly onto it.
	if len(sys.argv) > 1:
		datadir = sys.argv[1] + "/"
	else:
		datadir = os.environ["DREAM_DATA"] + "/"

	# -------------------------------------------------------------------------------
	# All Challenge Data available for download:
	# (* indicates data from the 'Essential Data Collection' - see https://www.synapse.org/#!Synapse:syn6131484/wiki/402033 )
	# -------------------------------------------------------------------------------
	# * ChIPseq fold_change_signal = syn6181334
	# * ChIPseq labels = syn6181335
	# * ChIPseq peaks conservative = syn6181337
	# * ChIPseq peaks relaxed = syn6181338
	# DNASE bams = syn6176232
	# * DNASE fold_coverage_wiggles = syn6176233
	# * DNASE peaks conservative = syn6176235
	# * DNASE peaks relaxed = syn6176236
	# * RNAseq = syn6176231
	# * annotations = syn6184307
	# -------------------------------------------------------------------------------

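	# As written, this script downloads every Synapse folder listed in `folders`
	# below. That is not exactly the starred Essential Data Collection: the peaks
	# folders listed above are not included, while DNASE bams and several ChIPseq
	# folders that do not appear in the list above are.
	# MODIFY THIS DICT TO INCLUDE THE SYNAPSE IDS OF DATA TYPES YOU WANT TO DOWNLOAD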
	# Maps each Synapse folder ID to the sub-directory (relative to the data
	# directory) that receives that folder's files.
	folders = {
		# ChIP-seq data
		'syn7413983': 'ChIPseq/labels',
		'syn6181334': 'ChIPseq/fold_change_signal',
		# DNase data
		'syn6176233': 'essential_training_data/DNASE/fold_coverage_wiggles',
		'syn6176232': 'essential_training_data/DNASE/bams',
		# RNA-seq data
		'syn6176231': 'RNAseq/',
		# Annotations
		'syn6184307': 'annotations/',
		# ChIP-seq data for the within-celltype phase
		'syn8077511': 'ChIPseq/within_cell/',
		# ChIP-seq data (Post-challenge release)
		'syn8442975': 'ChIPseq/heldout_celltypes/',
		'syn8441154': 'ChIPseq/heldout_chr/',
	}


	# Download every file of each configured Synapse folder into that folder's
	# sub-directory of `datadir`, then end the session.
	# NOTE(review): `Synapse.query` is kept exactly as the script had it; newer
	# synapseclient releases reportedly replace it with `Synapse.getChildren` --
	# confirm against the installed client version.
	try:
		for folder in folders:
			# Directory that receives this folder's files.
			destination = datadir + folders[folder]

			# Get folder (the entity is only used for the progress message)
			folder_ = syn.get(folder)
			print('Downloading contents of %s folder (%s)\n' % (folder_.name, folder_.id,))

			# Query for child entities
			query_results = syn.query('select id,name from file where parentId=="%s"' % folder)

			if not os.path.exists(destination):
				os.makedirs(destination)

			# Download all data files
			for entity in query_results['results']:
				# This one file entity is deliberately skipped.
				# NOTE(review): the reason is not recorded in this script -- confirm.
				if 'syn7444261' == entity['file.id']:
					continue
				# Two spaces after the colon reproduce the original two-item print.
				print('\tDownloading file:  %s' % (entity['file.name'],))
				data_file = syn.get(entity['file.id'])
				shutil.copy2(data_file.path, destination)
				# Delete the copy at the path Synapse downloaded to now that it has
				# been copied to the destination (presumably the client's cache --
				# the original carried a commented-out `syn.cache.remove` here).
				# Best-effort: a failure is reported but does not stop the run.
				try:
					os.remove(data_file.path)
				except OSError as err:
					sys.stderr.write('\tWarning: could not remove %s: %s\n' % (data_file.path, err))
			print('Downloading '+folder+' complete!')
	finally:
		# Always end the session, even if a download fails part-way through.
		syn.logout()