"""Upper level TOBIAS snake"""
import glob
import itertools
import os
import subprocess
import sys
#Set config
#Use the configfile handed over via workflow.overwrite_configfile if there is
#one; otherwise fall back to the default 'TOBIAS.config'. CONFIGFILE holds the
#name of the file actually loaded, so later error messages never report "None".
if workflow.overwrite_configfile is not None:
    CONFIGFILE = str(workflow.overwrite_configfile)
else:
    CONFIGFILE = 'TOBIAS.config'
configfile: CONFIGFILE

#Snake modules used to setup run
#NOTE(review): helpers used further down but not defined in this file (e.g. filafy)
#presumably come from this include - confirm.
include: "snakefiles/helper.snake"
#------------------------- CHECK FORMAT OF CONFIG FILE -------------------------#
#Key paths (outermost key first) that have to be present in the config
required = [
    ("data",),
    ("run_info", "organism"),
    ("run_info", "fasta"),
    ("run_info", "blacklist"),
    ("run_info", "gtf"),
    ("run_info", "motifs"),
    ("run_info", "output"),
]

#Check if all keys are existing and contain information
for key_list in required:
    lookup_dict = config
    for key in key_list:
        try:
            lookup_dict = lookup_dict[key]
        except (KeyError, TypeError):
            #KeyError: the key is absent. TypeError: the parent key exists but
            #does not hold a mapping (e.g. it was left empty), so it cannot be descended into.
            sys.exit("ERROR: Could not find key(s) \"{0}\" in configfile {1}. Please check that your configfile has right format for TOBIAS.".format(":".join(key_list), CONFIGFILE))

        #Key exists but has no value: report it and keep checking the remaining keys
        if lookup_dict is None:
            print("ERROR: Missing input for key {0}".format(key_list))
#Check if there is at least one condition with bamfiles
#"not ..." covers both an empty and a missing (None) "data" section
if not config["data"]:
    #Doubled braces keep "{condition}" literal; a single brace would be read as a format field
    sys.exit("ERROR: Could not find any conditions (\"data:{{condition}}\") in configfile {0}".format(CONFIGFILE))

for condition in config["data"]:
    #Report every condition that has no bamfiles assigned (empty list or no value at all)
    if not config["data"][condition]:
        print("ERROR: Could not find any bamfiles in \"data:{0}\" in configfile {1}".format(condition, CONFIGFILE))
#------------------------- WHICH FILES/INFO WERE INPUT? ------------------------#
input_files = []

#Files related to experimental data (bam)
#Every entry below config["data"][condition] is treated as a glob pattern and is
#replaced by the sorted, duplicate-free list of files it matches.
CONDITION_IDS = list(config["data"].keys())
for condition in CONDITION_IDS:
    patterns = config["data"][condition]
    if not patterns:
        sys.exit("ERROR: Could not find any bamfiles in \"data:{0}\" in configfile {1}".format(condition, CONFIGFILE))

    #A single filename/pattern may be given without list syntax
    if not isinstance(patterns, list):
        patterns = [patterns]

    bam_files = set()    #set: overlapping patterns must not yield duplicates
    for pattern in patterns:
        matches = glob.glob(pattern)
        if not matches:
            #A pattern without any match would otherwise vanish silently from the run
            sys.exit("ERROR: Could not find any files matching \"{0}\" given in \"data:{1}\" in configfile {2}".format(pattern, condition, CONFIGFILE))
        bam_files.update(matches)

    #Sorted to keep the order of samples identical between runs
    config["data"][condition] = sorted(bam_files)
#Flatfiles independent from experimental data (run_info)
#Read the flatfile paths and the output directory from the run_info section
_run_info = config['run_info']
FASTA, BLACKLIST, GTF = (_run_info[name] for name in ('fasta', 'blacklist', 'gtf'))
OUTPUTDIR = _run_info["output"]
#MOTIFS = config['run_info']['motifs']

#Only the three flatfiles are queued in input_files
input_files += [FASTA, BLACKLIST, GTF]
#---------- Test that input files exist -----------#
for file in input_files:
    #None entries (no value given) are skipped rather than treated as missing files
    if file is None:
        continue

    full_path = os.path.abspath(file)
    if not os.path.exists(full_path):
        #sys.exit instead of the site-provided exit(), which is not guaranteed to be defined
        sys.exit("ERROR: The following file given in config does not exist: {0}".format(full_path))
#--------------------------------- MOTIFS ------------------------------#
#If not list, make it list and glob elements
if not isinstance(config['run_info']['motifs'], list):
    config['run_info']['motifs'] = [config['run_info']['motifs']]
motif_input = list(itertools.chain.from_iterable(glob.glob(element) for element in config['run_info']['motifs']))

#Test if input is directory or file
motif_files = set()    #set: the same file may be reached through several patterns
for path in motif_input:
    #If input is dir; fetch all input files
    if os.path.isdir(path):
        #Keep regular files only; sub-directories cannot be opened as motif files later on
        dir_entries = (os.path.join(path, f) for f in os.listdir(path))
        motif_files.update(entry for entry in dir_entries if os.path.isfile(entry))

    #If input is file, add to list of files
    elif os.path.isfile(path):
        motif_files.add(path)

#Sorted so that the order of motif files is identical between runs
motif_files = sorted(motif_files)
if not motif_files:
    sys.exit("ERROR: Could not find any motif files for \"run_info:motifs\" given in configfile {0}".format(CONFIGFILE))
config['run_info']['motifs'] = list(motif_files)
#Identify IDS of motifs
#MOTIF_FILES maps every motif ID to the file its header line was read from.
#If the same ID occurs in several files, the file read last wins.
MOTIF_FILES = {}
for motif_file in motif_files:
    with open(motif_file) as f:
        for line in f:
            if line.startswith("MOTIF"):
                #Header of the form "MOTIF <col1> <col2>": drop the leading keyword
                columns = line.rstrip().split()[1:]
            elif line.startswith(">"):
                #Header of the form "><col1> <col2>"
                columns = line.replace(">", "").rstrip().split()
            else:
                continue    #not a header line

            #The ID is built as <col2>_<col1>, so both columns are needed
            if len(columns) < 2:
                sys.exit("ERROR: Header line \"{0}\" in motif file {1} has too few columns to build a motif ID".format(line.rstrip(), motif_file))

            #filafy is not defined in this file
            #NOTE(review): presumably it turns the ID into a filename-safe string - confirm.
            ID = filafy(columns[1] + "_" + columns[0])
            MOTIF_FILES[ID] = motif_file
TF_IDS = list(MOTIF_FILES.keys())
#------------------------ WHICH FILES SHOULD BE CREATED? -----------------------#
output_files = []

#Link sample ids to bams: the sample id is the bam basename without its extension
id2bam = {}
for condition in CONDITION_IDS:
    config_bams = config['data'][condition]
    sampleids = [os.path.splitext(os.path.basename(bam))[0] for bam in config_bams]
    id2bam[condition] = dict(zip(sampleids, config_bams))

#One aggregate plot per condition; comparison plots only if there is something to compare
PLOTNAMES = expand("{condition}_{plotname}", condition=CONDITION_IDS, plotname=["aggregate"])
if len(CONDITION_IDS) > 1:
    PLOTNAMES.extend(["heatmap_comparison", "aggregate_comparison_all", "aggregate_comparison_bound"])

output_files.append(os.path.join(OUTPUTDIR, "config.yaml"))
#extend (not append): expand() returns a list, and output_files is kept a flat list of paths
output_files.extend(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}"), condition=CONDITION_IDS))
#output_files.append(os.path.join(OUTPUTDIR, "TFBS", "bindetect_results.txt"))
#output_files.append(os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt"))
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed"), condition=CONDITION_IDS))
output_files.extend(expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_{plotname}.pdf"), TF=TF_IDS, plotname=PLOTNAMES))
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{plotname}.pdf"), plotname=PLOTNAMES))
output_files.append(os.path.join(OUTPUTDIR, "overview", "TF_changes.pdf"))
output_files.extend(expand(os.path.join(OUTPUTDIR, "wilson", "data", "{TF}_overview.clarion"), TF=TF_IDS))
output_files.append(os.path.join(OUTPUTDIR, "wilson", "HOW_TO_WILSON.txt"))
#------------------------ DEAL WITH SPECIAL ENVIRONMENTS -----------------------#
def _ensure_conda_env(env_name, env_file, label, existing_envs):
    """Create a conda environment from env_file unless env_name is already listed.

    env_name:      name looked up in existing_envs
    env_file:      environment file handed to "conda env create --file"
    label:         name used in the progress message
    existing_envs: names taken from the output of "conda env list"

    Raises subprocess.CalledProcessError if conda exits with a non-zero status,
    so that a failed creation stops the run here.
    """
    if env_name in existing_envs:
        return
    print("Creating {0} environment for the first time".format(label))
    subprocess.check_call(["conda", "env", "create", "--file", env_file])

sys_env = subprocess.check_output(['conda', 'env', 'list'], universal_newlines=True)
#First whitespace-separated token of every non-empty line of the listing
env_list = [fields[0] for fields in (line.split() for line in sys_env.split("\n")) if fields]

# default TOBIAS environment
_ensure_conda_env("TOBIAS_ENV", "environments/tobias.yaml", "TOBIAS", env_list)

# python 2 related envs
_ensure_conda_env("MACS_ENV", "environments/macs.yaml", "macs", env_list)
#---------------------------------- RUN :-) ------------------------------------#
#Include the remaining snakefiles. This happens only after the config has been
#checked and the module-level names above (e.g. CONDITION_IDS, TF_IDS,
#output_files) have been set up.
#NOTE(review): presumably the included files define the rules of the respective
#pipeline stage and rely on those names - confirm before reordering.
include: "snakefiles/preprocessing.snake"
include: "snakefiles/footprinting.snake"
include: "snakefiles/visualization.snake"
include: "snakefiles/wilson.snake"
rule all:
message: "Rule all"