Skip to content
Permalink
5ccee944f2
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
210 lines (169 sloc) 7.98 KB
"""
Upper level TOBIAS snake
"""
import os
import subprocess
import itertools
import glob
import datetime
snakemake.utils.min_version("5.4") #for checkpoints functionality
#-------------------------------------------------------------------------------#
#---------------------- SETUP PROCESS-RELATED CONFIGURATION --------------------#
#-------------------------------------------------------------------------------#
try:
CONFIGFILE = str(workflow.overwrite_configfiles[0])
except:
CONFIGFILE = str(workflow.overwrite_configfile[0])
wd = os.getcwd()
config["wd"] = wd
#Get relative position of Snakefile from wd
SNAKEFILE = workflow.snakefile
SNAKEFILE_DIR = os.path.dirname(SNAKEFILE)
config["snakefile"] = SNAKEFILE
config["snakefile_dir"] = SNAKEFILE_DIR
#Establish snakefile and environment dictionaries
snakefiles_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "snakefiles")) #directory for additional snakefiles
environments_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "environments")) #directory for conda environment yaml files
scripts_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "scripts")) #directory for extra scripts used in workflow
#Snake modules used to setup run
include: os.path.join(snakefiles_dir, "helper.snake")
shell.executable("/bin/bash")
#shell.prefix("source ~/.bashrc; ")
#Constrain wildcards to not jump into directories
wildcard_constraints:
condition = "[a-zA-Z0-9\-_\.]+",
TF = "[a-zA-Z0-9\-_\.]+",
state = "bound|unbound",
plotname = "[a-zA-Z0-9\-_\.]+",
#Save timestamp to config
config["timestamp"] = str(datetime.datetime.now())
#-------------------------------------------------------------------------------#
#------------------------- CHECK FORMAT OF CONFIG FILE -------------------------#
#-------------------------------------------------------------------------------#
required = [("data",),
("run_info",),
("run_info", "organism"),
("run_info", "fasta"),
("run_info", "blacklist"),
("run_info", "gtf"),
("run_info", "motifs"),
("run_info", "output"),
]
#Check if all keys are existing and contain information
for key_list in required:
lookup_dict = config
for key in key_list:
try:
lookup_dict = lookup_dict[key]
if lookup_dict == None:
print("ERROR: Missing input for key {0}".format(key_list))
except:
print("ERROR: Could not find key(s) \"{0}\" in configfile {1}. Please check that your configfile has right format for TOBIAS.".format(":".join(key_list), CONFIGFILE))
sys.exit()
#Check if there is at least one condition with bamfiles
if len(config["data"]) > 0:
for condition in config["data"]:
if len(config["data"][condition]) == 0:
print("ERROR: Could not find any bamfiles in \"{0}\" in configfile {1}".format(":".join(("data", condition)), CONFIGFILE))
else:
print("ERROR: Could not find any conditions (\"data:\{condition\}\") in configfile {0}".format(CONFIGFILE))
sys.exit()
#-------------------------------------------------------------------------------#
#------------------------- WHICH FILES/INFO WERE INPUT? ------------------------#
#-------------------------------------------------------------------------------#
input_files = []
#Files related to experimental data (bam)
CONDITION_IDS = list(config["data"].keys())
for condition in CONDITION_IDS:
if not isinstance(config["data"][condition], list):
config['data'][condition] = [config['data'][condition]]
cond_input = []
for f in config['data'][condition]:
globbed = glob.glob(f)
if len(globbed) == 0:
exit("ERROR: Could not find any files matching filename/pattern: {0}".format(f))
else:
cond_input.extend(globbed)
config["data"][condition] = list(set(cond_input)) #remove duplicates
input_files.extend(config['data'][condition])
#Flatfiles independent from experimental data
OUTPUTDIR = config['run_info']["output"]
FASTA = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['fasta']))
BLACKLIST = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['blacklist']))
GTF = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['gtf']))
#---------- Test that input files exist -----------#
input_files.extend([config['run_info']['fasta'], config['run_info']['blacklist'], config['run_info']['gtf']])
for file in input_files:
if file != None:
full_path = os.path.abspath(file)
if not os.path.exists(full_path):
exit("ERROR: The following file given in config does not exist: {0}".format(full_path))
#--------------------------------- MOTIFS ------------------------------#
#If not list, make it list and glob elements
if not isinstance(config['run_info']['motifs'], list):
config['run_info']['motifs'] = [config['run_info']['motifs']]
motif_input = sum([glob.glob(element) for element in config['run_info']['motifs']], [])
#Test if input is directory or file
motif_files = []
for path in motif_input:
#If input is dir; fetch all input files
if os.path.isdir(path):
files = os.listdir(path)
motif_files.extend([os.path.join(path, f) for f in files])
#If input is file, add to list of files
elif os.path.isfile(path):
motif_files.append(path)
motif_files = list(set(motif_files)) #remove duplicates
config['run_info']['motifs'] = sorted(motif_files)
#Identify IDS of motifs
MOTIF_FILES = {}
for file in motif_files:
full_file = file
with open(full_file) as f:
for line in f:
if line.startswith("MOTIF"):
columns = line.rstrip().split()
ID = columns[2] + "_" + columns[1]
ID = filafy(ID)
elif line.startswith(">"):
columns = line.replace(">", "").rstrip().split()
ID = columns[1] + "_" + columns[0]
ID = filafy(ID)
else:
continue #no motif names on this line; read next line
MOTIF_FILES[ID] = full_file
TF_IDS = list(MOTIF_FILES.keys())
#-------------------------------------------------------------------------------#
#------------------------ WHICH FILES SHOULD BE CREATED? -----------------------#
#-------------------------------------------------------------------------------#
output_files = []
id2bam = {condition:{} for condition in CONDITION_IDS}
for condition in CONDITION_IDS:
config_bams = config['data'][condition]
sampleids = [os.path.splitext(os.path.basename(bam))[0] for bam in config_bams]
id2bam[condition] = {sampleids[i]:config_bams[i] for i in range(len(sampleids))} # Link sample ids to bams
PLOTNAMES = expand("{condition}_{plotname}", condition=CONDITION_IDS, plotname=["aggregate"])
if len(CONDITION_IDS) > 1:
PLOTNAMES.extend(["heatmap_comparison", "aggregate_comparison_all", "aggregate_comparison_bound"])
if len(CONDITION_IDS) < 5: #Only show venns for 4 conditions or less
PLOTNAMES.append("venn_diagram")
output_files.append(os.path.join(OUTPUTDIR, "config.yaml"))
output_files.append(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS))
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed"), condition=CONDITION_IDS))
#Visualization
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{plotname}.pdf"), plotname=PLOTNAMES))
output_files.append(os.path.join(OUTPUTDIR, "overview", "TF_changes.pdf"))
#Wilson
#output_files.extend(expand(os.path.join(OUTPUTDIR, "wilson", "data", "{TF}_overview.clarion"), TF=TF_IDS))
output_files.append(os.path.join(OUTPUTDIR, "wilson", "HOW_TO_WILSON.txt"))
#-------------------------------------------------------------------------------#
#---------------------------------- RUN :-) ------------------------------------#
#-------------------------------------------------------------------------------#
include: os.path.join(snakefiles_dir, "preprocessing.snake")
include: os.path.join(snakefiles_dir, "footprinting.snake")
include: os.path.join(snakefiles_dir, "visualization.snake")
include: os.path.join(snakefiles_dir, "wilson.snake")
rule all:
input:
output_files
message: "Rule all"