Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
TOBIAS_snakemake/Snakefile
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
210 lines (169 sloc)
7.98 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Upper level TOBIAS snake | |
""" | |
import os | |
import subprocess | |
import itertools | |
import glob | |
import datetime | |
snakemake.utils.min_version("5.4") #for checkpoints functionality | |
#-------------------------------------------------------------------------------# | |
#---------------------- SETUP PROCESS-RELATED CONFIGURATION --------------------# | |
#-------------------------------------------------------------------------------# | |
try: | |
CONFIGFILE = str(workflow.overwrite_configfiles[0]) | |
except: | |
CONFIGFILE = str(workflow.overwrite_configfile[0]) | |
wd = os.getcwd() | |
config["wd"] = wd | |
#Get relative position of Snakefile from wd | |
SNAKEFILE = workflow.snakefile | |
SNAKEFILE_DIR = os.path.dirname(SNAKEFILE) | |
config["snakefile"] = SNAKEFILE | |
config["snakefile_dir"] = SNAKEFILE_DIR | |
#Establish snakefile and environment dictionaries | |
snakefiles_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "snakefiles")) #directory for additional snakefiles | |
environments_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "environments")) #directory for conda environment yaml files | |
scripts_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "scripts")) #directory for extra scripts used in workflow | |
#Snake modules used to setup run | |
include: os.path.join(snakefiles_dir, "helper.snake") | |
shell.executable("/bin/bash") | |
#shell.prefix("source ~/.bashrc; ") | |
#Constrain wildcards to not jump into directories | |
wildcard_constraints: | |
condition = "[a-zA-Z0-9\-_\.]+", | |
TF = "[a-zA-Z0-9\-_\.]+", | |
state = "bound|unbound", | |
plotname = "[a-zA-Z0-9\-_\.]+", | |
#Save timestamp to config | |
config["timestamp"] = str(datetime.datetime.now()) | |
#-------------------------------------------------------------------------------# | |
#------------------------- CHECK FORMAT OF CONFIG FILE -------------------------# | |
#-------------------------------------------------------------------------------# | |
required = [("data",), | |
("run_info",), | |
("run_info", "organism"), | |
("run_info", "fasta"), | |
("run_info", "blacklist"), | |
("run_info", "gtf"), | |
("run_info", "motifs"), | |
("run_info", "output"), | |
] | |
#Check if all keys are existing and contain information | |
for key_list in required: | |
lookup_dict = config | |
for key in key_list: | |
try: | |
lookup_dict = lookup_dict[key] | |
if lookup_dict == None: | |
print("ERROR: Missing input for key {0}".format(key_list)) | |
except: | |
print("ERROR: Could not find key(s) \"{0}\" in configfile {1}. Please check that your configfile has right format for TOBIAS.".format(":".join(key_list), CONFIGFILE)) | |
sys.exit() | |
#Check if there is at least one condition with bamfiles | |
if len(config["data"]) > 0: | |
for condition in config["data"]: | |
if len(config["data"][condition]) == 0: | |
print("ERROR: Could not find any bamfiles in \"{0}\" in configfile {1}".format(":".join(("data", condition)), CONFIGFILE)) | |
else: | |
print("ERROR: Could not find any conditions (\"data:\{condition\}\") in configfile {0}".format(CONFIGFILE)) | |
sys.exit() | |
#-------------------------------------------------------------------------------# | |
#------------------------- WHICH FILES/INFO WERE INPUT? ------------------------# | |
#-------------------------------------------------------------------------------# | |
input_files = [] | |
#Files related to experimental data (bam) | |
CONDITION_IDS = list(config["data"].keys()) | |
for condition in CONDITION_IDS: | |
if not isinstance(config["data"][condition], list): | |
config['data'][condition] = [config['data'][condition]] | |
cond_input = [] | |
for f in config['data'][condition]: | |
globbed = glob.glob(f) | |
if len(globbed) == 0: | |
exit("ERROR: Could not find any files matching filename/pattern: {0}".format(f)) | |
else: | |
cond_input.extend(globbed) | |
config["data"][condition] = list(set(cond_input)) #remove duplicates | |
input_files.extend(config['data'][condition]) | |
#Flatfiles independent from experimental data | |
OUTPUTDIR = config['run_info']["output"] | |
FASTA = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['fasta'])) | |
BLACKLIST = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['blacklist'])) | |
GTF = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['gtf'])) | |
#---------- Test that input files exist -----------# | |
input_files.extend([config['run_info']['fasta'], config['run_info']['blacklist'], config['run_info']['gtf']]) | |
for file in input_files: | |
if file != None: | |
full_path = os.path.abspath(file) | |
if not os.path.exists(full_path): | |
exit("ERROR: The following file given in config does not exist: {0}".format(full_path)) | |
#--------------------------------- MOTIFS ------------------------------# | |
#If not list, make it list and glob elements | |
if not isinstance(config['run_info']['motifs'], list): | |
config['run_info']['motifs'] = [config['run_info']['motifs']] | |
motif_input = sum([glob.glob(element) for element in config['run_info']['motifs']], []) | |
#Test if input is directory or file | |
motif_files = [] | |
for path in motif_input: | |
#If input is dir; fetch all input files | |
if os.path.isdir(path): | |
files = os.listdir(path) | |
motif_files.extend([os.path.join(path, f) for f in files]) | |
#If input is file, add to list of files | |
elif os.path.isfile(path): | |
motif_files.append(path) | |
motif_files = list(set(motif_files)) #remove duplicates | |
config['run_info']['motifs'] = sorted(motif_files) | |
#Identify IDS of motifs | |
MOTIF_FILES = {} | |
for file in motif_files: | |
full_file = file | |
with open(full_file) as f: | |
for line in f: | |
if line.startswith("MOTIF"): | |
columns = line.rstrip().split() | |
ID = columns[2] + "_" + columns[1] | |
ID = filafy(ID) | |
elif line.startswith(">"): | |
columns = line.replace(">", "").rstrip().split() | |
ID = columns[1] + "_" + columns[0] | |
ID = filafy(ID) | |
else: | |
continue #no motif names on this line; read next line | |
MOTIF_FILES[ID] = full_file | |
TF_IDS = list(MOTIF_FILES.keys()) | |
#-------------------------------------------------------------------------------# | |
#------------------------ WHICH FILES SHOULD BE CREATED? -----------------------# | |
#-------------------------------------------------------------------------------# | |
output_files = [] | |
id2bam = {condition:{} for condition in CONDITION_IDS} | |
for condition in CONDITION_IDS: | |
config_bams = config['data'][condition] | |
sampleids = [os.path.splitext(os.path.basename(bam))[0] for bam in config_bams] | |
id2bam[condition] = {sampleids[i]:config_bams[i] for i in range(len(sampleids))} # Link sample ids to bams | |
PLOTNAMES = expand("{condition}_{plotname}", condition=CONDITION_IDS, plotname=["aggregate"]) | |
if len(CONDITION_IDS) > 1: | |
PLOTNAMES.extend(["heatmap_comparison", "aggregate_comparison_all", "aggregate_comparison_bound"]) | |
if len(CONDITION_IDS) < 5: #Only show venns for 4 conditions or less | |
PLOTNAMES.append("venn_diagram") | |
output_files.append(os.path.join(OUTPUTDIR, "config.yaml")) | |
output_files.append(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS)) | |
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed"), condition=CONDITION_IDS)) | |
#Visualization | |
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{plotname}.pdf"), plotname=PLOTNAMES)) | |
output_files.append(os.path.join(OUTPUTDIR, "overview", "TF_changes.pdf")) | |
#Wilson | |
#output_files.extend(expand(os.path.join(OUTPUTDIR, "wilson", "data", "{TF}_overview.clarion"), TF=TF_IDS)) | |
output_files.append(os.path.join(OUTPUTDIR, "wilson", "HOW_TO_WILSON.txt")) | |
#-------------------------------------------------------------------------------# | |
#---------------------------------- RUN :-) ------------------------------------# | |
#-------------------------------------------------------------------------------# | |
include: os.path.join(snakefiles_dir, "preprocessing.snake") | |
include: os.path.join(snakefiles_dir, "footprinting.snake") | |
include: os.path.join(snakefiles_dir, "visualization.snake") | |
include: os.path.join(snakefiles_dir, "wilson.snake") | |
rule all: | |
input: | |
output_files | |
message: "Rule all" |