Snakefile

"""
Upper level TOBIAS snake
"""

import os
import subprocess
import itertools
import glob
import datetime

snakemake.utils.min_version("5.4")	#for checkpoints functionality

#-------------------------------------------------------------------------------#
#---------------------- SETUP PROCESS-RELATED CONFIGURATION --------------------#
#-------------------------------------------------------------------------------#
try:
	CONFIGFILE = str(workflow.overwrite_configfiles[0])
except:
	CONFIGFILE = str(workflow.overwrite_configfile[0])

wd = os.getcwd()
config["wd"] = wd

#Get relative position of Snakefile from wd
SNAKEFILE = workflow.snakefile
SNAKEFILE_DIR = os.path.dirname(SNAKEFILE)
config["snakefile"] = SNAKEFILE
config["snakefile_dir"] = SNAKEFILE_DIR

#Establish snakefile and environment dictionaries
snakefiles_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "snakefiles"))		#directory for additional snakefiles
environments_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "environments"))	#directory for conda environment yaml files
scripts_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "scripts")) 			#directory for extra scripts used in workflow

#Snake modules used to setup run
include: os.path.join(snakefiles_dir, "helper.snake")
shell.executable("/bin/bash")
#shell.prefix("source ~/.bashrc; ")

#Constrain wildcards to not jump into directories
wildcard_constraints:
	condition = "[a-zA-Z0-9\-_\.]+",
	TF = "[a-zA-Z0-9\-_\.]+",
	state = "bound|unbound",
	plotname = "[a-zA-Z0-9\-_\.]+",

#Save timestamp to config
config["timestamp"] = str(datetime.datetime.now())

#-------------------------------------------------------------------------------#
#------------------------- CHECK FORMAT OF CONFIG FILE -------------------------#
#-------------------------------------------------------------------------------#

required = [("data",),
			("run_info",),
				("run_info", "organism"),
				("run_info", "fasta"),
				("run_info", "blacklist"),
				("run_info", "gtf"),
				("run_info", "motifs"),
				("run_info", "output"),
			]

#Check if all keys are existing and contain information
for key_list in required:
	lookup_dict = config
	for key in key_list:
		try:
			lookup_dict = lookup_dict[key]
			if lookup_dict == None:
				print("ERROR: Missing input for key {0}".format(key_list))
		except:
			print("ERROR: Could not find key(s) \"{0}\" in configfile {1}. Please check that your configfile has right format for TOBIAS.".format(":".join(key_list), CONFIGFILE))
			sys.exit()

#Check if there is at least one condition with bamfiles
if len(config["data"]) > 0:
	for condition in config["data"]:
		if len(config["data"][condition]) == 0:
			print("ERROR: Could not find any bamfiles in \"{0}\" in configfile {1}".format(":".join(("data", condition)), CONFIGFILE))
else:
	print("ERROR: Could not find any conditions (\"data:\{condition\}\") in configfile {0}".format(CONFIGFILE))
	sys.exit()

#-------------------------------------------------------------------------------#
#------------------------- WHICH FILES/INFO WERE INPUT? ------------------------#
#-------------------------------------------------------------------------------#

input_files = []

#Files related to experimental data (bam)
CONDITION_IDS = list(config["data"].keys())
for condition in CONDITION_IDS:
	if not isinstance(config["data"][condition], list):
		config['data'][condition] = [config['data'][condition]]

	cond_input = []
	for f in config['data'][condition]:
		globbed = glob.glob(f)
		if len(globbed) == 0:
			exit("ERROR: Could not find any files matching filename/pattern: {0}".format(f))
		else:
			cond_input.extend(globbed)

	config["data"][condition] = list(set(cond_input))						#remove duplicates
	input_files.extend(config['data'][condition])

#Flatfiles independent from experimental data
OUTPUTDIR = config['run_info']["output"]
FASTA = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['fasta']))
BLACKLIST = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['blacklist']))
GTF = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['gtf']))

#---------- Test that input files exist -----------#

input_files.extend([config['run_info']['fasta'], config['run_info']['blacklist'], config['run_info']['gtf']])
for file in input_files:
	if file != None:
		full_path = os.path.abspath(file)
		if not os.path.exists(full_path):
			exit("ERROR: The following file given in config does not exist: {0}".format(full_path))

#--------------------------------- MOTIFS ------------------------------#

#If not list, make it list and glob elements
if not isinstance(config['run_info']['motifs'], list):
	config['run_info']['motifs'] = [config['run_info']['motifs']]
motif_input = sum([glob.glob(element) for element in config['run_info']['motifs']], [])

#Test if input is directory or file
motif_files = []
for path in motif_input:

	#If input is dir; fetch all input files
	if os.path.isdir(path):
		files = os.listdir(path)
		motif_files.extend([os.path.join(path, f) for f in files])

	#If input is file, add to list of files
	elif os.path.isfile(path):
		motif_files.append(path)

motif_files = list(set(motif_files)) 	#remove duplicates
config['run_info']['motifs'] = sorted(motif_files)

#Identify IDS of motifs
MOTIF_FILES = {}
for file in motif_files:
	full_file = file
	with open(full_file) as f:
		for line in f:
			if line.startswith("MOTIF"):
				columns = line.rstrip().split()
				ID = columns[2] + "_" + columns[1]
				ID = filafy(ID)
			elif line.startswith(">"):
				columns = line.replace(">", "").rstrip().split()
				ID = columns[1] + "_" + columns[0]
				ID = filafy(ID)
			else:
				continue	#no motif names on this line; read next line
			MOTIF_FILES[ID] = full_file

TF_IDS = list(MOTIF_FILES.keys())

#-------------------------------------------------------------------------------#
#------------------------ WHICH FILES SHOULD BE CREATED? -----------------------#
#-------------------------------------------------------------------------------#

output_files = []

id2bam = {condition:{} for condition in CONDITION_IDS}
for condition in CONDITION_IDS:
	config_bams = config['data'][condition]
	sampleids = [os.path.splitext(os.path.basename(bam))[0] for bam in config_bams]
	id2bam[condition] = {sampleids[i]:config_bams[i] for i in range(len(sampleids))}	# Link sample ids to bams

PLOTNAMES = expand("{condition}_{plotname}", condition=CONDITION_IDS, plotname=["aggregate"])
if len(CONDITION_IDS) > 1:
	PLOTNAMES.extend(["heatmap_comparison", "aggregate_comparison_all", "aggregate_comparison_bound"])
	if len(CONDITION_IDS) < 5:	#Only show venns for 4 conditions or less
		PLOTNAMES.append("venn_diagram")


output_files.append(os.path.join(OUTPUTDIR, "config.yaml"))
output_files.append(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS))
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed"), condition=CONDITION_IDS))

#Visualization
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{plotname}.pdf"), plotname=PLOTNAMES))
output_files.append(os.path.join(OUTPUTDIR, "overview", "TF_changes.pdf"))

#Wilson
#output_files.extend(expand(os.path.join(OUTPUTDIR, "wilson", "data", "{TF}_overview.clarion"), TF=TF_IDS))
output_files.append(os.path.join(OUTPUTDIR, "wilson", "HOW_TO_WILSON.txt"))


#-------------------------------------------------------------------------------#
#---------------------------------- RUN :-) ------------------------------------#
#-------------------------------------------------------------------------------#

include: os.path.join(snakefiles_dir, "preprocessing.snake")
include: os.path.join(snakefiles_dir, "footprinting.snake")
include: os.path.join(snakefiles_dir, "visualization.snake")
include: os.path.join(snakefiles_dir, "wilson.snake")

rule all:
	input:
		output_files
	message: "Rule all"
	"""
	Upper level TOBIAS snake
	"""

	import os
	import subprocess
	import itertools
	import glob
	import datetime

	snakemake.utils.min_version("5.4") #for checkpoints functionality

	#-------------------------------------------------------------------------------#
	#---------------------- SETUP PROCESS-RELATED CONFIGURATION --------------------#
	#-------------------------------------------------------------------------------#
	try:
	CONFIGFILE = str(workflow.overwrite_configfiles[0])
	except:
	CONFIGFILE = str(workflow.overwrite_configfile[0])

	wd = os.getcwd()
	config["wd"] = wd

	#Get relative position of Snakefile from wd
	SNAKEFILE = workflow.snakefile
	SNAKEFILE_DIR = os.path.dirname(SNAKEFILE)
	config["snakefile"] = SNAKEFILE
	config["snakefile_dir"] = SNAKEFILE_DIR

	#Establish snakefile and environment dictionaries
	snakefiles_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "snakefiles")) #directory for additional snakefiles
	environments_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "environments")) #directory for conda environment yaml files
	scripts_dir = os.path.abspath(os.path.join(SNAKEFILE_DIR, "scripts")) #directory for extra scripts used in workflow

	#Snake modules used to setup run
	include: os.path.join(snakefiles_dir, "helper.snake")
	shell.executable("/bin/bash")
	#shell.prefix("source ~/.bashrc; ")

	#Constrain wildcards to not jump into directories
	wildcard_constraints:
	condition = "[a-zA-Z0-9\-_\.]+",
	TF = "[a-zA-Z0-9\-_\.]+",
	state = "bound\|unbound",
	plotname = "[a-zA-Z0-9\-_\.]+",

	#Save timestamp to config
	config["timestamp"] = str(datetime.datetime.now())

	#-------------------------------------------------------------------------------#
	#------------------------- CHECK FORMAT OF CONFIG FILE -------------------------#
	#-------------------------------------------------------------------------------#

	required = [("data",),
	("run_info",),
	("run_info", "organism"),
	("run_info", "fasta"),
	("run_info", "blacklist"),
	("run_info", "gtf"),
	("run_info", "motifs"),
	("run_info", "output"),
	]

	#Check if all keys are existing and contain information
	for key_list in required:
	lookup_dict = config
	for key in key_list:
	try:
	lookup_dict = lookup_dict[key]
	if lookup_dict == None:
	print("ERROR: Missing input for key {0}".format(key_list))
	except:
	print("ERROR: Could not find key(s) \"{0}\" in configfile {1}. Please check that your configfile has right format for TOBIAS.".format(":".join(key_list), CONFIGFILE))
	sys.exit()

	#Check if there is at least one condition with bamfiles
	if len(config["data"]) > 0:
	for condition in config["data"]:
	if len(config["data"][condition]) == 0:
	print("ERROR: Could not find any bamfiles in \"{0}\" in configfile {1}".format(":".join(("data", condition)), CONFIGFILE))
	else:
	print("ERROR: Could not find any conditions (\"data:\{condition\}\") in configfile {0}".format(CONFIGFILE))
	sys.exit()

	#-------------------------------------------------------------------------------#
	#------------------------- WHICH FILES/INFO WERE INPUT? ------------------------#
	#-------------------------------------------------------------------------------#

	input_files = []

	#Files related to experimental data (bam)
	CONDITION_IDS = list(config["data"].keys())
	for condition in CONDITION_IDS:
	if not isinstance(config["data"][condition], list):
	config['data'][condition] = [config['data'][condition]]

	cond_input = []
	for f in config['data'][condition]:
	globbed = glob.glob(f)
	if len(globbed) == 0:
	exit("ERROR: Could not find any files matching filename/pattern: {0}".format(f))
	else:
	cond_input.extend(globbed)

	config["data"][condition] = list(set(cond_input)) #remove duplicates
	input_files.extend(config['data'][condition])

	#Flatfiles independent from experimental data
	OUTPUTDIR = config['run_info']["output"]
	FASTA = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['fasta']))
	BLACKLIST = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['blacklist']))
	GTF = os.path.join(OUTPUTDIR, "flatfiles", os.path.basename(config['run_info']['gtf']))

	#---------- Test that input files exist -----------#

	input_files.extend([config['run_info']['fasta'], config['run_info']['blacklist'], config['run_info']['gtf']])
	for file in input_files:
	if file != None:
	full_path = os.path.abspath(file)
	if not os.path.exists(full_path):
	exit("ERROR: The following file given in config does not exist: {0}".format(full_path))

	#--------------------------------- MOTIFS ------------------------------#

	#If not list, make it list and glob elements
	if not isinstance(config['run_info']['motifs'], list):
	config['run_info']['motifs'] = [config['run_info']['motifs']]
	motif_input = sum([glob.glob(element) for element in config['run_info']['motifs']], [])

	#Test if input is directory or file
	motif_files = []
	for path in motif_input:

	#If input is dir; fetch all input files
	if os.path.isdir(path):
	files = os.listdir(path)
	motif_files.extend([os.path.join(path, f) for f in files])

	#If input is file, add to list of files
	elif os.path.isfile(path):
	motif_files.append(path)

	motif_files = list(set(motif_files)) #remove duplicates
	config['run_info']['motifs'] = sorted(motif_files)

	#Identify IDS of motifs
	MOTIF_FILES = {}
	for file in motif_files:
	full_file = file
	with open(full_file) as f:
	for line in f:
	if line.startswith("MOTIF"):
	columns = line.rstrip().split()
	ID = columns[2] + "_" + columns[1]
	ID = filafy(ID)
	elif line.startswith(">"):
	columns = line.replace(">", "").rstrip().split()
	ID = columns[1] + "_" + columns[0]
	ID = filafy(ID)
	else:
	continue #no motif names on this line; read next line
	MOTIF_FILES[ID] = full_file

	TF_IDS = list(MOTIF_FILES.keys())

	#-------------------------------------------------------------------------------#
	#------------------------ WHICH FILES SHOULD BE CREATED? -----------------------#
	#-------------------------------------------------------------------------------#

	output_files = []

	id2bam = {condition:{} for condition in CONDITION_IDS}
	for condition in CONDITION_IDS:
	config_bams = config['data'][condition]
	sampleids = [os.path.splitext(os.path.basename(bam))[0] for bam in config_bams]
	id2bam[condition] = {sampleids[i]:config_bams[i] for i in range(len(sampleids))} # Link sample ids to bams

	PLOTNAMES = expand("{condition}_{plotname}", condition=CONDITION_IDS, plotname=["aggregate"])
	if len(CONDITION_IDS) > 1:
	PLOTNAMES.extend(["heatmap_comparison", "aggregate_comparison_all", "aggregate_comparison_bound"])
	if len(CONDITION_IDS) < 5: #Only show venns for 4 conditions or less
	PLOTNAMES.append("venn_diagram")


	output_files.append(os.path.join(OUTPUTDIR, "config.yaml"))
	output_files.append(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS))
	output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed"), condition=CONDITION_IDS))

	#Visualization
	output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{plotname}.pdf"), plotname=PLOTNAMES))
	output_files.append(os.path.join(OUTPUTDIR, "overview", "TF_changes.pdf"))

	#Wilson
	#output_files.extend(expand(os.path.join(OUTPUTDIR, "wilson", "data", "{TF}_overview.clarion"), TF=TF_IDS))
	output_files.append(os.path.join(OUTPUTDIR, "wilson", "HOW_TO_WILSON.txt"))


	#-------------------------------------------------------------------------------#
	#---------------------------------- RUN :-) ------------------------------------#
	#-------------------------------------------------------------------------------#

	include: os.path.join(snakefiles_dir, "preprocessing.snake")
	include: os.path.join(snakefiles_dir, "footprinting.snake")
	include: os.path.join(snakefiles_dir, "visualization.snake")
	include: os.path.join(snakefiles_dir, "wilson.snake")

	rule all:
	input:
	output_files
	message: "Rule all"