# Snakefile

"""
Upper level TOBIAS snake
"""

import glob
import itertools
import os
import subprocess
import sys

#Set config
#Load the configfile stored in workflow.overwrite_configfile if one is set, otherwise 'TOBIAS.config'.
#CONFIGFILE always holds the name of the file that is actually loaded, so that the
#error messages further down point to the right file (previously it became the
#string "None" whenever the default configfile was used).
if workflow.overwrite_configfile is not None:
	CONFIGFILE = str(workflow.overwrite_configfile)
else:
	CONFIGFILE = 'TOBIAS.config'
configfile: CONFIGFILE

#Snake modules used to setup run
include: "snakefiles/helper.snake"
#shell.prefix("")

#-------------------------------------------------------------------------------#
#------------------------- CHECK FORMAT OF CONFIG FILE -------------------------#
#-------------------------------------------------------------------------------#

#Key paths which must be present in the configfile; each tuple is a path of nested keys
required = [
	("data",),
	("run_info",),
	("run_info", "organism"),
	("run_info", "fasta"),
	("run_info", "blacklist"),
	("run_info", "gtf"),
	("run_info", "motifs"),
	("run_info", "output"),
]

#Check if all keys are existing and contain information
for key_list in required:
	lookup_dict = config
	for key in key_list:
		try:
			lookup_dict = lookup_dict[key]
		except (KeyError, TypeError):
			#KeyError: the key is absent. TypeError: the parent entry is empty (None) or not a mapping.
			#Only these two are caught so that unrelated errors are not hidden behind this message.
			print("ERROR: Could not find key(s) \"{0}\" in configfile {1}. Please check that your configfile has right format for TOBIAS.".format(":".join(key_list), CONFIGFILE))
			sys.exit(1)	#non-zero status: this is a failure

		#An existing but empty entry is reported; the run is not aborted here
		if lookup_dict is None:
			print("ERROR: Missing input for key {0}".format(key_list))

#Check if there is at least one condition with bamfiles
#Truthiness is tested instead of len() so that an empty (None) entry is reported rather than raising TypeError
if config["data"]:
	for condition in config["data"]:
		#A condition without bamfiles is reported; the run is not aborted here
		if not config["data"][condition]:
			print("ERROR: Could not find any bamfiles in \"{0}\" in configfile {1}".format(":".join(("data", condition)), CONFIGFILE))
else:
	#Literal braces have to be doubled for str.format; the previous backslash-escaping left
	#"{condition\}" as a replacement field, so this line raised KeyError instead of printing
	print("ERROR: Could not find any conditions (\"data:{{condition}}\") in configfile {0}".format(CONFIGFILE))
	sys.exit(1)	#non-zero status: this is a failure

#-------------------------------------------------------------------------------#
#------------------------- WHICH FILES/INFO WERE INPUT? ------------------------#
#-------------------------------------------------------------------------------#

#All input files named in the config; their existence is tested further down
input_files = []

#Files related to experimental data (bam)
#Each condition may be given as a single path/pattern or as a list of them. Every entry
#is expanded with glob; entries matching no existing file expand to nothing.
CONDITION_IDS = list(config["data"].keys())
for condition in CONDITION_IDS:
	patterns = config["data"][condition]
	if not isinstance(patterns, list):
		patterns = [patterns]
	matches = itertools.chain.from_iterable(glob.glob(pattern) for pattern in patterns)	#flat stream of matched files

	#Remove duplicates and sort, so that the order of bamfiles is identical between runs
	#(the iteration order of a set of strings differs from process to process)
	config["data"][condition] = sorted(set(matches))
	input_files.extend(config["data"][condition])

#Flatfiles independent from experimental data (run_info)
FASTA = config['run_info']['fasta']
BLACKLIST = config['run_info']['blacklist']
GTF = config['run_info']['gtf']
OUTPUTDIR = config['run_info']["output"]
#MOTIFS = config['run_info']['motifs']

input_files.extend([FASTA, BLACKLIST, GTF])

#---------- Test that input files exist -----------#
for file in input_files:
	#Entries left empty in the config are None and are skipped
	if file is None:
		continue

	full_path = os.path.abspath(file)
	if not os.path.exists(full_path):
		#sys.exit with a message prints it to stderr and exits with status 1
		#(same effect as the interactive helper exit(), but available in every interpreter)
		sys.exit("ERROR: The following file given in config does not exist: {0}".format(full_path))


#--------------------------------- MOTIFS ------------------------------#

#If not list, make it list and glob elements
if not isinstance(config['run_info']['motifs'], list):
	config['run_info']['motifs'] = [config['run_info']['motifs']]
motif_input = list(itertools.chain.from_iterable(glob.glob(element) for element in config['run_info']['motifs']))

#Test if input is directory or file
motif_files = []
for path in motif_input:

	#If input is dir; fetch all files directly inside it
	#Sub-directories are skipped, as they cannot be opened as motif files further down
	if os.path.isdir(path):
		files = os.listdir(path)
		for candidate in (os.path.join(path, f) for f in files):
			if os.path.isfile(candidate):
				motif_files.append(candidate)

	#If input is file, add to list of files
	elif os.path.isfile(path):
		motif_files.append(path)

#Remove duplicates and sort, so that motifs are processed in the same order in every run
motif_files = sorted(set(motif_files))
config['run_info']['motifs'] = list(motif_files)	#separate copy of the sorted list

#Identify IDS of motifs
#MOTIF_FILES maps motif ID -> motif file. The ID is built from header lines:
#  "MOTIF <a> <b> ..." gives "<b>_<a>"
#  "><a> <b> ..."      gives "<b>_<a>"
#and is passed through filafy(), which is not defined in this file
#(presumably in snakefiles/helper.snake, making the ID usable in filenames - TODO confirm).
#Each file contributes one entry: if it contains several headers, the last one wins.
MOTIF_FILES = {}
for file in motif_files:
	full_file = file

	#Reset per file; otherwise a file without any header would silently reuse the ID of the
	#previous file (remapping that motif to the wrong file) or raise NameError for the first file
	ID = None
	with open(full_file) as f:
		for line in f:
			if line.startswith("MOTIF"):
				columns = line.rstrip().split()
				ID = filafy(columns[2] + "_" + columns[1])
			elif line.startswith(">"):
				columns = line.replace(">", "").rstrip().split()
				ID = filafy(columns[1] + "_" + columns[0])

	if ID is None:
		print("WARNING: No motif header (\"MOTIF\" or \">\") found in file {0} - skipping this file".format(full_file))
		continue
	MOTIF_FILES[ID] = full_file

TF_IDS = list(MOTIF_FILES.keys())

#-------------------------------------------------------------------------------#
#------------------------ WHICH FILES SHOULD BE CREATED? -----------------------#
#-------------------------------------------------------------------------------#

#Flat list of all files which the workflow should create (used as input of rule all)
output_files = []

#Link sample ids to bams for each condition
#The sample id is the basename of the bam without its last extension; bams of one
#condition sharing the same basename end up under the same id (the last one wins)
id2bam = {}
for condition in CONDITION_IDS:
	config_bams = config['data'][condition]
	sampleids = [os.path.splitext(os.path.basename(bam))[0] for bam in config_bams]
	id2bam[condition] = dict(zip(sampleids, config_bams))

#One aggregate plot per condition; comparison plots only if there is something to compare
PLOTNAMES = expand("{condition}_{plotname}", condition=CONDITION_IDS, plotname=["aggregate"])
if len(CONDITION_IDS) > 1:
	PLOTNAMES.extend(["heatmap_comparison", "aggregate_comparison_all", "aggregate_comparison_bound"])

output_files.append(os.path.join(OUTPUTDIR, "config.yaml"))
#extend (not append): expand() returns a list, and output_files is kept flat like for all other targets
output_files.extend(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS))

#output_files.append(os.path.join(OUTPUTDIR, "TFBS", "bindetect_results.txt"))
#output_files.append(os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt"))
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed"), condition=CONDITION_IDS))

#Visualization
output_files.extend(expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_{plotname}.pdf"), TF=TF_IDS, plotname=PLOTNAMES))
output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{plotname}.pdf"), plotname=PLOTNAMES))
output_files.append(os.path.join(OUTPUTDIR, "overview", "TF_changes.pdf"))

#Wilson
output_files.extend(expand(os.path.join(OUTPUTDIR, "wilson", "data", "{TF}_overview.clarion"), TF=TF_IDS))
output_files.append(os.path.join(OUTPUTDIR, "wilson", "HOW_TO_WILSON.txt"))

#-------------------------------------------------------------------------------#
#------------------------ DEAL WITH SPECIAL ENVIRONMENTS -----------------------#
#-------------------------------------------------------------------------------#

#Names of the existing conda environments: first column of every non-empty line of "conda env list"
#(header lines starting with "#" contribute the entry "#", which matches no environment name)
sys_env = subprocess.check_output(['conda', 'env', 'list'], universal_newlines=True)
env_list = [columns[0] for columns in (line.split() for line in sys_env.split("\n")) if len(columns) > 0]

#Environments needed by the workflow: (environment name, label used in messages, environment file)
#The environment files are given as relative paths
required_envs = [
	("TOBIAS_ENV", "TOBIAS", "environments/tobias.yaml"),	# default TOBIAS environment
	("MACS_ENV", "macs", "environments/macs.yaml"),			# python 2 related envs
]

for env_name, env_label, env_file in required_envs:
	if env_name not in env_list:
		print("Creating {0} environment for the first time".format(env_label))
		returncode = subprocess.call(["conda", "env", "create", "--file", env_file])

		#Report a failed creation instead of ignoring it; the run is not aborted here
		if returncode != 0:
			print("ERROR: Creation of {0} environment from {1} failed (conda exit status {2})".format(env_label, env_file, returncode))


#-------------------------------------------------------------------------------#
#---------------------------------- RUN :-) ------------------------------------#
#-------------------------------------------------------------------------------#

include: "snakefiles/preprocessing.snake"
include: "snakefiles/footprinting.snake"
include: "snakefiles/visualization.snake"
include: "snakefiles/wilson.snake"

rule all:
	input:
		output_files
	message: "Rule all"
	"""
	Upper level TOBIAS snake
	"""

	import os
	import subprocess
	import itertools
	import glob

	#Set config
	if workflow.overwrite_configfile != None:
	configfile: str(workflow.overwrite_configfile)
	else:
	configfile: 'TOBIAS.config'
	CONFIGFILE = str(workflow.overwrite_configfile)

	#Snake modules used to setup run
	include: "snakefiles/helper.snake"
	#shell.prefix("")

	#-------------------------------------------------------------------------------#
	#------------------------- CHECK FORMAT OF CONFIG FILE -------------------------#
	#-------------------------------------------------------------------------------#

	required = [("data",),
	("run_info",),
	("run_info", "organism"),
	("run_info", "fasta"),
	("run_info", "blacklist"),
	("run_info", "gtf"),
	("run_info", "motifs"),
	("run_info", "output"),
	]

	#Check if all keys are existing and contain information
	for key_list in required:
	lookup_dict = config
	for key in key_list:
	try:
	lookup_dict = lookup_dict[key]
	if lookup_dict == None:
	print("ERROR: Missing input for key {0}".format(key_list))
	except:
	print("ERROR: Could not find key(s) \"{0}\" in configfile {1}. Please check that your configfile has right format for TOBIAS.".format(":".join(key_list), CONFIGFILE))
	sys.exit()

	#Check if there is at least one condition with bamfiles
	if len(config["data"]) > 0:
	for condition in config["data"]:
	if len(config["data"][condition]) == 0:
	print("ERROR: Could not find any bamfiles in \"{0}\" in configfile {1}".format(":".join(("data", condition)), CONFIGFILE))
	else:
	print("ERROR: Could not find any conditions (\"data:\{condition\}\") in configfile {0}".format(CONFIGFILE))
	sys.exit()

	#-------------------------------------------------------------------------------#
	#------------------------- WHICH FILES/INFO WERE INPUT? ------------------------#
	#-------------------------------------------------------------------------------#

	input_files = []

	#Files related to experimental data (bam)
	CONDITION_IDS = list(config["data"].keys())
	for condition in CONDITION_IDS:
	if not isinstance(config["data"][condition], list):
	config['data'][condition] = [config['data'][condition]]
	config["data"][condition] = sum([glob.glob(f) for f in config["data"][condition]], []) #make flat list
	config["data"][condition] = list(set(config["data"][condition])) #remove duplicates
	input_files.extend(config['data'][condition])

	#Flatfiles independent from experimental data (run_info)
	FASTA = config['run_info']['fasta']
	BLACKLIST = config['run_info']['blacklist']
	GTF = config['run_info']['gtf']
	OUTPUTDIR = config['run_info']["output"]
	BLACKLIST = config['run_info']['blacklist']
	#MOTIFS = config['run_info']['motifs']

	input_files.extend([FASTA, BLACKLIST, GTF])

	#---------- Test that input files exist -----------#
	for file in input_files:
	if file != None:
	full_path = os.path.abspath(file)
	if not os.path.exists(full_path):
	exit("ERROR: The following file given in config does not exist: {0}".format(full_path))


	#--------------------------------- MOTIFS ------------------------------#

	#If not list, make it list and glob elements
	if not isinstance(config['run_info']['motifs'], list):
	config['run_info']['motifs'] = [config['run_info']['motifs']]
	motif_input = sum([glob.glob(element) for element in config['run_info']['motifs']], [])

	#Test if input is directory or file
	motif_files = []
	for path in motif_input:

	#If input is dir; fetch all input files
	if os.path.isdir(path):
	files = os.listdir(path)
	motif_files.extend([os.path.join(path, f) for f in files])

	#If input is file, add to list of files
	elif os.path.isfile(path):
	motif_files.append(path)

	motif_files = list(set(motif_files)) #remove duplicates
	config['run_info']['motifs'] = sorted(motif_files)

	#Identify IDS of motifs
	MOTIF_FILES = {}
	for file in motif_files:
	full_file = file
	with open(full_file) as f:
	for line in f:
	if line.startswith("MOTIF"):
	columns = line.rstrip().split()
	ID = columns[2] + "_" + columns[1]
	ID = filafy(ID)
	elif line.startswith(">"):
	columns = line.replace(">", "").rstrip().split()
	ID = columns[1] + "_" + columns[0]
	ID = filafy(ID)
	MOTIF_FILES[ID] = full_file

	TF_IDS = list(MOTIF_FILES.keys())

	#-------------------------------------------------------------------------------#
	#------------------------ WHICH FILES SHOULD BE CREATED? -----------------------#
	#-------------------------------------------------------------------------------#

	output_files = []

	id2bam = {condition:{} for condition in CONDITION_IDS}
	for condition in CONDITION_IDS:
	config_bams = config['data'][condition]
	sampleids = [os.path.splitext(os.path.basename(bam))[0] for bam in config_bams]
	id2bam[condition] = {sampleids[i]:config_bams[i] for i in range(len(sampleids))} # Link sample ids to bams

	PLOTNAMES = expand("{condition}_{plotname}", condition=CONDITION_IDS, plotname=["aggregate"])
	if len(CONDITION_IDS) > 1:
	PLOTNAMES.extend(["heatmap_comparison", "aggregate_comparison_all", "aggregate_comparison_bound"])

	output_files.append(os.path.join(OUTPUTDIR, "config.yaml"))
	output_files.append(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS))

	#output_files.append(os.path.join(OUTPUTDIR, "TFBS", "bindetect_results.txt"))
	#output_files.append(os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt"))
	output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed"), condition=CONDITION_IDS))

	#Visualization
	output_files.extend(expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_{plotname}.pdf"), TF=TF_IDS, plotname=PLOTNAMES))
	output_files.extend(expand(os.path.join(OUTPUTDIR, "overview", "all_{plotname}.pdf"), plotname=PLOTNAMES))
	output_files.append(os.path.join(OUTPUTDIR, "overview", "TF_changes.pdf"))

	#Wilson
	output_files.extend(expand(os.path.join(OUTPUTDIR, "wilson", "data", "{TF}_overview.clarion"), TF=TF_IDS))
	output_files.append(os.path.join(OUTPUTDIR, "wilson", "HOW_TO_WILSON.txt"))

	#-------------------------------------------------------------------------------#
	#------------------------ DEAL WITH SPECIAL ENVIRONMENTS -----------------------#
	#-------------------------------------------------------------------------------#

	sys_env = subprocess.check_output(['conda', 'env', 'list'], universal_newlines=True)
	env_list = [line.split()[0] for line in sys_env.split("\n") if len(line.split()) > 0]

	# default TOBIAS environment
	if "TOBIAS_ENV" not in env_list:
	print("Creating TOBIAS environment for the first time")
	subprocess.call(["conda", "env", "create", "--file", "environments/tobias.yaml"])

	# python 2 related envs
	if "MACS_ENV" not in env_list:
	print("Creating macs environment for the first time")
	subprocess.call(["conda", "env", "create", "--file", "environments/macs.yaml"])


	#-------------------------------------------------------------------------------#
	#---------------------------------- RUN :-) ------------------------------------#
	#-------------------------------------------------------------------------------#

	include: "snakefiles/preprocessing.snake"
	include: "snakefiles/footprinting.snake"
	include: "snakefiles/visualization.snake"
	include: "snakefiles/wilson.snake"

	rule all:
	input:
	output_files
	message: "Rule all"