footprinting.snake

# vim: syntax=python tabstop=4 expandtab
# coding: utf-8

#--------------------------------------------------------------------------------------------------------#

#Copy motif files to folder
rule copy_motifs:
	input:
		config["run_info"]["motifs"] 	#list of files
	output:
		[os.path.join(OUTPUTDIR, "motifs", "individual", os.path.basename(f)) for f in config["run_info"]["motifs"]]
	params:
		outdir = os.path.join(OUTPUTDIR, "motifs", "individual")
	run:
		for f in input:
			shell("cp {0} {1}".format(f, params.outdir))

#Format motifs to one file
rule format_motifs:
	input:
		rules.copy_motifs.output 	#this is a directory
	output:
		joined = os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt")
	priority:
		2
	params:
		motifdir = os.path.join(OUTPUTDIR, "motifs", "individual")
	log:
		os.path.join(OUTPUTDIR, "logs", "format_motifs.log")
	shell:
		"TOBIAS FormatMotifs --input {params.motifdir} --format pfm --task join --output {output} &>> {log}"

#--------------------------------------------------------------------------------------------------------#
#Correct reads for Tn5 sequence bias
rule atacorrect:
	input:
		bam = rules.conditionbam.output.bam, 	#os.path.join(OUTPUTDIR, "mapping", "{condition}.bam"),
		peaks = rules.sort_peak_names.output, 	#os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed"),
		genome = FASTA
	output:
		uncorrected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_uncorrected.bw"),
		bias = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_bias.bw"),
		expected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_expected.bw"),
		corrected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_corrected.bw"),
	params:
		"--blacklist " + BLACKLIST if BLACKLIST != "" else "",
		"--outdir " + os.path.join(OUTPUTDIR, "bias_correction"),
		"--prefix " + "{condition}",
		config.get("atacorrect", "")
	priority:
		2
	threads:
		99 	#unless there are more than 99 cores, this rule will run on max threads
	log:
		os.path.join(OUTPUTDIR, "logs", "{condition}_atacorrect.log")
	message:
		"Running ATACorrect for condition {wildcards.condition} ({input.bam})"
	shell:
		"TOBIAS ATACorrect -b {input.bam} -g {input.genome} -p {input.peaks} --cores {threads} {params} &> {log}"


#--------------------------------------------------------------------------------------------------------#
#Calculate footprint scores per condition
rule footprinting:
	input:
		signal = rules.atacorrect.output.corrected, 	#os.path.join(OUTPUTDIR, "bias_correction", "{condition}_corrected.bw"),
		regions = rules.sort_peak_names.output		#os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed")
	output:
		footprints = os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"),
	params:
		config.get("footprinting", "")
	priority:
		2
	threads:
		99
	log:
		os.path.join(OUTPUTDIR, "logs", "{condition}_footprinting.log")
	message:
		"Running footprinting for condition {wildcards.condition} ({input.signal})"
	shell:
		"TOBIAS FootprintScores --signal {input.signal} --regions {input.regions} --output {output.footprints} --cores {threads} {params} &> {log}"


#--------------------------------------------------------------------------------------------------------#
#Estimate bound sites from scored file
checkpoint bindetect:
	input:
		motifs = rules.format_motifs.output, 		#os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt"),
		footprints = expand(rules.footprinting.output.footprints, condition=CONDITION_IDS), #os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS),
		genome = FASTA,
		peaks = rules.uropa.output.peaks,			#os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.bed"),
		peak_header = rules.uropa.output.header 	#os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_header.txt")
	output:
		directory(os.path.join(OUTPUTDIR, "TFBS"))
		#os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt")
		#bound_files = dynamic(expand(os.path.join(OUTPUTDIR, "TFBS", "{{TF}}", "beds", "{{TF}}_{condition}_bound.bed"), condition=CONDITION_IDS)), #, TF={TF}"),
		#unbound_files = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_{condition}_unbound.bed"), condition=CONDITION_IDS, TF=TF_IDS),
		#all_files = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_all.bed"), TF=TF_IDS),
		#overview = dynamic(expand(os.path.join(OUTPUTDIR, "TFBS", "{{TF}}", "{{TF}}_overview.txt"))),
		#figures = os.path.join(OUTPUTDIR, "TFBS", "bindetect_figures.pdf"),
		#local = [os.path.join(OUTPUTDIR, "TFBS", fil) for fil in ["bindetect_results.txt", "bindetect_results.xlsx"]],
	threads: 99
	priority: 2
	log:
		os.path.join(OUTPUTDIR, "logs", "bindetect.log")
	params:
		config.get("bindetect", ""),
		"--cond_names " + " ".join(CONDITION_IDS),
	message:
		"Running BINDetect"
	shell:
		"TOBIAS BINDetect --motifs {input.motifs} --signals {input.footprints} --genome {input.genome} "
		"--peaks {input.peaks} --peak_header {input.peak_header} --cores {threads} --outdir {output} {params} &> {log}; "

		"mkdir -p " + os.path.join(OUTPUTDIR, "overview") + ";"
		"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.txt") + " " + os.path.join(OUTPUTDIR, "overview") + ";"	#move files to overview
		"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.xlsx") + " " + os.path.join(OUTPUTDIR, "overview") + ";"
		"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.pdf") + " " + os.path.join(OUTPUTDIR, "overview") + ";"


#--------------------------------------------------------------------------------------------------------#

def get_TF_ids(wildcards):
	bindetect_output = checkpoints.bindetect.get(**wildcards).output[0]
	TF_IDS = glob_wildcards(os.path.join(bindetect_output, "{TF}", "beds")).TF	#wildcard for each TF from dir name
	return(TF_IDS)

#--------------------------------------------------------------------------------------------------------#

#Join bound estimates per condition
rule join_bound:
	input:
		lambda wildcards: expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_{{condition}}_bound.bed"), TF=get_TF_ids(wildcards)),
	output:
		unsorted = temp(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.tmp")),
		final = os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed")
	priority:
		2
	threads:
		99	#Make sure that only one join_bound runs at a time; due to memory issues
	run:
		n = 100 	#chunks of 100 files

		shell("> " + output[0])
		file_chunks = [input[i:i+n] for i in range(0,len(input),n)]
		for chunk in file_chunks:
			shell("cat {0} | bedtools sort >> {1}".format(" ".join(chunk), output.unsorted))
		shell("sort -k 1,1 -k2,2 -n {output.unsorted} > {output.final}")
		shell("igvtools index {output.final}")
	# vim: syntax=python tabstop=4 expandtab
	# coding: utf-8

	#--------------------------------------------------------------------------------------------------------#

	#Copy motif files to folder
	rule copy_motifs:
	input:
	config["run_info"]["motifs"] #list of files
	output:
	[os.path.join(OUTPUTDIR, "motifs", "individual", os.path.basename(f)) for f in config["run_info"]["motifs"]]
	params:
	outdir = os.path.join(OUTPUTDIR, "motifs", "individual")
	run:
	for f in input:
	shell("cp {0} {1}".format(f, params.outdir))

	#Format motifs to one file
	rule format_motifs:
	input:
	rules.copy_motifs.output #this is a directory
	output:
	joined = os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt")
	priority:
	2
	params:
	motifdir = os.path.join(OUTPUTDIR, "motifs", "individual")
	log:
	os.path.join(OUTPUTDIR, "logs", "format_motifs.log")
	shell:
	"TOBIAS FormatMotifs --input {params.motifdir} --format pfm --task join --output {output} &>> {log}"

	#--------------------------------------------------------------------------------------------------------#
	#Correct reads for Tn5 sequence bias
	rule atacorrect:
	input:
	bam = rules.conditionbam.output.bam, #os.path.join(OUTPUTDIR, "mapping", "{condition}.bam"),
	peaks = rules.sort_peak_names.output, #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed"),
	genome = FASTA
	output:
	uncorrected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_uncorrected.bw"),
	bias = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_bias.bw"),
	expected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_expected.bw"),
	corrected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_corrected.bw"),
	params:
	"--blacklist " + BLACKLIST if BLACKLIST != "" else "",
	"--outdir " + os.path.join(OUTPUTDIR, "bias_correction"),
	"--prefix " + "{condition}",
	config.get("atacorrect", "")
	priority:
	2
	threads:
	99 #unless there are more than 99 cores, this rule will run on max threads
	log:
	os.path.join(OUTPUTDIR, "logs", "{condition}_atacorrect.log")
	message:
	"Running ATACorrect for condition {wildcards.condition} ({input.bam})"
	shell:
	"TOBIAS ATACorrect -b {input.bam} -g {input.genome} -p {input.peaks} --cores {threads} {params} &> {log}"


	#--------------------------------------------------------------------------------------------------------#
	#Calculate footprint scores per condition
	rule footprinting:
	input:
	signal = rules.atacorrect.output.corrected, #os.path.join(OUTPUTDIR, "bias_correction", "{condition}_corrected.bw"),
	regions = rules.sort_peak_names.output #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed")
	output:
	footprints = os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"),
	params:
	config.get("footprinting", "")
	priority:
	2
	threads:
	99
	log:
	os.path.join(OUTPUTDIR, "logs", "{condition}_footprinting.log")
	message:
	"Running footprinting for condition {wildcards.condition} ({input.signal})"
	shell:
	"TOBIAS FootprintScores --signal {input.signal} --regions {input.regions} --output {output.footprints} --cores {threads} {params} &> {log}"


	#--------------------------------------------------------------------------------------------------------#
	#Estimate bound sites from scored file
	checkpoint bindetect:
	input:
	motifs = rules.format_motifs.output, #os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt"),
	footprints = expand(rules.footprinting.output.footprints, condition=CONDITION_IDS), #os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS),
	genome = FASTA,
	peaks = rules.uropa.output.peaks, #os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.bed"),
	peak_header = rules.uropa.output.header #os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_header.txt")
	output:
	directory(os.path.join(OUTPUTDIR, "TFBS"))
	#os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt")
	#bound_files = dynamic(expand(os.path.join(OUTPUTDIR, "TFBS", "{{TF}}", "beds", "{{TF}}_{condition}_bound.bed"), condition=CONDITION_IDS)), #, TF={TF}"),
	#unbound_files = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_{condition}_unbound.bed"), condition=CONDITION_IDS, TF=TF_IDS),
	#all_files = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_all.bed"), TF=TF_IDS),
	#overview = dynamic(expand(os.path.join(OUTPUTDIR, "TFBS", "{{TF}}", "{{TF}}_overview.txt"))),
	#figures = os.path.join(OUTPUTDIR, "TFBS", "bindetect_figures.pdf"),
	#local = [os.path.join(OUTPUTDIR, "TFBS", fil) for fil in ["bindetect_results.txt", "bindetect_results.xlsx"]],
	threads: 99
	priority: 2
	log:
	os.path.join(OUTPUTDIR, "logs", "bindetect.log")
	params:
	config.get("bindetect", ""),
	"--cond_names " + " ".join(CONDITION_IDS),
	message:
	"Running BINDetect"
	shell:
	"TOBIAS BINDetect --motifs {input.motifs} --signals {input.footprints} --genome {input.genome} "
	"--peaks {input.peaks} --peak_header {input.peak_header} --cores {threads} --outdir {output} {params} &> {log}; "

	"mkdir -p " + os.path.join(OUTPUTDIR, "overview") + ";"
	"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.txt") + " " + os.path.join(OUTPUTDIR, "overview") + ";" #move files to overview
	"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.xlsx") + " " + os.path.join(OUTPUTDIR, "overview") + ";"
	"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.pdf") + " " + os.path.join(OUTPUTDIR, "overview") + ";"


	#--------------------------------------------------------------------------------------------------------#

	def get_TF_ids(wildcards):
	bindetect_output = checkpoints.bindetect.get(**wildcards).output[0]
	TF_IDS = glob_wildcards(os.path.join(bindetect_output, "{TF}", "beds")).TF #wildcard for each TF from dir name
	return(TF_IDS)

	#--------------------------------------------------------------------------------------------------------#

	#Join bound estimates per condition
	rule join_bound:
	input:
	lambda wildcards: expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_{{condition}}_bound.bed"), TF=get_TF_ids(wildcards)),
	output:
	unsorted = temp(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.tmp")),
	final = os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed")
	priority:
	2
	threads:
	99 #Make sure that only one join_bound runs at a time; due to memory issues
	run:
	n = 100 #chunks of 100 files

	shell("> " + output[0])
	file_chunks = [input[i:i+n] for i in range(0,len(input),n)]
	for chunk in file_chunks:
	shell("cat {0} \| bedtools sort >> {1}".format(" ".join(chunk), output.unsorted))
	shell("sort -k 1,1 -k2,2 -n {output.unsorted} > {output.final}")
	shell("igvtools index {output.final}")