From 5ccee944f2628c62634ec3c0fc50a5f54bebc5e0 Mon Sep 17 00:00:00 2001
From: msbentsen <mette.bentsen@mpi-bn.mpg.de>
Date: Wed, 5 Feb 2020 12:34:30 +0100
Subject: [PATCH] Added filtering of peaks not in input fasta

---
 snakefiles/footprinting.snake  | 21 ++++++++++++++-------
 snakefiles/preprocessing.snake | 23 ++++++++++++++++++-----
 2 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/snakefiles/footprinting.snake b/snakefiles/footprinting.snake
index de71c2d..fce7140 100644
--- a/snakefiles/footprinting.snake
+++ b/snakefiles/footprinting.snake
@@ -21,7 +21,8 @@ rule format_motifs:
 		rules.copy_motifs.output 	#this is a directory 
 	output:
 		joined = os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt") 
-	priority: 2
+	priority: 
+		2
 	params:
 		motifdir = os.path.join(OUTPUTDIR, "motifs", "individual")
 	log: 
@@ -46,8 +47,10 @@ rule atacorrect:
 		"--outdir " + os.path.join(OUTPUTDIR, "bias_correction"),
 		"--prefix " + "{condition}",
 		config.get("atacorrect", "")
-	priority: 2
-	threads: 99 	#unless there are more than 99 cores, this rule will run on max threads
+	priority: 
+		2
+	threads: 
+		99 	#unless there are more than 99 cores, this rule will run on max threads
 	log: 
 		os.path.join(OUTPUTDIR, "logs", "{condition}_atacorrect.log")
 	message: 
@@ -66,8 +69,10 @@ rule footprinting:
 		footprints = os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"),
 	params:
 		config.get("footprinting", "")
-	priority: 2
-	threads: 99
+	priority: 
+		2
+	threads: 
+		99
 	log:
 		os.path.join(OUTPUTDIR, "logs", "{condition}_footprinting.log")
 	message:
@@ -129,8 +134,10 @@ rule join_bound:
 	output:
 		unsorted = temp(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.tmp")),
 		final = os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed")
-	priority: 2
-	threads: 99	#Make sure that only one join_bound runs at a time; due to memory issues
+	priority: 
+		2
+	threads: 
+		99	#Make sure that only one join_bound runs at a time; due to memory issues
 	run:
 		n = 100 	#chunks of 100 files
 		
diff --git a/snakefiles/preprocessing.snake b/snakefiles/preprocessing.snake
index ab9dda8..1499f19 100644
--- a/snakefiles/preprocessing.snake
+++ b/snakefiles/preprocessing.snake
@@ -16,6 +16,16 @@ rule copy_flatfiles:
 		"cp {input.blacklist} {output.blacklist};"
 		"cp {input.gtf} {output.gtf}"
 
+#Get chromosomes available in fasta
+rule get_fasta_chroms:
+	input:
+		rules.copy_flatfiles.output.fasta
+	output:
+		txt = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.txt"),
+		bed = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.bed")
+	shell:
+		"faidx {input} -i chromsizes > {output.txt};"
+		"awk '{{ print $1\"\t\"0\"\t\"$2 }}' {output.txt} > {output.bed}"
 
 # ------------------------------------------------------------------------------------------------------------------ #
 # ---------------------------------------------- Config processing ------------------------------------------------- #
@@ -98,13 +108,16 @@ rule macs:
 rule process_peaks:
 	input: 
 		peaks = lambda wildcards: [os.path.join(OUTPUTDIR, "peak_calling", wildcards.condition, sample_id + "_raw.bed") for sample_id in id2bam[wildcards.condition].keys()],
-		blacklisted = BLACKLIST
+		blacklist = BLACKLIST,
+		whitelist = rules.get_fasta_chroms.output.bed
 	output: 
 		peaks = os.path.join(OUTPUTDIR, "peak_calling", "{condition}_union.bed")
 	message: "Processing peaks from condition {wildcards.condition}"
 	shell:
-		"cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | bedtools subtract -a - -b {input.blacklisted} -A | "
-		"awk '$1 !~ /[M]/' | "
+		"cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | "
+		"bedtools subtract -a - -b {input.blacklist} -A | "
+		"bedtools intersect -a - -b {input.whitelist} -wa | "
+		"awk '$1 !~ /[M]/' | "	#exclude chromosome M
 		"awk '{{print $s\"\\t{wildcards.condition}\"}}' > {output.peaks}"	#add condition name to each peak
 
 # Union peaks across all conditions
@@ -118,10 +131,10 @@ rule merge_condition_peaks:
 	shell:
 		"cat {input} | sort -k1,1 -k2,2n | bedtools merge -d 5 -c 4 -o distinct > {output}"
 
-#Get correct sorting of peak_names for 
+#Get correct sorting of peak_names
 rule sort_peak_names:
 	input: 
-		rules.merge_condition_peaks.output 		#os.path.join(OUTPUTDIR, "peak_calling", "all_merged.tmp")
+		rules.merge_condition_peaks.output
 	output:
 		peaks = os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed")
 	run: