From 5ccee944f2628c62634ec3c0fc50a5f54bebc5e0 Mon Sep 17 00:00:00 2001 From: msbentsen Date: Wed, 5 Feb 2020 12:34:30 +0100 Subject: [PATCH] Added filtering of peaks not in input fasta --- snakefiles/footprinting.snake | 21 ++++++++++++++------- snakefiles/preprocessing.snake | 23 ++++++++++++++++++----- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/snakefiles/footprinting.snake b/snakefiles/footprinting.snake index de71c2d..fce7140 100644 --- a/snakefiles/footprinting.snake +++ b/snakefiles/footprinting.snake @@ -21,7 +21,8 @@ rule format_motifs: rules.copy_motifs.output #this is a directory output: joined = os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt") - priority: 2 + priority: + 2 params: motifdir = os.path.join(OUTPUTDIR, "motifs", "individual") log: @@ -46,8 +47,10 @@ rule atacorrect: "--outdir " + os.path.join(OUTPUTDIR, "bias_correction"), "--prefix " + "{condition}", config.get("atacorrect", "") - priority: 2 - threads: 99 #unless there are more than 99 cores, this rule will run on max threads + priority: + 2 + threads: + 99 #unless there are more than 99 cores, this rule will run on max threads log: os.path.join(OUTPUTDIR, "logs", "{condition}_atacorrect.log") message: @@ -66,8 +69,10 @@ rule footprinting: footprints = os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), params: config.get("footprinting", "") - priority: 2 - threads: 99 + priority: + 2 + threads: + 99 log: os.path.join(OUTPUTDIR, "logs", "{condition}_footprinting.log") message: @@ -129,8 +134,10 @@ rule join_bound: output: unsorted = temp(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.tmp")), final = os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed") - priority: 2 - threads: 99 #Make sure that only one join_bound runs at a time; due to memory issues + priority: + 2 + threads: + 99 #Make sure that only one join_bound runs at a time; due to memory issues run: n = 100 #chunks of 100 files diff --git a/snakefiles/preprocessing.snake b/snakefiles/preprocessing.snake index ab9dda8..1499f19 100644 --- a/snakefiles/preprocessing.snake +++ b/snakefiles/preprocessing.snake @@ -16,6 +16,16 @@ rule copy_flatfiles: "cp {input.blacklist} {output.blacklist};" "cp {input.gtf} {output.gtf}" +#Get chromosomes available in fasta +rule get_fasta_chroms: + input: + rules.copy_flatfiles.output.fasta + output: + txt = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.txt"), + bed = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.bed") + shell: + "faidx {input} -i chromsizes > {output.txt};" + "awk '{{ print $1\"\t\"0\"\t\"$2 }}' {output.txt} > {output.bed}" # ------------------------------------------------------------------------------------------------------------------ # # ---------------------------------------------- Config processing ------------------------------------------------- # @@ -98,13 +108,16 @@ rule macs: rule process_peaks: input: peaks = lambda wildcards: [os.path.join(OUTPUTDIR, "peak_calling", wildcards.condition, sample_id + "_raw.bed") for sample_id in id2bam[wildcards.condition].keys()], - blacklisted = BLACKLIST + blacklist = BLACKLIST, + whitelist = rules.get_fasta_chroms.output.bed output: peaks = os.path.join(OUTPUTDIR, "peak_calling", "{condition}_union.bed") message: "Processing peaks from condition {wildcards.condition}" shell: - "cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | bedtools subtract -a - -b {input.blacklisted} -A | " - "awk '$1 !~ /[M]/' | " + "cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | " + "bedtools subtract -a - -b {input.blacklist} -A | " + "bedtools intersect -a - -b {input.whitelist} -wa | " + "awk '$1 !~ /[M]/' | " #exclude chromosome M "awk '{{print $s\"\\t{wildcards.condition}\"}}' > {output.peaks}" #add condition name to each peak # Union peaks across all conditions @@ -118,10 +131,10 @@ rule merge_condition_peaks: shell: "cat {input} | sort -k1,1 -k2,2n | bedtools merge -d 5 -c 4 -o distinct > {output}" -#Get correct sorting of peak_names for +#Get correct sorting of peak_names rule sort_peak_names: input: - rules.merge_condition_peaks.output #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.tmp") + rules.merge_condition_peaks.output output: peaks = os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed") run: