From 16110f50709239705bd36e76f31575bed37f1eea Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 20 Nov 2018 15:28:07 +0100 Subject: [PATCH 01/25] Add files via upload --- compareBed.sh | 183 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 compareBed.sh diff --git a/compareBed.sh b/compareBed.sh new file mode 100644 index 0000000..8e81434 --- /dev/null +++ b/compareBed.sh @@ -0,0 +1,183 @@ +#!/bin/bash +#data +#motifs +#workdir +#fasta +#min +#max +#output +wrong=() +da=false +mo=false +wo=false +fa=false +mi=false +ma=false +ou=false +he=false + +if [ $# -eq 0 ] +then + he=true +fi + +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + -d|--data) + data="$2" + da=true + shift + shift + ;; + -m|--motifs) + motifs="$2" + mo=true + shift + shift + ;; + -w|--workdir) + workdir="$2" + wo=true + shift + shift + ;; + -f|--fasta) + fasta="$2" + fa=true + shift + shift + ;; + -min|--min) + min=$2 + mi=true + shift + shift + ;; + -max|--max) + max=$2 + ma=true + shift + shift + ;; + -o|--output) + output="$2" + ou=true + shift + shift + ;; + -h|--help) + help=true + he=true + shift + ;; + *) + wrong+=("$1") + shift + ;; +esac +done + +count=${#wrong[@]} +if [ $count -gt 0 ] +then + for i in ${wrong[@]} + do + echo wrong parameter $i + echo call script without parameters for help or call --help + echo exit + done +exit 1 +fi + +if [ $he == true ] +then + echo "This script utilies bedtools to select new footprints from data." + echo "Therefore the data is compared with known footprint motifs." + echo "The output is a new .bed file with added sequence information." + echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." + echo "If a parameter is chosen, a value must be provided or an error will occur." + echo "--------------------" + echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." + echo "--------------------" + echo " required parameter:" + echo " -d --data the path to the .bed file of the footprints" + echo " -m --motifs the path to the .bed file of the scanned motifs" + echo " -f --fasta the path to the .fasta file of genome" + echo " " + echo " optional parameter:" + echo " -w --workdir the path to directory where temporary files will be created" + echo " default is a new directory that is created in the current directory" + echo " -min --min minimum size of footprints; default is 10" + echo " -max --max maximum size of footprints; default is 60" + echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" + echo " -h --help shows this help message" +exit 0 +fi + +echo selected parameters +echo ------------------- +echo data: $da \(required\) +echo motifs: $mo \(required\) +echo workdir: $wo +echo fasta: $fa \(required\) +echo min: $mi +echo max: $ma +echo output: $ou +echo help: $he + +if [ $da == false ] || [ $mo == false ] || [ $fa == false ] +then + echo required parameters not given. + echo required are: --data \ --motifs \ --fasta \ + exit 1 +fi + +if [ $wo == false ] +then + if [ ! -d workdir1337 ] + then + mkdir workdir1337 + fi + wo=true + workdir="workdir1337" +fi + +if [ $ou == false ] +then + output="newFootprints.bed" + ou=true +fi + +if [ $mi == false ] +then + min=10 + mi=true +fi + +if [ $ma == false ] +then + max=60 + ma=true +fi +#1. first filter. no overlap vs. overlap +bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed +bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed + +#2. compute absolut maxscore position +Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed + +#3. subtract overlapping regions for additional motifs +bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed + +#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +Rscript --vanilla merge.R $min $max "$workdir" + +#5. add fasta sequences to bed and create fasta file +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" +bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta + +#6 clean up +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed \ No newline at end of file From a9d0070e3141f273360608fa4d2cdd7264f75684 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 20 Nov 2018 15:30:56 +0100 Subject: [PATCH 02/25] Add files via upload --- maxScore.R | 9 +++++++++ merge.R | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 maxScore.R create mode 100644 merge.R diff --git a/maxScore.R b/maxScore.R new file mode 100644 index 0000000..4ea545b --- /dev/null +++ b/maxScore.R @@ -0,0 +1,9 @@ +#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +args = commandArgs(TRUE) +file = args[1] + +tab = read.table(file, header=FALSE) +colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +tab$maxpos = tab$start + tab$maxpos + +write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/merge.R b/merge.R new file mode 100644 index 0000000..bbe3c9a --- /dev/null +++ b/merge.R @@ -0,0 +1,23 @@ +#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +args=commandArgs(TRUE) +min=as.numeric(args[1]) +max=as.numeric(args[2]) +folder=args[3] +splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +p1$maxpos = p1$start + p1$maxpos + +splitted=rbind(splitted, p1) + +splitted=splitted[which(splitted$stop - splitted$start >= min),] +splitted=splitted[which(splitted$stop - splitted$start <= max),] +splitted$id=make.unique(as.character(splitted$id)) +splitted$length=splitted$stop - splitted$start + +splitted=cbind(splitted, containsMaxpos=0) +splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 +splitted$maxpos = splitted$maxpos - splitted$start +write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') + From 26d276428a2d34c56ec135a7cc226710293081f7 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 20 Nov 2018 15:49:28 +0100 Subject: [PATCH 03/25] Update README.md --- README.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 811fa89..fcb5bb2 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,15 @@ -# masterJLU2018 +# compareBed.sh for filtering of new, yet unannotated motifs -de novo motif discovery and evaluation based on footprints identified by TOBIAS +The script mainly utilizes bedtools for comparison. +The sequences in the bed-file from the experiment are compared to a bed-file of sequences of known motifs. +Only sequences with no overlap with the known-motif sequences are selected in the first step. +Then the overlapping parts of overlapping sequences are also selected as new sequences. +The resulting bed-file contains the sequences selected in the two steps. The file has two extra columns appended: 1st the DNA sequence of the entry and 2nd a flag with possible values '1' or '0' for 'no_overlap' or 'overlap' of the maximum score region of the entity with any known-motif sequence. + +Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required. + +For usage, run ./compareBed.sh + +dependencies: +- R +- From 6fd9948f4e81d7180033ec7ada7630efe6e3d6c4 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Tue, 20 Nov 2018 15:57:22 +0100 Subject: [PATCH 04/25] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fcb5bb2..4dc88e2 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,15 @@ # compareBed.sh for filtering of new, yet unannotated motifs The script mainly utilizes bedtools for comparison. -The sequences in the bed-file from the experiment are compared to a bed-file of sequences of known motifs. +The sequences in the bed-file of the experiment are compared with sequences of known motifs, also in bed-format. Only sequences with no overlap with the known-motif sequences are selected in the first step. Then the overlapping parts of overlapping sequences are also selected as new sequences. -The resulting bed-file contains the sequences selected in the two steps. The file has two extra columns appended: 1st the DNA sequence of the entry and 2nd a flag with possible values '1' or '0' for 'no_overlap' or 'overlap' of the maximum score region of the entity with any known-motif sequence. +The resulting bed-file contains the sequences selected in both steps. The output file has two extra columns appended: 1st, the DNA sequence of the entry, and 2nd, a flag with possible values of '1' or '0' for 'no_overlap' or 'overlap' of the maximum score region of the entity with any known-motif sequence. -Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required. +Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required. The bed-file with known motif sequences can be calculated with the tool motifscan.py. For usage, run ./compareBed.sh dependencies: -- R -- +- R ?version +- bedtools ?version From 2fdbf0da858b2fd20c10511c2de052c9db10bcdb Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 28 Nov 2018 15:34:45 +0100 Subject: [PATCH 05/25] update, working with unmerged motif files now --- compareBed.sh | 84 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/compareBed.sh b/compareBed.sh index 8e81434..92c55d7 100644 --- a/compareBed.sh +++ b/compareBed.sh @@ -104,14 +104,14 @@ then echo "--------------------" echo " required parameter:" echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs the path to the .bed file of the scanned motifs" + echo " -m --motifs the path to the directory where the .bed files of the scanned motifs are stored" echo " -f --fasta the path to the .fasta file of genome" echo " " echo " optional parameter:" echo " -w --workdir the path to directory where temporary files will be created" echo " default is a new directory that is created in the current directory" echo " -min --min minimum size of footprints; default is 10" - echo " -max --max maximum size of footprints; default is 60" + echo " -max --max maximum size of footprints; default is 80" echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" echo " -h --help shows this help message" exit 0 @@ -159,18 +159,65 @@ fi if [ $ma == false ] then - max=60 + max=80 ma=true fi + +if [ ! -d $workdir ] +then + mkdir $workdir +fi + #1. first filter. no overlap vs. overlap -bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed -bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed +echo get sequences with no overlap +cat $data > "$workdir"/pass1Tr.bed +help=true +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +if [ $help == false ] +then + cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed +fi +echo get overlapping sequences +bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed +cat "$workdir"/pass1Fa.bed | wc -l +echo calc maxScore #2. compute absolut maxscore position Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed -#3. subtract overlapping regions for additional motifs -bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed +#3 subtract overlapping regions for additional motifs +echo "also get subsequences with no overlap of overlapping sequences" +help=true +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done + +if [ $help == false ] +then + cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed +else + cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed +fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition Rscript --vanilla merge.R $min $max "$workdir" @@ -180,4 +227,25 @@ bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$o bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed \ No newline at end of file +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed + + +#1. first filter. no overlap vs. overlap +#bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed +#bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed + +#2. compute absolut maxscore position +#Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed + +#3. subtract overlapping regions for additional motifs +#bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed + +#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +#Rscript --vanilla merge.R $min $max "$workdir" + +#5. add fasta sequences to bed and create fasta file +#bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" +#bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta + +#6 clean up +#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed From 902882004278c0029ff9c15b6a0dd1761e2a4256 Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Thu, 29 Nov 2018 13:46:28 +0100 Subject: [PATCH 06/25] update, bugfix output parameter error --- compareBed.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/compareBed.sh b/compareBed.sh index 92c55d7..279c9e3 100644 --- a/compareBed.sh +++ b/compareBed.sh @@ -147,7 +147,7 @@ fi if [ $ou == false ] then - output="newFootprints.bed" + output=${workdir}/"newFootprints.bed" ou=true fi @@ -194,7 +194,7 @@ cat "$workdir"/pass1Fa.bed | wc -l echo calc maxScore #2. compute absolut maxscore position -Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed #3 subtract overlapping regions for additional motifs echo "also get subsequences with no overlap of overlapping sequences" @@ -220,14 +220,14 @@ else fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla merge.R $min $max "$workdir" +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir" #5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" -bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed #1. first filter. no overlap vs. overlap From 3de27010e46c9f704acdc44d7e38a6a7f4a0d3ef Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 5 Dec 2018 12:06:36 +0100 Subject: [PATCH 07/25] Rscript first line changes --- compareBed.sh | 63 ++++++++++++++++++++++++++++++++------------------- maxScore.R | 7 +++--- merge.R | 14 ++++++------ 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/compareBed.sh b/compareBed.sh index 279c9e3..2153b48 100644 --- a/compareBed.sh +++ b/compareBed.sh @@ -104,7 +104,8 @@ then echo "--------------------" echo " required parameter:" echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs the path to the directory where the .bed files of the scanned motifs are stored" + echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" + echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" echo " -f --fasta the path to the .fasta file of genome" echo " " echo " optional parameter:" @@ -172,6 +173,9 @@ fi echo get sequences with no overlap cat $data > "$workdir"/pass1Tr.bed help=true + +if [ -d $"motifs" ] +then for i in "$motifs"/*.bed do if [ $help == true ] @@ -184,6 +188,22 @@ do fi echo $i done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi + if [ $help == false ] then cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed @@ -199,6 +219,9 @@ Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed #3 subtract overlapping regions for additional motifs echo "also get subsequences with no overlap of overlapping sequences" help=true + +if [ -d $"motifs" ] +then for i in "$motifs"/*.bed do if [ $help == true ] @@ -211,6 +234,21 @@ do fi echo $i done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + if [ $help == false ] then @@ -227,25 +265,4 @@ bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed - - -#1. first filter. no overlap vs. overlap -#bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed -#bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed - -#2. compute absolut maxscore position -#Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed - -#3. subtract overlapping regions for additional motifs -#bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed - -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -#Rscript --vanilla merge.R $min $max "$workdir" - -#5. add fasta sequences to bed and create fasta file -#bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" -#bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta - -#6 clean up -#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/maxScore.R b/maxScore.R index 4ea545b..1dfed4b 100644 --- a/maxScore.R +++ b/maxScore.R @@ -1,9 +1,10 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args = commandArgs(TRUE) file = args[1] -tab = read.table(file, header=FALSE) +tab = fread(file, header=FALSE) colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") tab$maxpos = tab$start + tab$maxpos -write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/merge.R b/merge.R index bbe3c9a..f491cb9 100644 --- a/merge.R +++ b/merge.R @@ -1,12 +1,13 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] -splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -19,5 +20,4 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - +fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 527c4fa2e03686d487095f77387baa332255591d Mon Sep 17 00:00:00 2001 From: JannikHamp Date: Wed, 5 Dec 2018 12:17:38 +0100 Subject: [PATCH 08/25] Delete README.md --- README.md | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 4dc88e2..0000000 --- a/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# compareBed.sh for filtering of new, yet unannotated motifs - -The script mainly utilizes bedtools for comparison. -The sequences in the bed-file of the experiment are compared with sequences of known motifs, also in bed-format. -Only sequences with no overlap with the known-motif sequences are selected in the first step. -Then the overlapping parts of overlapping sequences are also selected as new sequences. -The resulting bed-file contains the sequences selected in both steps. The output file has two extra columns appended: 1st, the DNA sequence of the entry, and 2nd, a flag with possible values of '1' or '0' for 'no_overlap' or 'overlap' of the maximum score region of the entity with any known-motif sequence. - -Apart from the experiment data (bed-format) of sequences with possible motifs, another bed-file of the sequences with known motifs and a genome file in fasta format are required. The bed-file with known motif sequences can be calculated with the tool motifscan.py. - -For usage, run ./compareBed.sh - -dependencies: -- R ?version -- bedtools ?version From f742731fe7d00d840c90c83dbeb06a8fee8c9795 Mon Sep 17 00:00:00 2001 From: basti Date: Wed, 5 Dec 2018 13:11:11 +0100 Subject: [PATCH 09/25] Changed ResultPath --- bin/Modules/SaveResults.py | 4 ++-- bin/RegGTFExtractor.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index a15ffc3..60d01e0 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -9,9 +9,9 @@ def __init__(self, results, organism, tissue, wd): print("Save results to File !") self.path = "" if tissue: - self.path = os.path.join(wd+"/"+organism+"_filtered.gtf") + self.path = os.path.join("./"+organism+"_filtered.gtf") else: - self.path = os.path.join(wd+"/"+organism+".gtf") + self.path = os.path.join("./"+organism+".gtf") with open(self.path, "w") as file: write_it = csv.writer(file, delimiter='\t') diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 0b09b51..bf65106 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -34,9 +34,10 @@ def main_script(org, wd, tissuetype=None): parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) parser.add_argument('--wd', help='Working directory. default: "."', action='store', default='.', type=str) args = vars(parser.parse_args()) + print("Working Dir: " + args["wd"]) if args["organism"]: print("Working Dir: " + args["wd"]) main_script(args["organism"], args["wd"], args["tissue"]) else: - print("No Arguments found -> See ./RegGTFExtractor.py -h for help.") + print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.") From f6942a5ac6430fc1620edd9e8246272536285923 Mon Sep 17 00:00:00 2001 From: basti Date: Wed, 5 Dec 2018 13:53:16 +0100 Subject: [PATCH 10/25] Changed ResultPath --- bin/RegGTFExtractor.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index bf65106..49303ee 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -6,6 +6,7 @@ from Modules.Uniquifier import UniqueFilter from Modules.SaveResults import ResultSaver import os +import json def check_for_local_folder(wd): @@ -18,14 +19,38 @@ def check_for_local_folder(wd): os.mkdir(os.path.join(wd+"/UCSCData" )) +def check_filter(tissue_cmd, org, wd): + path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json" ) + tissues_config = [] + if not tissue_cmd: + return False + with open(path_to_config) as input_file: + data = json.loads(input_file.read()) + for x in data: + tissues_config.append(x["type"]) + + if any(tissue in tissues_config for tissue in tissue_cmd): + return True + + else: + return False + + def main_script(org, wd, tissuetype=None): check_for_local_folder(wd) + if check_filter(tissuetype, org, wd): + tissues = tissuetype + print("Filter detected !") + else: + tissues = None + print("Filter not detected !") + ucsc = UcscGtf(org, wd) ense = Ensembl(org, wd) print("Getting Unique Results") - unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissuetype) - ResultSaver(unique_filter.get_results(), org, tissuetype, wd) + unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) + ResultSaver(unique_filter.get_results(), org, tissues, wd) if __name__ == '__main__': From c4b7a0d9d0626a298c6a15d80efdde82934f5bac Mon Sep 17 00:00:00 2001 From: basti Date: Wed, 5 Dec 2018 13:58:30 +0100 Subject: [PATCH 11/25] Changed ResultPath --- bin/Modules/SaveResults.py | 2 +- bin/RegGTFExtractor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index 60d01e0..e13aa6a 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -4,7 +4,7 @@ class ResultSaver: - def __init__(self, results, organism, tissue, wd): + def __init__(self, results, organism, tissue): print("Save results to File !") self.path = "" diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 49303ee..e4657be 100644 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -50,7 +50,7 @@ def main_script(org, wd, tissuetype=None): ense = Ensembl(org, wd) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) - ResultSaver(unique_filter.get_results(), org, tissues, wd) + ResultSaver(unique_filter.get_results(), org, tissues) if __name__ == '__main__': From 014e40da396cfe9d49b5aa9707210f3c77150de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 09:21:50 -0500 Subject: [PATCH 12/25] moved scripts to bin --- bin/compareBed.sh | 117 +++++++++++++++++--- bin/maxScore.R | 7 +- bin/merge.R | 14 +-- compareBed.sh | 268 ---------------------------------------------- maxScore.R | 10 -- merge.R | 23 ---- 6 files changed, 112 insertions(+), 327 deletions(-) delete mode 100644 compareBed.sh delete mode 100644 maxScore.R delete mode 100644 merge.R diff --git a/bin/compareBed.sh b/bin/compareBed.sh index bcf52fb..2153b48 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -100,19 +100,20 @@ then echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." echo "If a parameter is chosen, a value must be provided or an error will occur." echo "--------------------" - echo "usage: compareBed.sh --data \ --motifs \ --fasta \ \[optional_parameter \\] ..." + echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." echo "--------------------" echo " required parameter:" echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs the path to the .bed file of the scanned motifs" + echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" + echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" echo " -f --fasta the path to the .fasta file of genome" echo " " echo " optional parameter:" echo " -w --workdir the path to directory where temporary files will be created" echo " default is a new directory that is created in the current directory" - echo " -min --min minimum size of footprints\; default is 10" - echo " -max --max maximum size of footprints\; default is 60" - echo " -o --output output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" + echo " -min --min minimum size of footprints; default is 10" + echo " -max --max maximum size of footprints; default is 80" + echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" echo " -h --help shows this help message" exit 0 fi @@ -147,7 +148,7 @@ fi if [ $ou == false ] then - output="newFootprints.bed" + output=${workdir}/"newFootprints.bed" ou=true fi @@ -159,25 +160,109 @@ fi if [ $ma == false ] then - max=60 + max=80 ma=true fi + +if [ ! -d $workdir ] +then + mkdir $workdir +fi + #1. first filter. no overlap vs. overlap -bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed -bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed +echo get sequences with no overlap +cat $data > "$workdir"/pass1Tr.bed +help=true +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi + +if [ $help == false ] +then + cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed +fi +echo get overlapping sequences +bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed +cat "$workdir"/pass1Fa.bed | wc -l + +echo calc maxScore #2. compute absolut maxscore position -Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed + +#3 subtract overlapping regions for additional motifs +echo "also get subsequences with no overlap of overlapping sequences" +help=true + +if [ -d $"motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + -#3. subtract overlapping regions for additional motifs -bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed +if [ $help == false ] +then + cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed +else + cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed +fi #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla merge.R $min $max "$workdir" +Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir" #5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" -bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed \ No newline at end of file +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/bin/maxScore.R b/bin/maxScore.R index 4ea545b..1dfed4b 100644 --- a/bin/maxScore.R +++ b/bin/maxScore.R @@ -1,9 +1,10 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args = commandArgs(TRUE) file = args[1] -tab = read.table(file, header=FALSE) +tab = fread(file, header=FALSE) colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") tab$maxpos = tab$start + tab$maxpos -write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge.R b/bin/merge.R index bbe3c9a..f491cb9 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -1,12 +1,13 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] -splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -19,5 +20,4 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - +fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/compareBed.sh b/compareBed.sh deleted file mode 100644 index 2153b48..0000000 --- a/compareBed.sh +++ /dev/null @@ -1,268 +0,0 @@ -#!/bin/bash -#data -#motifs -#workdir -#fasta -#min -#max -#output -wrong=() -da=false -mo=false -wo=false -fa=false -mi=false -ma=false -ou=false -he=false - -if [ $# -eq 0 ] -then - he=true -fi - -while [[ $# -gt 0 ]] -do -key="$1" - -case $key in - -d|--data) - data="$2" - da=true - shift - shift - ;; - -m|--motifs) - motifs="$2" - mo=true - shift - shift - ;; - -w|--workdir) - workdir="$2" - wo=true - shift - shift - ;; - -f|--fasta) - fasta="$2" - fa=true - shift - shift - ;; - -min|--min) - min=$2 - mi=true - shift - shift - ;; - -max|--max) - max=$2 - ma=true - shift - shift - ;; - -o|--output) - output="$2" - ou=true - shift - shift - ;; - -h|--help) - help=true - he=true - shift - ;; - *) - wrong+=("$1") - shift - ;; -esac -done - -count=${#wrong[@]} -if [ $count -gt 0 ] -then - for i in ${wrong[@]} - do - echo wrong parameter $i - echo call script without parameters for help or call --help - echo exit - done -exit 1 -fi - -if [ $he == true ] -then - echo "This script utilies bedtools to select new footprints from data." - echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information." - echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." - echo "If a parameter is chosen, a value must be provided or an error will occur." - echo "--------------------" - echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." - echo "--------------------" - echo " required parameter:" - echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" - echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" - echo " -f --fasta the path to the .fasta file of genome" - echo " " - echo " optional parameter:" - echo " -w --workdir the path to directory where temporary files will be created" - echo " default is a new directory that is created in the current directory" - echo " -min --min minimum size of footprints; default is 10" - echo " -max --max maximum size of footprints; default is 80" - echo " -o --output output path/file ; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" - echo " -h --help shows this help message" -exit 0 -fi - -echo selected parameters -echo ------------------- -echo data: $da \(required\) -echo motifs: $mo \(required\) -echo workdir: $wo -echo fasta: $fa \(required\) -echo min: $mi -echo max: $ma -echo output: $ou -echo help: $he - -if [ $da == false ] || [ $mo == false ] || [ $fa == false ] -then - echo required parameters not given. - echo required are: --data \ --motifs \ --fasta \ - exit 1 -fi - -if [ $wo == false ] -then - if [ ! -d workdir1337 ] - then - mkdir workdir1337 - fi - wo=true - workdir="workdir1337" -fi - -if [ $ou == false ] -then - output=${workdir}/"newFootprints.bed" - ou=true -fi - -if [ $mi == false ] -then - min=10 - mi=true -fi - -if [ $ma == false ] -then - max=80 - ma=true -fi - -if [ ! -d $workdir ] -then - mkdir $workdir -fi - -#1. first filter. no overlap vs. overlap -echo get sequences with no overlap -cat $data > "$workdir"/pass1Tr.bed -help=true - -if [ -d $"motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -else -declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -fi - -if [ $help == false ] -then - cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed -fi -echo get overlapping sequences -bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed -cat "$workdir"/pass1Fa.bed | wc -l - -echo calc maxScore -#2. compute absolut maxscore position -Rscript --vanilla /mnt/agnerds/Jannik.Hamp/maxScore.R "$workdir"/pass1Fa.bed - -#3 subtract overlapping regions for additional motifs -echo "also get subsequences with no overlap of overlapping sequences" -help=true - -if [ -d $"motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -else -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -fi - - -if [ $help == false ] -then - cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed -else - cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed -fi - -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla /mnt/agnerds/Jannik.Hamp/merge.R $min $max "$workdir" - -#5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output" -bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta - -#6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed \ No newline at end of file diff --git a/maxScore.R b/maxScore.R deleted file mode 100644 index 1dfed4b..0000000 --- a/maxScore.R +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/Rscript -library(data.table) -args = commandArgs(TRUE) -file = args[1] - -tab = fread(file, header=FALSE) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -tab$maxpos = tab$start + tab$maxpos - -fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/merge.R b/merge.R deleted file mode 100644 index f491cb9..0000000 --- a/merge.R +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/Rscript -library(data.table) -args=commandArgs(TRUE) -min=as.numeric(args[1]) -max=as.numeric(args[2]) -folder=args[3] -splitted = fread(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -p1 = fread(paste(folder, "/pass1Tr.bed", sep=''), header=TRUE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "bonus_info") -p1$maxpos = p1$start + p1$maxpos - -splitted=rbind(splitted, p1) - -splitted=splitted[which(splitted$stop - splitted$start >= min),] -splitted=splitted[which(splitted$stop - splitted$start <= max),] -splitted$id=make.unique(as.character(splitted$id)) -splitted$length=splitted$stop - splitted$start - -splitted=cbind(splitted, containsMaxpos=0) -splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 -splitted$maxpos = splitted$maxpos - splitted$start -fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') From 6ff051d13f9ffe82759b96786e16c321001206f3 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Wed, 5 Dec 2018 15:22:24 +0100 Subject: [PATCH 13/25] Delete .gitignore --- .gitignore | 168 ----------------------------------------------------- 1 file changed, 168 deletions(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index bac574a..0000000 --- a/.gitignore +++ /dev/null @@ -1,168 +0,0 @@ -# Created by .ignore support plugin (hsz.mobi) -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/ - From 02e946ddf46bd6ed92027115117c079c5ea75b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 09:38:23 -0500 Subject: [PATCH 14/25] added parameter to yaml --- masterenv.yml | 7 +++++++ pipeline.nf | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/masterenv.yml b/masterenv.yml index 846cb71..b25f948 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -21,3 +21,10 @@ dependencies: - r-optparse - bioconductor-iranges # + - snakemake + - meme + - moods + - biopython + - pybedtools + - matplotlib + - seaborn diff --git a/pipeline.nf b/pipeline.nf index 5ed2eac..071771a 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -8,6 +8,13 @@ Channel.fromPath(params.config).set {config} bigwig_input.combine(bed_input).set{footprint_in} +//setting default values +params.input="" +params.bed="" +params.genome_fasta="" +params.jaspar_db="" +params.config="" + process footprint_extraction { conda "${path_env}" @@ -40,6 +47,7 @@ process extract_known_TFBS { script: """ + python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ./ """ } From 1cdd1c25eb763062f62df934ca31cb3e49115295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 09:44:14 -0500 Subject: [PATCH 15/25] added tfbsscan.py to bin --- bin/tfbsscan.py | 755 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 755 insertions(+) create mode 100644 bin/tfbsscan.py diff --git a/bin/tfbsscan.py b/bin/tfbsscan.py new file mode 100644 index 0000000..2bae662 --- /dev/null +++ b/bin/tfbsscan.py @@ -0,0 +1,755 @@ +""" +TFBSscan.py produces the data to be used to join the footprint and motif information across genome + +@author: Anastasiia Petrova +@contact: anastasiia.petrova(at)mpi-bn.mpg.de + +""" + +import argparse +import sys +import os +import re +import time +import multiprocessing +import logging +import subprocess +from Bio import SeqIO +import Bio.SeqIO.FastaIO as bio +import textwrap +import MOODS.scan +import MOODS.tools +import MOODS.parsers + +logger = logging.getLogger('mytool') +logger.setLevel(logging.INFO) + +formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") + +fh = logging.FileHandler('final_log.txt') +fh.setLevel(logging.INFO) +fh.setFormatter(formatter) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(formatter) +logger.addHandler(ch) + +#catch all the information about input and output files as well as information on the used tool (fimo or moods) +def parse_args(): + parser = argparse.ArgumentParser(prog = 'mytool_ver6', description = textwrap.dedent(''' + + This script takes a list of motifs loaded from jaspar.genereg.net as a combined text file in MEME or .PFM format, a genome file in FASTA format and optionaly a .bed file (the one you want to be merged with the whole genome file) as input. If you want to merge a .bed file with the whole genome file, please enter --bed_file or -b bevor your .bed file. The tool will provide a file output_merge.fa, which you can also use for your research later on. If you already have a merged file, please give this one as genome file input. If there are several motifs in the input file, the tool will create a separate output file for each motif. Choose if you want to use fimo or moods with --use, this script uses by default fimo. Please note that the tool can not provide the calculation of q values with fimo due to the text mode that fimo needs to use. The tool sends merged genome file and motifs to fimo or moods, saves the sorted output for each of the given motifs as moods/fimo_output_[alternate name and id of the motif].txt in the output directory, then calculates the start and the end as real positions on the chromosom and writes this information in the ouput files. The columns in the output file are: chromosom, start, end, the name and score of TF. If a .bed file was given as input, the tool will also add the additional columns from it to the output. If the output file is empty, there were no machtes within given genome regions. Please note, if you want to have all intermediate output files, enter --clean nothing + + '''), epilog='That is what you need to make this script work for you. Enjoy it') + + required_arguments = parser.add_argument_group('required arguments') + + required_arguments.add_argument('-m', '--motifs', help='file in MEME format with mofits loaded from jaspar.genereg.net') + required_arguments.add_argument('-g', '--genome', help='a whole genome file or regions of interest in FASTA format to be scanned with motifs') + + #all other arguments are optional + parser.add_argument('-o', '--output_directory', default='output', const='output', nargs='?', help='output directory, default ./output/') + parser.add_argument('-b', '--bed_file', nargs='?', help='a .bed file to be merged with the whole genome file to find regions of interest') + parser.add_argument('--use', '--use_tool', default='fimo', const='fimo', nargs='?', choices=['fimo', 'moods'], help='choose the tool to work with, default tool is fimo') + parser.add_argument('--clean', nargs='*', choices=['nothing', 'all', 'cut_motifs', 'fimo_output', 'merge_output', 'moods_output'], dest='cleans', help='choose the files you want to delete from the output directory, the default is deleting all the temporary files from the directory') + parser.add_argument('--fimo', help='enter additional options for fimo using = inside "", for example fimo="--norc" to not score the reverse complement DNA strand. By default the --text mode is used and the calculation of the q values due to the --text mode is not possible') + parser.add_argument('--cores', type=int, help='number of cores allowed to use by this tool, by default the tool uses 2 cores', default=2) + parser.add_argument('-p', '--p_value', type=float, help='enter the p value, the default p value is 0.0001. Please note that if you enter the p value using --fimo="--thresh ..." as well, the one within --fimo call will be used', default=0.0001) + parser.add_argument('--resolve_overlaps', action='store_true', help='delete overlaps with greater p value, by default no overlaps are deleted') + parser.add_argument('--hide_info', action='store_true', help='while working with data write the information only into ./log.txt') + parser.add_argument('--moods_bg', nargs='+', type=float, help='set the bg for moods, by default moods uses the bg is 0.25 0.25 0.25 0.25') + + args = parser.parse_args() + return args + +def check_directory(directory): + if not os.path.exists(directory): + os.makedirs(directory) + logger.info('a new directory ' + directory + ' was created') + +#merge the whole genome with the regions mentioned in .bed file +def merge(genome, bed_file, output_directory): + logger.info('the merging of files ' + genome + ' and ' + bed_file + ' will end soon, the result file is output_merge.fa') + output_merge = os.path.join(output_directory, "output_merge.fa") + os.system("bedtools getfasta -fi " + genome + " -bed " + bed_file + " -fo " + output_merge) + return output_merge + +#split the motifs each in other file +def split_motifs(motifs, output_directory, usage): + logger.info("the file with motifs " + motifs + " will be checked for motifs and if needed splitted in files each containing only one motif") + + first_line = subprocess.getoutput("head -1 " + motifs) #find the first line of the input file + + if usage == "moods": + if first_line.startswith(">"): + #the motif file probably has the .pfm format, try to read and split it + splitted_motifs = read_pfm(motifs, output_directory) + else: #maybe the file with motifs is in MEME format, so try to convert it + logger.info("the input file has not the expected format, I will try to convert it to .pfm format") + splitted_motifs = convert_meme_to_pfm(motifs, output_directory) + + elif usage == "fimo": + if first_line.startswith("MEME version"): + #the motifs file has probably the MEME format, try to read and split it + splitted_motifs = read_meme(motifs, output_directory) + + #if the there was a convertion before, delete all the .pfm files as we don't need them + for filename in os.listdir(output_directory): + if filename.endswith(".pfm"): + remove_file(os.path.join(output_directory, filename)) + + else: #maybe the file with motifs is in .pfm format, so try to convert is + logger.info("the input file has not the expected format, I will try to convert it to MEME format") + splitted_motifs = convert_pfm_to_meme(motifs, output_directory) + + return splitted_motifs + +def read_pfm(motifs, output_directory): + splitted_motifs = [] #to save the names of files after splitting + motif = [] #to save the motif itself, which will be written to the file + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + #as the motif has first line with the name and 4 lines with information, if the 5th line is something else than the name of the next motif, the exit will be forced + if lines == 5 and not line.startswith(">"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + else: + if line.startswith(">"): + if 'written_file' in locals(): + written_file.write(''.join(motif)) + motif = [] + lines = 0 + written_file.close() + + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_id = re.split(' ', line[1:])[0] #[1:] meands do not use the first character + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') + + splitted_motifs.append(motif_name) + written_file = open(motif_name, 'w') + + if lines >= 1 and lines <= 4: #one motif has 5 lines, the first consists the name, the next 4 - the information we need to proceed the data within moods + motif.append(line) + + lines = lines + 1 + written_file.write(''.join(motif)) + written_file.close() + + return splitted_motifs + +def read_meme(motifs, output_directory): + splitted_motifs = [] #to save the names of files after splitting + motif = [] #to save the motif itself, which will be written to the file + head = [] #define a list for header, as fimo needs a header in each motif file it proceedes + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + #make the head part + if lines <= 8: + if lines == 0 and not line.startswith("MEME version"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + + head.append(line) + else: + #search for motifs and save each to another file + if line.startswith("MOTIF"): + if 'written_file' in locals(): + written_file.write(''.join(motif)) + motif = [] + written_file.close() + + #the alternate name will be checked for validity and the invalid chars will be replaced with '_' + if len(re.split(' ', line.rstrip())) == 3: #in the input motif file the motif id and the alternate name are splitted using the tab, otherwise they are splitted using _, but we do not want to change it if so + motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) + motif_id = re.split(' ', line)[1] + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.meme') + else: + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_name = os.path.join(output_directory, motif_alternate_name + '.meme') + + #make a list with all the motif names to know which files to iterate when fimo is called + splitted_motifs.append(motif_name) + + written_file = open(motif_name, 'w') + written_file.write(''.join(head)) + + motif.append(line) + + lines = lines + 1 + + #write the last motif + written_file.write(''.join(motif)) + written_file.close() + + read_file.close() + + return splitted_motifs + +def convert_meme_to_pfm(motifs, output_directory): + #i can only convert the file to pfm if the motifs file is in MEME format + + splitted_motifs = [] #to save the names of files after splitting + rows = [[] for row in range(4)] + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + if lines == 0 and not line.startswith("MEME version"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + else: + #search for motifs and save each to another file + if line.startswith("MOTIF"): + + if 'written_file' in locals(): + for row in rows: + written_file.write('\t'.join(row) + '\n') + + rows = [[] for row in range(4)] + + written_file.close() + + #the alternate name will be checked for validity and the invalid chars will be replaced with '_' + if len(re.split(' ', line.rstrip())) == 3: #in the input motif file the motif id and the alternate name are splitted using the tab, otherwise they are splitted using _, but we do not want to change it if so + motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) + motif_id = re.split(' ', line)[1] + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') + + else: + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_name = os.path.join(output_directory, motif_alternate_name + '.pfm') + + #make a list with all the motif names to know which files to iterate when fimo is called + splitted_motifs.append(motif_name) + + written_file = open(motif_name, 'w') + + elif line.startswith("letter-probability matrix"): + columns = int(re.split(' ', re.split('w= ', line)[1])[0]) #find the number of columns from the line out of the motifs file + nsites = int(re.split(' ', re.split('nsites= ', line)[1])[0]) #find the nsites to count the frequency count for .pfm file + + elif line.startswith(' '): #each line with information about frequency starts in MEME format with ' ' + for i in range(len(rows)): + rows[i].append(str(round(float(re.findall(r'\S+', line)[i])*nsites))) #split the line, do not mention how much whitespaces are in between, multiply it with nsites and save it to the corresponding row + + lines = lines + 1 + + #write the last motif + for row in rows: + written_file.write('\t'.join(row) + '\n') + + written_file.close() + read_file.close() + + return splitted_motifs + +def convert_pfm_to_meme(motifs, output_directory): + #i can only convert the file to meme, if motifs file is in .pfm format + + #first we need to split the pfm motifs as the jaspar2meme does not work with the files containing several motifs, but with the directory consisting files each with only one motif in pfm format + pfm_motifs = read_pfm(motifs, output_directory) + + converted_name = os.path.join(output_directory, "converted_motifs.meme") + + os.system("jaspar2meme -pfm " + output_directory + " > " + converted_name) + + #need to call split motifs for meme file + splitted_motifs = split_motifs(converted_name, output_directory, "fimo") + + remove_file(converted_name) + return splitted_motifs + +#if there are chars that are not allowed, they will be replaced with '_', to the possibly invalid names there will be added '_' at the beginning of the name +def check_name(name_to_test): + badchars= re.compile(r'[^A-Za-z0-9_. ]+|^\.|\.$|^ | $|^$') + badnames= re.compile(r'(aux|com[1-9]|con|lpt[1-9]|prn)(\.|$)') + + #replace all the chars that are not allowed with '_' + name = badchars.sub('_', name_to_test) + + #check for the reserved by the os names + if badnames.match(name): + name = '_' + name + return name + +#use fimo to make a file +def call_fimo(fimo_data, p_value, one_motif, genome, output_directory): + + #make the filename for the fimo output + fimo_output_file = os.path.join(output_directory, "fimo_output_" + os.path.splitext(os.path.basename(one_motif))[0] + ".txt") + fimo_output_unsorted = os.path.join(output_directory, "fimo_output_unsorted_" + os.path.splitext(os.path.basename(one_motif))[0] + ".txt") + + #check if user needs special options for the fimo + if fimo_data != None: + fimo_data = fimo_data + " --thresh " + str(p_value) + " " + else: + fimo_data = "--thresh " + str(p_value) + " " #add the passed p value to the fimo options + + #call fimo for this motif and save the output to a temporary file + send_to_fimo = "fimo --text --no-qvalue " + fimo_data + one_motif + " " + genome + " > " + fimo_output_unsorted + + logger.info('fimo proceed the data using this call ' + send_to_fimo) + + fimo_stdout = subprocess.getoutput(send_to_fimo) + + #the normal output from fimo starts with Using motif ..., so print it from the logger, otherwise print what else fimo says + if fimo_stdout.startswith("Using") and re.split('\n', fimo_stdout)[1]: + logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[0].rstrip()) + logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[1].rstrip()) + else: #there were some problems with fimo, so we want to see what they were + logger.info('info from fimo: ' + fimo_stdout) + + + if not os.path.isfile(fimo_output_unsorted): + logger.info('the usage of fimo was crashed, there is no required output file, the exit is forced') + sys.exit() + + if os.stat(fimo_output_unsorted).st_size == 0: #if the output of fimo is empty + fimo_output_unsorted = fimo_output_unsorted.replace('unsorted_', '') + return fimo_output_unsorted + else: + #if the file was converted from pfm, the second column contains the positions, so we want to sort using this column, and not the next one + second_line = subprocess.getoutput("sed -n '2{p;q}' " + fimo_output_unsorted) + if re.split('\t', second_line)[2].startswith("chr"): #the re.split[1] is a ' ', so take the [2] + os.system("cat " + fimo_output_unsorted + " | sort -k 2 -V > " + fimo_output_file) + else: + #we are sorting after the third column, which looks like chr1:123-126, -V means it will sort the digitals and not the strings + os.system("cat " + fimo_output_unsorted + " | sort -k 3 -V > " + fimo_output_file) + + #make sure the output of fimo exists + if not os.path.isfile(fimo_output_file): + logger.info('the sorting of the output file from the fimo was crashed, the exit is forced') + sys.exit() + else: + return fimo_output_file + +def call_moods(one_motif, genome, output_directory, p_value, moods_bg): + + # setting standard parameters for moods + pseudocount = 0.0001 + + if moods_bg == None: + bg = MOODS.tools.flat_bg(4) + else: + bg = tuple(moods_bg) + + logger.info("moods will work with the p_value " + str(p_value) + " and the bg " + str(bg)) + + motif_name = os.path.basename(one_motif) + + moods_output_unsorted_name = os.path.join(output_directory, "moods_output_unsorted_" + os.path.splitext(motif_name)[0] + ".txt") + moods_output_file_unsorted = open(moods_output_unsorted_name, 'w') + + moods_output_name = os.path.join(output_directory, "moods_output_" + os.path.splitext(motif_name)[0] + ".txt") + moods_output_file = open(moods_output_name, 'w') + + matrix_names = [os.path.basename(one_motif)] + + matrices = [] + matrices_rc = [] + + valid, matrix = pfm_to_log_odds(one_motif, bg, pseudocount) + + key_for_bed_dict = '' + + if valid: + + logger.info("please be patient, moods is working on the data") + + matrices.append(matrix) + matrices_rc.append(MOODS.tools.reverse_complement(matrix,4)) + matrices_all = matrices + matrices_rc + thresholds = [MOODS.tools.threshold_from_p(m, bg, p_value, 4) for m in matrices_all] + + scanner = MOODS.scan.Scanner(7) + scanner.set_motifs(matrices_all, bg, thresholds) + + with open(genome) as handle: + + seq_iterator = bio.SimpleFastaParser(handle) + + for header, seq in seq_iterator: + + header_splitted = re.split(r':', header) + + if len(header_splitted) == 1: #if there are no positions given + header = header + ":0-" #set the first position as 0 and split it once more + header_splitted = re.split(r':', header) + logger.info("moods works with " + header) + else: #the given genome file is a file with peaks, so use the header of the peak as a key to search in the bed dictionary for additional information later on + key_for_bed_dict = header + + chromosom = header_splitted[0] + positions = re.split(r'-', header_splitted[-1]) + + results = scanner.scan(seq) + + fr = results[:len(matrix_names)] #forward strand + rr = results[len(matrix_names):] #reverse strand + + results = [[(r.pos, r.score, '+', ()) for r in fr[i]] + + [(r.pos, r.score, '-', ()) for r in rr[i]] for i in range(len(matrix_names))] #use + and - to indicate strand + + for (matrix, matrix_name, result) in zip(matrices, matrix_names, results): + + motif_id = re.split(r'_', matrix_name)[-1] #find the id of the given morif + motif_alternate_name = matrix_name.replace(motif_id, '')[:-1] #the alternate name of the motif is the name of the file without id and with cutted last character, that is _ + + if len(matrix) == 4: + l = len(matrix[0]) + if len(matrix) == 16: + l = len(matrix[0] + 1) + for r in sorted(result, key=lambda r: r[0]): + strand = r[2] + pos = r[0] + hitseq = seq[pos:pos+l] #sequence + #score = r[1] + score = format(r[1], '.15f') #round to 15 digits after floating point, already type str + + if key_for_bed_dict != '': + start = pos + 1 + end = pos + len(hitseq) + chromosom = key_for_bed_dict #instead of only the name of chromosom write the key to search in the bed_file + else: + start = int(positions[0]) + pos + 1 + end = start + len(hitseq) - 1 + + #moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, str(score)]) + '\n') + moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, score]) + '\n') + + #now sort the output of moods + os.system("cat " + moods_output_unsorted_name + " | sort -k 1 -V > " + moods_output_name) + + moods_output_file_unsorted.close() + moods_output_file.close() + + return moods_output_name + + else: + logger.info("The input for moods was not validated by the MOODS.parsers.pfm. Please check if it has the right format (note that the MOODS accepts only the old version of .pfm files, that is one without the header containing the name and id of the motif)") + sys.exit() + +#help function for the moods call, convert pfm to log odds +def pfm_to_log_odds(filename, bg, pseudocount): + if pfm(filename): + mat = MOODS.parsers.pfm_to_log_odds(filename, bg, pseudocount) + if len(mat) != 4: #if something went wrong, the empty list will be returned + return False, mat + else: + return True, mat + else: + logger.info('please make sure the motif file has a .pfm format needed for moods') + sys.exit() + +#help function for the moods call, check if the file is in a pfm format using moods +def pfm(filename): + mat = MOODS.parsers.pfm(filename) + if len(mat) != 4: + return False + else: + return True + +# calculate the real positions of TFs, if needed, resolve the overlaps, and write to the output file +def write_output_file(input_file, bed_dictionary, resolve_overlaps): + + if os.path.basename(input_file).startswith("moods"): + name_without_moods_or_fimo = input_file.replace('moods_output_', '') + used_tool = "moods" + else: + name_without_moods_or_fimo = input_file.replace('fimo_output_', '') + used_tool = "fimo" + + output_file_name = os.path.splitext(name_without_moods_or_fimo)[0] + ".bed" + + logger.info('writing the output file ' + output_file_name) + output_file = open(output_file_name, 'w') + + #calculate the real positions of TFs and write the information in the output file + with open(input_file) as read_file: + + overlap = [] + printed_line = [] + last_line = [] + key_for_bed_dict = '' + + for line in read_file: + if not line.startswith('#'): + + line_to_write = [] + + line_array = re.split(r'\t', line.rstrip('\n')) + + chromosom_and_positions = re.split(r':', line_array[2]) + if len(chromosom_and_positions) == 1: #the whole genome was given, there is nothing to split + chromosom = line_array[2] + start = line_array[3] + end = line_array[4] + else: + positions = re.split(r'-', chromosom_and_positions[-1]) + chromosom = chromosom_and_positions[0] + start = str(int(positions[0]) + int(line_array[3])) + end = str(int(positions[0]) + int(line_array[4])) + key_for_bed_dict = line_array[2] #use only if there is a bed_dictionary in input + + #------- these are 5 needed columns to succesfully proceed the data + + name = os.path.splitext(os.path.basename(name_without_moods_or_fimo))[0] + + score = line_array[6] + + line_to_write.extend([chromosom, start, end, name, score]) + #------ here the additional information coule be added to the output file + + strand_inf = line_array[5] + line_to_write.append(strand_inf) + + if used_tool == "fimo": + p_value = line_array[7] + line_to_write.append(p_value) + + #if the dictionary is not empty check for the information corresponding to these positions + if bed_dictionary and key_for_bed_dict in bed_dictionary: + line_to_write.append('\t'.join(bed_dictionary[key_for_bed_dict])) + + line_to_write.insert(0, "write") #insert a flag to know if the line should be written or not + + last_line = line_to_write #save the line in case it is the last line and if due to the check_overlap it could not be printed + + if resolve_overlaps: + overlap, line_to_write, printed_line = check_overlap(line_to_write, overlap, printed_line, output_file) + + write_line_not_overlap(output_file, line_to_write) + + if not last_line[0].startswith('write'): #it could be that the write flag was deleted from the last_line so append it back + overlap.insert(0, "write") + + #if there is already any printed line, check if the saved last line was already printed. Otherwise print it + if resolve_overlaps: + if printed_line: + if last_line[1] != printed_line[1] or last_line[2] != printed_line[2]: + write_line_not_overlap(output_file, last_line) + + output_file.close() + +def write_line_not_overlap(output_file, line_to_write): + if line_to_write: #if line_to_write is not empty + if line_to_write[0].startswith('write'): #if the line does not contain an overlap, it contains the flag "write" at the first position + line_to_write.pop(0) #delete the flag + output_file.write('\t'.join(line_to_write) + '\n') + +def check_overlap(line_to_write, overlap, printed_line, output_file): + + is_overlap = None + + if not overlap: #if the overlap list is empty + is_overlap = False + + else: #if the overlap list is not empty + + if not overlap[0].startswith('write'): #it could be that the write flag was deleted from the overlap so append it back to make sure the next if clauses run right + overlap.insert(0, "write") + + if line_to_write[1] == overlap[1] and float(line_to_write[2]) < float(overlap[3]): #the current line could overlap the previous because the start of the current line is smaller than the end of the previous one and they are both on the one chromosom + #if p value in the current line is smaller than p value in the previous line (or score is bigger), save the current line as possible overlap for future + if float(line_to_write[5]) > float(overlap[5]): + is_overlap = False + else: + #if the p value in the current line is greater than p value in the previous line or are these both p values the same, the current line will not be printed, but also it will not be saved + line_to_write.pop(0) + is_overlap = None + else: #it is an other chromosom or the start at the current line is greater or the same as the end of the previous one + if printed_line != overlap: + is_overlap = True + else: + is_overlap = False + + if is_overlap == False: + overlap = line_to_write #save the current line + line_to_write.pop(0) #do not print anything due to deleting the flag ("write") + elif is_overlap == True: + if not overlap[0].startswith('write'): + overlap.insert(0, "write") + printed_line = overlap #the previous line is saved as temporary line to print it later on + overlap = line_to_write #save the current line + line_to_write = printed_line #print the temporary saved line + + return overlap, line_to_write, printed_line + +def remove_file(file): + if os.path.isfile(file): + os.remove(file) + +def clean_directory(cleans, output_directory, motif, tool_output_file): + + fimo_output_unsorted = os.path.join(tool_output_file.replace("fimo_output", "fimo_output_unsorted")) + moods_output_unsorted = os.path.join(tool_output_file.replace("moods_output", "moods_output_unsorted")) + + for clean in cleans: + if clean == 'all': + remove_file(motif) + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + remove_file(moods_output_unsorted) + elif clean == 'fimo_output': + if os.path.basename(tool_output_file).startswith("fimo"): + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + elif clean == 'cut_motifs': + remove_file(motif) + elif clean == 'moods_output': + if os.path.basename(tool_output_file).startswith("moods"): + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + +def tool_make_output(usage, motif, genome, output_directory, cleans, p_value, bed_dictionary, fimo_data, resolve_overlaps, moods_bg): + try: + if usage == "fimo": + tool_output_file = call_fimo(fimo_data, p_value, motif, genome, output_directory) + elif usage == "moods": + tool_output_file = call_moods(motif, genome, output_directory, p_value, moods_bg) + + output = write_output_file(tool_output_file, bed_dictionary, resolve_overlaps) + finally: + clean_directory(cleans, output_directory, motif, tool_output_file) + +def multiprocess(motifs, genome, output_directory, cleans, fimo_data, p_value, bed_dictionary, cpu_count, resolve_overlaps, usage, moods_bg): + + if cleans == None: + cleans = ['all'] + + pool = multiprocessing.Pool(cpu_count) #by default is cpu_count 2 + + length = len(motifs) #the number of the motifs to find the percentage of the job that was done + step = 100/length #the percentage that should be added after the job with each single motif is done + + tasks = [] #here the jobs done by processes are saved + + for motif in motifs: + tasks.append(pool.apply_async(tool_make_output, args = (usage, motif, genome, output_directory, cleans, p_value, bed_dictionary, fimo_data, resolve_overlaps, moods_bg, ))) + + tasks_done = sum([task.ready() for task in tasks]) #the number of the processes that ended their job + + #check the number of the processes that are ready till the number of them reaches the number of processes started in the pool + while tasks_done < len(tasks): + #if the number of ready processes has changed, save the new number and print the percentage of the job done + if sum([task.ready() for task in tasks]) != tasks_done: + tasks_done = sum([task.ready() for task in tasks]) + sys.stdout.write("%-100s| %d%% \r" % (''*tasks_done, step*tasks_done)) + sys.stdout.flush() + sys.stdout.write("\n") + #update the number of ready processes each 0.05 seconds + time.sleep(0.05) + + pool.close() + pool.join() #make sure all the processes are done and exit + + #the processes should not delete the merged genome file + #so make sure if this file is needed, otherwise delete it + for clean in cleans: + if clean == 'all' or clean == 'merge_output': + for filename in os.listdir(output_directory): + if filename == "output_merge.fa": + remove_file(genome) + + if clean != 'nothing': + logger.info('the directory ' + output_directory + ' was cleaned, only the required files were left') + +def make_bed_dictionary(bed_file): + + bed_dictionary = {} + + with open(bed_file) as read_bed_file: + for bed_line in read_bed_file: + bed_line_array = re.split(r'\t', bed_line.rstrip('\n')) + if bed_line_array[1].isdigit() and bed_line_array[2].isdigit() and int(bed_line_array[1]) <= int(bed_line_array[2]): #in the real bedfile the second column is a start position, and the third column is an end position, so we are checking if these are integers and if the start position is smaller than the end one + key = bed_line_array[0] + ":" + bed_line_array[1] + "-" + bed_line_array[2] + value = [] + for i in range(3, len(bed_line_array)): + value.append(bed_line_array[i]) + + bed_dictionary[key] = value + else: #this is not a bed file, force exit + logger.info('please make sure the input bed file has a right format, the problem occured on the line ' + bed_line) + sys.exit() + + read_bed_file.close() + + return bed_dictionary + +def is_fasta(check_fasta): + if not os.path.isfile(check_fasta): + logger.info('there is no file with genome, the exit is forced') + sys.exit() + else: + # modified code from https://stackoverflow.com/questions/44293407/how-can-i-check-whether-a-given-file-is-fasta + + with open(check_fasta, "r") as handle: + fasta = SeqIO.parse(handle, "fasta") + return any(fasta) # False when `fasta` is empty, i.e. wasn't a FASTA file + +def check_existing_input_files(args): + if not args.motifs or not args.genome: + logger.info('there is no satisfied input, please enter --help or -h to learn how to use this tool') + sys.exit() + elif not is_fasta(args.genome): + logger.info('please make sure the input genome file has a fasta format') + sys.exit() + #check if the file with motifs exists + elif not os.path.isfile(args.motifs): + logger.info('there is no file with motifs, the exit is forced') + sys.exit() + #if the bedfile was given as input, check if this file exists + elif args.bed_file: + if not os.path.isfile(args.bed_file): + logger.info('there is no such bed file ' + args.bed_file + ', the exit is forced') + sys.exit() + +def check_fimo_version(): + fimo_version = subprocess.getoutput("fimo --version") #works only on python 3.4 + #fimo_version = int(fimo_version.replace('.', '')) #replace the . in the version to be able to compare it as int + if fimo_version < "4.12.0": + logger.info('please make sure you are using fimo version 4.12.0 or the newer one') + sys.exit() + +def main(): + + + args = parse_args() + + if args.use == "fimo": + check_fimo_version() + + #if user do not want to see the information about the status of jobs, remove the handler, that writes to the terminal + if args.hide_info: + #logger.disabled = True + logger.removeHandler(ch) + + check_existing_input_files(args) + + #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user + check_directory(args.output_directory) + + splitted_motifs = split_motifs(args.motifs, args.output_directory, args.use) + + #check if there is a .bed file to merge the genome file with. If so, merge them + if args.bed_file: + bed_dictionary = make_bed_dictionary(args.bed_file) + args.genome = merge(args.genome, args.bed_file, args.output_directory) + else: + bed_dictionary = {} + + #if the usage is moods, call moods, otherwise call fimo + multiprocess(splitted_motifs, args.genome, args.output_directory, args.cleans, args.fimo, args.p_value, bed_dictionary, args.cores, args.resolve_overlaps, args.use, args.moods_bg) + + for handler in logger.handlers: + handler.close() + logger.removeFilter(handler) + +if __name__ == "__main__": + main() \ No newline at end of file From b3fabce937f2206dbebbdd12f2a72226bac31e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:25:49 -0500 Subject: [PATCH 16/25] added and moved parameter in config --- config/cluster.config | 1 - config/motif_estimation.config | 7 ++++--- nextflow.config | 13 +++++++------ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/config/cluster.config b/config/cluster.config index 3dce063..0a21a05 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -3,5 +3,4 @@ params{ kmer=10 aprox_motif_len=10 - threads=1 } diff --git a/config/motif_estimation.config b/config/motif_estimation.config index a00fc86..d280b3e 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,11 +1,12 @@ params { //bed_to_clustered_fasta min_seq = 10 - + //glam2 - motif_min_len = 10 + motif_min_len = 8 motif_max_len = 20 + interation = 10000 //tomtom tomtom_treshold = 0.01 -} \ No newline at end of file +} diff --git a/nextflow.config b/nextflow.config index 2e84338..c0a683b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,12 +1,13 @@ createTimeout = 60 - -env { - path_bin = $workflow.projectDir/bin - path_env = $workflow.projectDir/masterenv.yml -} +params.threads=1 //Parameter for for scripts! Not for nextflow processes. includeConfig '$workflow.projectDir/config/peak_calling.config' -includeConfig '$workflow.projectDir/config/1.2.config' +includeConfig '$workflow.projectDir/config/filter_unknown_motifs.config' includeConfig '$workflow.projectDir/config/cluster.config' includeConfig '$workflow.projectDir/config/motif_estimation.config' includeConfig '$workflow.projectDir/config/create_gtf.config' + +env { + path_bin = $workflow.projectDir/bin + path_env = $workflow.projectDir/masterenv.yml +} From 760d91d2126ca2127f2e1a7a9c077d905d37cf48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:26:29 -0500 Subject: [PATCH 17/25] added default parameter in script --- pipeline.nf | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index 071771a..194bf38 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -15,6 +15,38 @@ params.genome_fasta="" params.jaspar_db="" params.config="" +//peak_calling + params.window_length = 200 + params.step = 100 + params.percentage = 0 + +//filter_unknown_motifs + params.min_size_fp=10 + params.max_size_fp=100 + +//clustering + params.sequence_coverage=8 + + params.kmer=10 + params.aprox_motif_len=10 + +//motif_estimation + //bed_to_clustered_fasta + params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2 + + //glam2 + params.motif_min_len = 8 // Minimum length of Motifs + params.motif_max_len = 20 // Maximum length of Motifs + params.interation = 10000 // Number of Iterations done by glam2. A high iteration number equals higher more accurate but higher runtime. + + //tomtom + params.tomtom_treshold = 0.01 // threshold for similarity score. + +//creating_gtf + params.organism="homo_sapiens" + params.tissue="" + + process footprint_extraction { conda "${path_env}" @@ -143,7 +175,7 @@ process glam2 { script: """ - glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 + glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 -n ${params.interation} """ } From ef3343846e651c7e69b4c094ca131a1661a056e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:29:45 -0500 Subject: [PATCH 18/25] moved uropa.config to config dir --- uropa.config => config/uropa.config | 0 nextflow.config | 1 + 2 files changed, 1 insertion(+) rename uropa.config => config/uropa.config (100%) diff --git a/uropa.config b/config/uropa.config similarity index 100% rename from uropa.config rename to config/uropa.config diff --git a/nextflow.config b/nextflow.config index c0a683b..a3c06eb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,5 +1,6 @@ createTimeout = 60 params.threads=1 //Parameter for for scripts! Not for nextflow processes. +params.config="${workflow.projectDir}/config/uropa.config" includeConfig '$workflow.projectDir/config/peak_calling.config' includeConfig '$workflow.projectDir/config/filter_unknown_motifs.config' From 4ff3088b0c5e39b8523cde20e92d2ef5e2fc1e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Wed, 5 Dec 2018 10:53:43 -0500 Subject: [PATCH 19/25] added help message --- pipeline.nf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pipeline.nf b/pipeline.nf index 194bf38..3610955 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -47,6 +47,14 @@ params.config="" params.tissue="" +if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == ""){ +log.info """ +NAME +Usage: nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] +""" +} + + process footprint_extraction { conda "${path_env}" From 01f34921627f921002604c4c89e30079c75180ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 6 Dec 2018 03:47:49 -0500 Subject: [PATCH 20/25] added new config file 'filter_unknown_motifs' --- config/filter_unknown_motifs.config | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 config/filter_unknown_motifs.config diff --git a/config/filter_unknown_motifs.config b/config/filter_unknown_motifs.config new file mode 100644 index 0000000..79033c4 --- /dev/null +++ b/config/filter_unknown_motifs.config @@ -0,0 +1,4 @@ +params{ + min_size_fp=10 + max_size_fp=100 +} From 1d466c5417030b9a438ff91685c2482f69447c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 6 Dec 2018 03:58:32 -0500 Subject: [PATCH 21/25] help message --- pipeline.nf | 56 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 3610955..e694ba0 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -6,8 +6,6 @@ Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} -bigwig_input.combine(bed_input).set{footprint_in} - //setting default values params.input="" params.bed="" @@ -37,7 +35,7 @@ params.config="" //glam2 params.motif_min_len = 8 // Minimum length of Motifs params.motif_max_len = 20 // Maximum length of Motifs - params.interation = 10000 // Number of Iterations done by glam2. A high iteration number equals higher more accurate but higher runtime. + params.interation = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. //tomtom params.tomtom_treshold = 0.01 // threshold for similarity score. @@ -49,18 +47,54 @@ params.config="" if (params.input == "" || params.bed == "" || params.genome_fasta == "" || params.jaspar_db == "" || params.config == ""){ log.info """ -NAME -Usage: nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] +Usage: nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] + +Required arguments: + --input Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --jaspar_db Path to motif-database in MEME-format + + +Optional arguments: + Footprint extraction: + --window_length INT (Default: 200) + --step INT (Default: 100) + --percentage INT(Default: 0) + + Filter unknown motifs: + --min_size_fp INT (Default: 10) + --max_size_fp INT (Default: 100) + + Clustering: + --sequence_coverage INT (Default: 8) + --kmer INT (Default: 10) + --aprox_motif_len INT (Default: 10) + + Motif estimation: + --motif_min_len INT Minimum length of Motif (Default: 8) + --motif_max_len INT Maximum length of Motif (Default: 20) + --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) + --tomtom_treshold float Threshold for similarity score. (Default: 0.01) + + Creating GTF: + --organism [homo_sapiens | mus_musculus] + --tissues + +All arguments can be set in the configuration files. """ } +bigwig_input.combine(bed_input).set{footprint_in} + + process footprint_extraction { conda "${path_env}" tag{name} - publishDir '${out}', mode: 'copy', pattern: '*.bed' - publishDir '/mnt/agnerds/Rene.Wiegandt/log', mode: 'copy', pattern: '*.log' + publishDir "${out}", mode: 'copy', pattern: '*.bed' + publishDir "${out}/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in @@ -257,7 +291,7 @@ process get_best_motif { best_motif.combine(fa_scan).set {files_for_genome_scan} - +/* process genome_scan { conda "${path_env}" @@ -284,7 +318,7 @@ process cluster_quality { script: """ """ -} +} */ process create_GTF { @@ -301,7 +335,7 @@ process create_GTF { """ } - +/* bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} @@ -348,4 +382,4 @@ process filter { script: """ """ -} +} */ From 688cb007bdb097a8a5c00c18a5986c8c4d05397c Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 10:02:21 +0100 Subject: [PATCH 22/25] Update README.md --- README.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/README.md b/README.md index 4cf5660..d746944 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,43 @@ source activate masterenv nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config ``` ## Parameter +``` +Required arguments: + --input Path to BigWig-file + --bed Path to BED-file + --genome_fasta Path to genome in FASTA-format + --jaspar_db Path to motif-database in MEME-format + + +Optional arguments: + Footprint extraction: + --window_length INT (Default: 200) + --step INT (Default: 100) + --percentage INT(Default: 0) + + Filter unknown motifs: + --min_size_fp INT (Default: 10) + --max_size_fp INT (Default: 100) + + Clustering: + --sequence_coverage INT (Default: 8) + --kmer INT (Default: 10) + --aprox_motif_len INT (Default: 10) + + Motif estimation: + --motif_min_len INT Minimum length of Motif (Default: 8) + --motif_max_len INT Maximum length of Motif (Default: 20) + --interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000) + --tomtom_treshold float Threshold for similarity score. (Default: 0.01) + + Creating GTF: + --organism [homo_sapiens | mus_musculus] + --tissues + + All arguments can be set in the configuration files. + ``` + + For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) From 65e17ce0d85d483e878696b7d014bb6638aa031d Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 10:02:54 +0100 Subject: [PATCH 23/25] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d746944..f457df8 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ source activate masterenv ```console nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config ``` -## Parameter +## Parameters ``` Required arguments: --input Path to BigWig-file From 0bdc1016eba520a8a371600db9cf57c82c9348b2 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 10:03:35 +0100 Subject: [PATCH 24/25] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f457df8..6431022 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ source activate masterenv ## Quick Start ```console -nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config +nextflow run pipeline.nf --input [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --jaspar_db [MEME-file] ``` ## Parameters ``` From a62154ca9c57826cf163599ec5b27b8ac40cbd25 Mon Sep 17 00:00:00 2001 From: renewiegandt Date: Thu, 6 Dec 2018 10:34:44 +0100 Subject: [PATCH 25/25] Update README.md --- README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6431022..f36beca 100644 --- a/README.md +++ b/README.md @@ -10,13 +10,23 @@ De novo motif discovery and evaluation based on footprints identified by TOBIAS ## Installation Start with installing all dependencies listed above. Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). -The Nextflow-script needs a conda enviroment to run. To create this enviroment you need the yml-file from the repository. +The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file. +On some systems Nrxtflow exits the run with following error: +``` +Caused by: + Failed to create Conda environment + command: conda env create --prefix --file env.yml + status : 143 + message: +``` +If this error occurs you have to create the enviroment before starting the pipeline. +To create this enviroment you need the yml-file from the repository. Run the following commands to create the enviroment: ```console path=[Path to given masterenv.yml file] conda env create --name masterenv -f=$path -source activate masterenv ``` +When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. ## Quick Start ```console