From 603c814f4120e2580b760c698e71558d43107c8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 8 Nov 2018 04:50:05 -0500 Subject: [PATCH 1/8] added uropa config --- uropa.config | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 uropa.config diff --git a/uropa.config b/uropa.config new file mode 100644 index 0000000..3c4a189 --- /dev/null +++ b/uropa.config @@ -0,0 +1,16 @@ +{ +"queries":[ + {"feature":"", + "feature.anchor": "", + "distance": [ , ], + "strand":"", + "direction":"", + "internals":"", + "filter.attribute":"", + "attribute.value":"", + "show.attributes":"" } + ], +"priority": "", +"gtf": "placeholder_gtf", +"bed": "placeholder_bed" +} \ No newline at end of file From 9f4e4809cdb59e43f02e0d1b7caf4f9ab2c8bed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 8 Nov 2018 04:51:21 -0500 Subject: [PATCH 2/8] added process create_uropa_config --- pipeline.nf | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pipeline.nf b/pipeline.nf index 49adb09..0609bb1 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -3,6 +3,7 @@ Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input} Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan} Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} +Channel.fromPath(params.config).set {config} process footprint_extraction { @@ -212,15 +213,23 @@ process create_GTF { } -process create_UROPA_config { +bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} + + +process create_uropa_config { + + publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' input: + set val(bed), val(gtf) from uropa_in.toList() + file (conf) from config output: file ('uropa.config') into uropa_config script: """ + sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final """ } From 87bbda0c4cd6ccd1d5507f54e480aa975f13813d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 8 Nov 2018 07:26:59 -0500 Subject: [PATCH 3/8] added scripts for part 1.2 --- bin/compareBed.sh | 183 ++++++++++++++++++++++++++++++++++++++++++++++ bin/maxScore.R | 9 +++ bin/merge.R | 23 ++++++ 3 files changed, 215 insertions(+) create mode 100644 bin/compareBed.sh create mode 100644 bin/maxScore.R create mode 100644 bin/merge.R diff --git a/bin/compareBed.sh b/bin/compareBed.sh new file mode 100644 index 0000000..bcf52fb --- /dev/null +++ b/bin/compareBed.sh @@ -0,0 +1,183 @@ +#!/bin/bash +#data +#motifs +#workdir +#fasta +#min +#max +#output +wrong=() +da=false +mo=false +wo=false +fa=false +mi=false +ma=false +ou=false +he=false + +if [ $# -eq 0 ] +then + he=true +fi + +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + -d|--data) + data="$2" + da=true + shift + shift + ;; + -m|--motifs) + motifs="$2" + mo=true + shift + shift + ;; + -w|--workdir) + workdir="$2" + wo=true + shift + shift + ;; + -f|--fasta) + fasta="$2" + fa=true + shift + shift + ;; + -min|--min) + min=$2 + mi=true + shift + shift + ;; + -max|--max) + max=$2 + ma=true + shift + shift + ;; + -o|--output) + output="$2" + ou=true + shift + shift + ;; + -h|--help) + help=true + he=true + shift + ;; + *) + wrong+=("$1") + shift + ;; +esac +done + +count=${#wrong[@]} +if [ $count -gt 0 ] +then + for i in ${wrong[@]} + do + echo wrong parameter $i + echo call script without parameters for help or call --help + echo exit + done +exit 1 +fi + +if [ $he == true ] +then + echo "This script utilies bedtools to select new footprints from data." + echo "Therefore the data is compared with known footprint motifs." + echo "The output is a new .bed file with added sequence information." + echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." + echo "If a parameter is chosen, a value must be provided or an error will occur." + echo "--------------------" + echo "usage: compareBed.sh --data \ --motifs \ --fasta \ \[optional_parameter \\] ..." + echo "--------------------" + echo " required parameter:" + echo " -d --data the path to the .bed file of the footprints" + echo " -m --motifs the path to the .bed file of the scanned motifs" + echo " -f --fasta the path to the .fasta file of genome" + echo " " + echo " optional parameter:" + echo " -w --workdir the path to directory where temporary files will be created" + echo " default is a new directory that is created in the current directory" + echo " -min --min minimum size of footprints\; default is 10" + echo " -max --max maximum size of footprints\; default is 60" + echo " -o --output output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" + echo " -h --help shows this help message" +exit 0 +fi + +echo selected parameters +echo ------------------- +echo data: $da \(required\) +echo motifs: $mo \(required\) +echo workdir: $wo +echo fasta: $fa \(required\) +echo min: $mi +echo max: $ma +echo output: $ou +echo help: $he + +if [ $da == false ] || [ $mo == false ] || [ $fa == false ] +then + echo required parameters not given. + echo required are: --data \ --motifs \ --fasta \ + exit 1 +fi + +if [ $wo == false ] +then + if [ ! -d workdir1337 ] + then + mkdir workdir1337 + fi + wo=true + workdir="workdir1337" +fi + +if [ $ou == false ] +then + output="newFootprints.bed" + ou=true +fi + +if [ $mi == false ] +then + min=10 + mi=true +fi + +if [ $ma == false ] +then + max=60 + ma=true +fi +#1. first filter. no overlap vs. overlap +bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed +bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed + +#2. compute absolut maxscore position +Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed + +#3. subtract overlapping regions for additional motifs +bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed + +#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +Rscript --vanilla merge.R $min $max "$workdir" + +#5. add fasta sequences to bed and create fasta file +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" +bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta + +#6 clean up +rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed \ No newline at end of file diff --git a/bin/maxScore.R b/bin/maxScore.R new file mode 100644 index 0000000..4ea545b --- /dev/null +++ b/bin/maxScore.R @@ -0,0 +1,9 @@ +#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +args = commandArgs(TRUE) +file = args[1] + +tab = read.table(file, header=FALSE) +colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +tab$maxpos = tab$start + tab$maxpos + +write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge.R b/bin/merge.R new file mode 100644 index 0000000..bbe3c9a --- /dev/null +++ b/bin/merge.R @@ -0,0 +1,23 @@ +#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +args=commandArgs(TRUE) +min=as.numeric(args[1]) +max=as.numeric(args[2]) +folder=args[3] +splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +p1$maxpos = p1$start + p1$maxpos + +splitted=rbind(splitted, p1) + +splitted=splitted[which(splitted$stop - splitted$start >= min),] +splitted=splitted[which(splitted$stop - splitted$start <= max),] +splitted$id=make.unique(as.character(splitted$id)) +splitted$length=splitted$stop - splitted$start + +splitted=cbind(splitted, containsMaxpos=0) +splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 +splitted$maxpos = splitted$maxpos - splitted$start +write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') + From da7736423dc4cf5effc7797ebcd742ebc5f7b32d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 8 Nov 2018 07:28:05 -0500 Subject: [PATCH 4/8] Update: process overlap_with_known_TFBS --- pipeline.nf | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 0609bb1..83e4736 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -1,7 +1,7 @@ //!/usr/bin/env nextflow Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input} -Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan} +Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} @@ -13,14 +13,14 @@ process footprint_extraction { set name, file (bigWig) from bigwig_input output: - set name, file ('*.bed') into bed_for_overlapp_with_TFBS + set name, file ('*.bed') into bed_for_overlap_with_TFBS script: """ """ } - +//Abfrage ob ausgeführt werden muss. process extract_known_TFBS { input: @@ -36,16 +36,20 @@ process extract_known_TFBS { } +bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap} + + process overlap_with_known_TFBS { input: - file (TFBS) from known_TFBS_for_overlap + set file (bed_footprints), file (bed_motifs), file (fasta) from for_overlap output: file ('*.FASTA') into FASTA_for_clustering script: """ + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${bed_motifs} --fasta ${fasta} -o ${name_placeholder} -min ${params.min_size_fp} -max ${params.max_size_fp} """ } From 1a6ce69573c213fad5655810a12eed2b6d43940a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Mon, 19 Nov 2018 04:43:48 -0500 Subject: [PATCH 5/8] added script for process 'footprint_extraction' --- pipeline.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 83e4736..dd67c85 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -58,7 +58,7 @@ process overlap_with_known_TFBS { process clustering { input: - file (faste) from FASTA_for_clustering + file (fasta) from FASTA_for_clustering output: set name, file ('*.bed') into bed_for_motif_esitmation @@ -132,7 +132,7 @@ process tomtom { } -//Joining channels with meme and tsv files. Filter joined channel on linecount. +//Joining channels with meme and tsv files. Filter joined channel on line count. //Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. for_filter = meme_for_filter.join( tsv_for_filter ) for_filter @@ -143,7 +143,7 @@ for_filter .into { meme_for_scan; check } -//If unknown channel is empty print errormessage +//If channel 'check' is empty print errormessage process check_for_unknown_motifs { echo true @@ -160,6 +160,7 @@ process check_for_unknown_motifs { } +//Get the best(first) Motif from each MEME-file process get_best_motif { input: @@ -220,6 +221,8 @@ process create_GTF { bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} +// Create configuration file for UROPA. +// Takes template and replaces bed- and gtf-placeholders with actual paths. process create_uropa_config { publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' From 80ffd2a7ae43dce8877863fb0decbdd128d26ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Mon, 19 Nov 2018 06:35:08 -0500 Subject: [PATCH 6/8] added script for process 'footprint_extraction' --- pipeline.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/pipeline.nf b/pipeline.nf index dd67c85..45db184 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -17,6 +17,7 @@ process footprint_extraction { script: """ + python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --threshold ${params.threshold_footprint_extraction} """ } From a5e902fa670b55a3ed08075adc827e085308b80b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 29 Nov 2018 07:53:04 -0500 Subject: [PATCH 7/8] added filter for small clusters --- bin/bed_to_fasta.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 6767ab5..09e3f4f 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -4,18 +4,25 @@ # The Sequences of each cluster are writen as an FASTA-file. # @parameter bedInput BED-file with sequences and cluster-id as column"TEs # @parameter prefix prefix for filenames +# @parameter min_seq min. number of sequences per cluster args = commandArgs(trailingOnly = TRUE) bedInput <- args[1] -prefix <- args[2] # "Fasta" +prefix <- args[2] +min_seq <- args[3] bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") -clusters <- split(bed, bed$V3, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster +clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster discard <- lapply(1:length(clusters), function(i){ - sequences <- as.list(as.data.frame(clusters[i])[[2]]) # <---- Splate mit Sequenz - outfile <- paste0(prefix,"_cluster_",i,".FASTA") - seqinr::write.fasta(sequences = sequences, names = as.data.frame(clusters[i])[[1]] - , file.out = outfile, as.string = TRUE) # <---- Spalte mit Name + clust <- as.data.frame(clusters[i]) + print(nrow(clust)) + if (nrow(clust) >= as.numeric(min_seq) ) { + sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz + outfile <- paste0(prefix,"_cluster_",i,".FASTA") + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name + } else { + print(paste0("Cluster: ",i," is to small")) + } }) From 420cb1f07ece20199ccaaa6bd157074394b7831e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wiegandt?= Date: Thu, 29 Nov 2018 07:53:52 -0500 Subject: [PATCH 8/8] changed tomtom/glam2 parameters; added min_seq parameter --- pipeline.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pipeline.nf b/pipeline.nf index 45db184..e4d502f 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -5,6 +5,8 @@ Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} Channel.fromPath(params.config).set {config} +params.min_seq = 10 + process footprint_extraction { tag{name} @@ -84,7 +86,7 @@ process bed_to_clustered_fasta { script: """ - Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} + Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} """ } @@ -107,7 +109,7 @@ process glam2 { script: """ - glam2 n ${fasta} -O . + glam2 n ${fasta} -O . -a 10 -b 20 -z 5 """ } @@ -128,7 +130,7 @@ process tomtom { script: """ - tomtom ${meme} ${jaspar_db} -thresh 0.01 -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + tomtom ${meme} ${jaspar_db} -thresh 0.01 -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv """ }