Skip to content

Motif estiamtion #1

Merged
merged 8 commits into from
Nov 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 13 additions & 6 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,25 @@
# The Sequences of each cluster are writen as an FASTA-file.
# @parameter bedInput <string> BED-file with sequences and cluster-id as column"TEs
# @parameter prefix <string> prefix for filenames
# @parameter min_seq <INT> min. number of sequences per cluster

args = commandArgs(trailingOnly = TRUE)

bedInput <- args[1]
prefix <- args[2] # "Fasta"
prefix <- args[2]
min_seq <- args[3]

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

clusters <- split(bed, bed$V3, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
discard <- lapply(1:length(clusters), function(i){
sequences <- as.list(as.data.frame(clusters[i])[[2]]) # <---- Splate mit Sequenz
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = as.data.frame(clusters[i])[[1]]
, file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
clust <- as.data.frame(clusters[i])
print(nrow(clust))
if (nrow(clust) >= as.numeric(min_seq) ) {
sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
183 changes: 183 additions & 0 deletions bin/compareBed.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#!/bin/bash
#data
#motifs
#workdir
#fasta
#min
#max
#output
wrong=()
da=false
mo=false
wo=false
fa=false
mi=false
ma=false
ou=false
he=false

if [ $# -eq 0 ]
then
he=true
fi

while [[ $# -gt 0 ]]
do
key="$1"

case $key in
-d|--data)
data="$2"
da=true
shift
shift
;;
-m|--motifs)
motifs="$2"
mo=true
shift
shift
;;
-w|--workdir)
workdir="$2"
wo=true
shift
shift
;;
-f|--fasta)
fasta="$2"
fa=true
shift
shift
;;
-min|--min)
min=$2
mi=true
shift
shift
;;
-max|--max)
max=$2
ma=true
shift
shift
;;
-o|--output)
output="$2"
ou=true
shift
shift
;;
-h|--help)
help=true
he=true
shift
;;
*)
wrong+=("$1")
shift
;;
esac
done

count=${#wrong[@]}
if [ $count -gt 0 ]
then
for i in ${wrong[@]}
do
echo wrong parameter $i
echo call script without parameters for help or call --help
echo exit
done
exit 1
fi

if [ $he == true ]
then
echo "This script utilies bedtools to select new footprints from data."
echo "Therefore the data is compared with known footprint motifs."
echo "The output is a new .bed file with added sequence information."
echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
echo "If a parameter is chosen, a value must be provided or an error will occur."
echo "--------------------"
echo "usage: compareBed.sh --data \<path/to/file.bed\> --motifs \<path/to/known/motifs.bed\> --fasta \<path/to/genome.fasta\> \[optional_parameter \<value\>\] ..."
echo "--------------------"
echo " required parameter:"
echo " -d --data the path to the .bed file of the footprints"
echo " -m --motifs the path to the .bed file of the scanned motifs"
echo " -f --fasta the path to the .fasta file of genome"
echo " "
echo " optional parameter:"
echo " -w --workdir the path to directory where temporary files will be created"
echo " default is a new directory that is created in the current directory"
echo " -min --min minimum size of footprints\; default is 10"
echo " -max --max maximum size of footprints\; default is 60"
echo " -o --output output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta"
echo " -h --help shows this help message"
exit 0
fi

echo selected parameters
echo -------------------
echo data: $da \(required\)
echo motifs: $mo \(required\)
echo workdir: $wo
echo fasta: $fa \(required\)
echo min: $mi
echo max: $ma
echo output: $ou
echo help: $he

if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
then
echo required parameters not given.
echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\>
exit 1
fi

if [ $wo == false ]
then
if [ ! -d workdir1337 ]
then
mkdir workdir1337
fi
wo=true
workdir="workdir1337"
fi

if [ $ou == false ]
then
output="newFootprints.bed"
ou=true
fi

if [ $mi == false ]
then
min=10
mi=true
fi

if [ $ma == false ]
then
max=60
ma=true
fi
#1. first filter. no overlap vs. overlap
bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed
bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed

#2. compute absolut maxscore position
Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed

#3. subtract overlapping regions for additional motifs
bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed

#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
Rscript --vanilla merge.R $min $max "$workdir"

#5. add fasta sequences to bed and create fasta file
bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta

#6 clean up
rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed
9 changes: 9 additions & 0 deletions bin/maxScore.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
args = commandArgs(TRUE)
file = args[1]

tab = read.table(file, header=FALSE)
colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
tab$maxpos = tab$start + tab$maxpos

write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
23 changes: 23 additions & 0 deletions bin/merge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
args=commandArgs(TRUE)
min=as.numeric(args[1])
max=as.numeric(args[2])
folder=args[3]
splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE)
colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE)
colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
p1$maxpos = p1$start + p1$maxpos

splitted=rbind(splitted, p1)

splitted=splitted[which(splitted$stop - splitted$start >= min),]
splitted=splitted[which(splitted$stop - splitted$start <= max),]
splitted$id=make.unique(as.character(splitted$id))
splitted$length=splitted$stop - splitted$start

splitted=cbind(splitted, containsMaxpos=0)
splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
splitted$maxpos = splitted$maxpos - splitted$start
write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')

41 changes: 30 additions & 11 deletions pipeline.nf
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
//!/usr/bin/env nextflow

Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan}
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
Channel.fromPath(params.config).set {config}

params.min_seq = 10

process footprint_extraction {

Expand All @@ -12,14 +15,15 @@ process footprint_extraction {
set name, file (bigWig) from bigwig_input

output:
set name, file ('*.bed') into bed_for_overlapp_with_TFBS
set name, file ('*.bed') into bed_for_overlap_with_TFBS

script:
"""
python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --threshold ${params.threshold_footprint_extraction}
"""
}


//Abfrage ob ausgeführt werden muss.
process extract_known_TFBS {

input:
Expand All @@ -35,16 +39,20 @@ process extract_known_TFBS {
}


bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap}


process overlap_with_known_TFBS {

input:
file (TFBS) from known_TFBS_for_overlap
set file (bed_footprints), file (bed_motifs), file (fasta) from for_overlap

output:
file ('*.FASTA') into FASTA_for_clustering

script:
"""
${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${bed_motifs} --fasta ${fasta} -o ${name_placeholder} -min ${params.min_size_fp} -max ${params.max_size_fp}
"""
}

Expand All @@ -53,7 +61,7 @@ process overlap_with_known_TFBS {
process clustering {

input:
file (faste) from FASTA_for_clustering
file (fasta) from FASTA_for_clustering

output:
set name, file ('*.bed') into bed_for_motif_esitmation
Expand All @@ -78,7 +86,7 @@ process bed_to_clustered_fasta {

script:
"""
Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name}
Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
"""
}

Expand All @@ -101,7 +109,7 @@ process glam2 {

script:
"""
glam2 n ${fasta} -O .
glam2 n ${fasta} -O . -a 10 -b 20 -z 5
"""
}

Expand All @@ -122,12 +130,12 @@ process tomtom {

script:
"""
tomtom ${meme} ${jaspar_db} -thresh 0.01 -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
tomtom ${meme} ${jaspar_db} -thresh 0.01 -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
"""
}


//Joining channels with meme and tsv files. Filter joined channel on linecount.
//Joining channels with meme and tsv files. Filter joined channel on line count.
//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
for_filter = meme_for_filter.join( tsv_for_filter )
for_filter
Expand All @@ -138,7 +146,7 @@ for_filter
.into { meme_for_scan; check }


//If unknown channel is empty print errormessage
//If channel 'check' is empty print errormessage
process check_for_unknown_motifs {
echo true

Expand All @@ -155,6 +163,7 @@ process check_for_unknown_motifs {
}


//Get the best(first) Motif from each MEME-file
process get_best_motif {

input:
Expand Down Expand Up @@ -212,15 +221,25 @@ process create_GTF {
}


process create_UROPA_config {
bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}


// Create configuration file for UROPA.
// Takes template and replaces bed- and gtf-placeholders with actual paths.
process create_uropa_config {

publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'

input:
set val(bed), val(gtf) from uropa_in.toList()
file (conf) from config

output:
file ('uropa.config') into uropa_config

script:
"""
sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final
"""
}

Expand Down