Skip to content

Commit

Permalink
Merge pull request #1 from loosolab/motif_estiamtion
Browse files Browse the repository at this point in the history
Motif estiamtion
  • Loading branch information
renewiegandt authored Nov 29, 2018
2 parents 5a7b661 + 420cb1f commit 4b32142
Show file tree
Hide file tree
Showing 6 changed files with 274 additions and 17 deletions.
19 changes: 13 additions & 6 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,25 @@
# The Sequences of each cluster are writen as an FASTA-file.
# @parameter bedInput <string> BED-file with sequences and cluster-id as column"TEs
# @parameter prefix <string> prefix for filenames
# @parameter min_seq <INT> min. number of sequences per cluster

args = commandArgs(trailingOnly = TRUE)

bedInput <- args[1]
prefix <- args[2] # "Fasta"
prefix <- args[2]
min_seq <- args[3]

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

clusters <- split(bed, bed$V3, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
discard <- lapply(1:length(clusters), function(i){
sequences <- as.list(as.data.frame(clusters[i])[[2]]) # <---- Splate mit Sequenz
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = as.data.frame(clusters[i])[[1]]
, file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
clust <- as.data.frame(clusters[i])
print(nrow(clust))
if (nrow(clust) >= as.numeric(min_seq) ) {
sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
183 changes: 183 additions & 0 deletions bin/compareBed.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#!/bin/bash
#data
#motifs
#workdir
#fasta
#min
#max
#output
wrong=()
da=false
mo=false
wo=false
fa=false
mi=false
ma=false
ou=false
he=false

if [ $# -eq 0 ]
then
he=true
fi

while [[ $# -gt 0 ]]
do
key="$1"

case $key in
-d|--data)
data="$2"
da=true
shift
shift
;;
-m|--motifs)
motifs="$2"
mo=true
shift
shift
;;
-w|--workdir)
workdir="$2"
wo=true
shift
shift
;;
-f|--fasta)
fasta="$2"
fa=true
shift
shift
;;
-min|--min)
min=$2
mi=true
shift
shift
;;
-max|--max)
max=$2
ma=true
shift
shift
;;
-o|--output)
output="$2"
ou=true
shift
shift
;;
-h|--help)
help=true
he=true
shift
;;
*)
wrong+=("$1")
shift
;;
esac
done

count=${#wrong[@]}
if [ $count -gt 0 ]
then
for i in ${wrong[@]}
do
echo wrong parameter $i
echo call script without parameters for help or call --help
echo exit
done
exit 1
fi

if [ $he == true ]
then
echo "This script utilies bedtools to select new footprints from data."
echo "Therefore the data is compared with known footprint motifs."
echo "The output is a new .bed file with added sequence information."
echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
echo "If a parameter is chosen, a value must be provided or an error will occur."
echo "--------------------"
echo "usage: compareBed.sh --data \<path/to/file.bed\> --motifs \<path/to/known/motifs.bed\> --fasta \<path/to/genome.fasta\> \[optional_parameter \<value\>\] ..."
echo "--------------------"
echo " required parameter:"
echo " -d --data the path to the .bed file of the footprints"
echo " -m --motifs the path to the .bed file of the scanned motifs"
echo " -f --fasta the path to the .fasta file of genome"
echo " "
echo " optional parameter:"
echo " -w --workdir the path to directory where temporary files will be created"
echo " default is a new directory that is created in the current directory"
echo " -min --min minimum size of footprints\; default is 10"
echo " -max --max maximum size of footprints\; default is 60"
echo " -o --output output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta"
echo " -h --help shows this help message"
exit 0
fi

echo selected parameters
echo -------------------
echo data: $da \(required\)
echo motifs: $mo \(required\)
echo workdir: $wo
echo fasta: $fa \(required\)
echo min: $mi
echo max: $ma
echo output: $ou
echo help: $he

if [ $da == false ] || [ $mo == false ] || [ $fa == false ]
then
echo required parameters not given.
echo required are: --data \<path/data.bed\> --motifs \<path/motifs.bed\> --fasta \<path/file.fasta\>
exit 1
fi

if [ $wo == false ]
then
if [ ! -d workdir1337 ]
then
mkdir workdir1337
fi
wo=true
workdir="workdir1337"
fi

if [ $ou == false ]
then
output="newFootprints.bed"
ou=true
fi

if [ $mi == false ]
then
min=10
mi=true
fi

if [ $ma == false ]
then
max=60
ma=true
fi
#1. first filter. no overlap vs. overlap
bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed
bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed

#2. compute absolut maxscore position
Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed

#3. subtract overlapping regions for additional motifs
bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed

#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
Rscript --vanilla merge.R $min $max "$workdir"

#5. add fasta sequences to bed and create fasta file
bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output"
bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta

#6 clean up
rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed
9 changes: 9 additions & 0 deletions bin/maxScore.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
args = commandArgs(TRUE)
file = args[1]

tab = read.table(file, header=FALSE)
colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
tab$maxpos = tab$start + tab$maxpos

write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
23 changes: 23 additions & 0 deletions bin/merge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/home/jhamp/.conda/envs/tfbs/bin/Rscript
args=commandArgs(TRUE)
min=as.numeric(args[1])
max=as.numeric(args[2])
folder=args[3]
splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE)
colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE)
colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length")
p1$maxpos = p1$start + p1$maxpos

splitted=rbind(splitted, p1)

splitted=splitted[which(splitted$stop - splitted$start >= min),]
splitted=splitted[which(splitted$stop - splitted$start <= max),]
splitted$id=make.unique(as.character(splitted$id))
splitted$length=splitted$stop - splitted$start

splitted=cbind(splitted, containsMaxpos=0)
splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
splitted$maxpos = splitted$maxpos - splitted$start
write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')

41 changes: 30 additions & 11 deletions pipeline.nf
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
//!/usr/bin/env nextflow

Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input}
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan}
Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom}
Channel.fromPath(params.config).set {config}

params.min_seq = 10

process footprint_extraction {

Expand All @@ -12,14 +15,15 @@ process footprint_extraction {
set name, file (bigWig) from bigwig_input

output:
set name, file ('*.bed') into bed_for_overlapp_with_TFBS
set name, file ('*.bed') into bed_for_overlap_with_TFBS

script:
"""
python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --threshold ${params.threshold_footprint_extraction}
"""
}


//Abfrage ob ausgeführt werden muss.
process extract_known_TFBS {

input:
Expand All @@ -35,16 +39,20 @@ process extract_known_TFBS {
}


bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap}


process overlap_with_known_TFBS {

input:
file (TFBS) from known_TFBS_for_overlap
set file (bed_footprints), file (bed_motifs), file (fasta) from for_overlap

output:
file ('*.FASTA') into FASTA_for_clustering

script:
"""
${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${bed_motifs} --fasta ${fasta} -o ${name_placeholder} -min ${params.min_size_fp} -max ${params.max_size_fp}
"""
}

Expand All @@ -53,7 +61,7 @@ process overlap_with_known_TFBS {
process clustering {

input:
file (faste) from FASTA_for_clustering
file (fasta) from FASTA_for_clustering

output:
set name, file ('*.bed') into bed_for_motif_esitmation
Expand All @@ -78,7 +86,7 @@ process bed_to_clustered_fasta {

script:
"""
Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name}
Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
"""
}

Expand All @@ -101,7 +109,7 @@ process glam2 {

script:
"""
glam2 n ${fasta} -O .
glam2 n ${fasta} -O . -a 10 -b 20 -z 5
"""
}

Expand All @@ -122,12 +130,12 @@ process tomtom {

script:
"""
tomtom ${meme} ${jaspar_db} -thresh 0.01 -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
tomtom ${meme} ${jaspar_db} -thresh 0.01 -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv
"""
}


//Joining channels with meme and tsv files. Filter joined channel on linecount.
//Joining channels with meme and tsv files. Filter joined channel on line count.
//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
for_filter = meme_for_filter.join( tsv_for_filter )
for_filter
Expand All @@ -138,7 +146,7 @@ for_filter
.into { meme_for_scan; check }


//If unknown channel is empty print errormessage
//If channel 'check' is empty print errormessage
process check_for_unknown_motifs {
echo true

Expand All @@ -155,6 +163,7 @@ process check_for_unknown_motifs {
}


//Get the best(first) Motif from each MEME-file
process get_best_motif {

input:
Expand Down Expand Up @@ -212,15 +221,25 @@ process create_GTF {
}


process create_UROPA_config {
bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}


// Create configuration file for UROPA.
// Takes template and replaces bed- and gtf-placeholders with actual paths.
process create_uropa_config {

publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy'

input:
set val(bed), val(gtf) from uropa_in.toList()
file (conf) from config

output:
file ('uropa.config') into uropa_config

script:
"""
sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final
"""
}

Expand Down
Loading

0 comments on commit 4b32142

Please sign in to comment.