Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
renewiegandt committed Dec 14, 2018
1 parent 748fc0b commit 7aaf90c
Showing 6 changed files with 66 additions and 36 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -86,6 +86,7 @@ Optional arguments:
--tomtom_treshold float Threshold for similarity score. (Default: 0.01)
Motif clustering:
--cluster_motif BOOLEAN IF its 1 motifs will be clustered (Default: 0)
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
--motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)
12 changes: 7 additions & 5 deletions bin/compareBed.sh
Original file line number Diff line number Diff line change
@@ -148,7 +148,7 @@ then
# mkdir workdir1337
#fi
wo=true
workdir="."
workdir=$path
fi

if [ $ou == false ]
@@ -223,7 +223,7 @@ echo calc maxScore
Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed

#3 subtract overlapping regions for additional motifs
echo "also get subsequences with no overlap of overlapping sequences"
#echo "also get subsequences with no overlap of overlapping sequences"
help=true

if [ -d $"motifs" ]
@@ -265,11 +265,13 @@ fi
#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
# also creates a small output file with information about the comparison

Rscript --vanilla $path/merge.R $min $max "$workdir" $data

Rscript --vanilla $path/merge.R $min $max $workdir $data

echo "------------------------------------------------------------"
echo $workdir
head -100 $workdir/merged.bed
#5. add fasta sequences to bed and create fasta file
bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$output"
bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output
bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta

#6 clean up
3 changes: 1 addition & 2 deletions bin/merge.R
Original file line number Diff line number Diff line change
@@ -22,7 +22,7 @@ splitted$length=splitted$stop - splitted$start
splitted=cbind(splitted, containsMaxpos=0)
splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1
splitted$maxpos = splitted$maxpos - splitted$start
fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')

before = fread(args[4], header=FALSE)

@@ -34,4 +34,3 @@ lengthb = formatC(mean(before$V3-before$V2), digits=4)
lengtha = formatC(mean(splitted$length), digits=4)
stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha)
write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t')

2 changes: 1 addition & 1 deletion config/cluster.config
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@ params{
global=0
identity=0.8
sequence_coverage=8
memory=800
memory=2000
throw_away_seq=9
strand=0
}
5 changes: 3 additions & 2 deletions config/motif_estimation.config
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
params {
//bed_to_clustered_fasta
min_seq = 10
min_seq = 50

//glam2
motif_min_len = 8
@@ -11,7 +11,8 @@ params {
tomtom_treshold = 0.01

//motif Clustering
edge_weight = 5
cluster_motif = 0
edge_weight = 50
motif_similarity_thresh = 0.00001
best_motif = 3
}
79 changes: 53 additions & 26 deletions pipeline.nf
Original file line number Diff line number Diff line change
@@ -34,7 +34,7 @@ params.tfbs_path=""

//motif_estimation
//bed_to_clustered_fasta
params.min_seq = 10 // Minimum number of sequences in the fasta-files for glam2
params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2

//glam2
params.motif_min_key = 8 // Minimum number of key positions (aligned columns)
@@ -45,7 +45,8 @@ params.tfbs_path=""
params.tomtom_treshold = 0.01 // threshold for similarity score.

//cluster motifs
params.edge_weight = 5 // Minimum weight of edges in motif-cluster-graph
params.cluster_motif = 0 // Boolean if 1 motifs are clustered else they are not
params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph
motif_similarity_thresh = 0.00001 // threshold for motif similarity score

params.best_motif = 3 // Top n motifs per cluster
@@ -129,8 +130,8 @@ process footprint_extraction {
conda "${path_env}"

tag{name}
publishDir "${out}/footprint_extraction/", mode: 'copy', pattern: '*.bed'
publishDir "${out}/footprint_extraction/log", mode: 'copy', pattern: '*.log'
publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed'
publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log'

input:
set name, file (bigWig), file (bed) from footprint_in
@@ -152,7 +153,7 @@ process extract_known_TFBS {

conda "${path_env}"

publishDir "${out}/known_TFBS/", mode: 'copy', pattern: '*.bed'
publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed'

input:
file (fasta) from fa_overlap
@@ -187,28 +188,30 @@ if(params.tfbs_path == "") {
*/
process overlap_with_known_TFBS {
conda "${path_env}"
echo true

publishDir "${out}/unknown_overlap/", mode: 'copy', pattern: '*.bed'
publishDir "${params.out}/unknown_overlap/"

input:
set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap

output:
set name, file ("${name}.bed") into bed_for_reducing
set name, file ("${name}.bed") into testetst
set name, file ("*.bed") into bed_for_reducing
//set name, file ("${name}_un.bed") into testetst

script:
motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"")
"""
${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin}
${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_un.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin}
"""
}
testetst.println()
//testetst.println()

/*
*/
process reduce_bed {
conda "${path_env}"
echo true

input:
set name, file (bed) from bed_for_reducing
@@ -227,8 +230,9 @@ process reduce_bed {
*/
process clustering {
conda "${path_env}"
echo true

publishDir "${out}/cluster/", mode: 'copy', pattern: '*.bed'
publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed'

input:
set name, file (bed) from bed_for_clustering
@@ -254,9 +258,6 @@ process bed_to_clustered_fasta {
input:
set name, file (bed) from bed_for_motif_esitmation

when:
params.fasta == false

output:
file ('*.FASTA') into fasta_for_glam2
file ('*.FASTA') into fasta_for_motif_cluster
@@ -280,16 +281,19 @@ Generating Motifs through alignment and scoring best local matches.
process glam2 {

tag{name}
publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy'

input:
set name, file (fasta) from fasta_for_glam2

output:
file("${name}/*.meme") into meme_to_merge
set name, file("${name}/*.meme") into meme_for_tomtom
set name, file("${name}/*.meme") into meme_for_filter

script:
"""
glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
"""
}

@@ -299,12 +303,17 @@ The paths are sorted numerically depending on the cluster number.
*/
process merge_meme {

publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy'

input:
val (memelist) from meme_to_merge.toList()

output:
file ('merged_meme.meme') into merged_meme

when:
params.cluster_motif == 1

script:
memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") }
meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer }
@@ -321,13 +330,16 @@ Output table has the information which clusters are similar to each other.
*/
process find_similar_motifs {


publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy'
input:
file (merged_meme) from merged_meme

output:
file ('all_to_all.tsv') into motif_similarity

when:
params.cluster_motif == 1

script:
"""
tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv
@@ -340,17 +352,21 @@ files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster)

process merge_fasta {
conda "${path_env}"
publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy'

input:
set file (motiv_sim), val (fasta_list) from files_for_merge_fasta

output:
file ('*.fasta') into motif_clustered_fasta_list
file('*.png')
file('*.png')

when:
params.cluster_motif == 1

script:
fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer }
fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"")
"""
Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight}
"""
@@ -362,7 +378,7 @@ motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten()

process clustered_glam2 {

publishDir "${out}/esimated_motifs/${name}/", mode: 'copy'
publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy'

input:
file (fasta) from motif_clustered_fasta_flat
@@ -372,13 +388,24 @@ process clustered_glam2 {
set name, file ('*.meme') into clustered_meme_for_filter
file('*')

when:
params.cluster_motif == 1

script:
name = fasta.getBaseName()
name = fasta.getBaseName()
"""
glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration}
"""
}

if(params.cluster_motif == 1){
for_tomtom = clustered_meme_for_tomtom
for_filter = clustered_meme_for_filter
} else {
for_tomtom = meme_for_tomtom
for_filter = meme_for_filter
}


/*
Running Tomtom on meme-files generated by GLAM2.
@@ -387,10 +414,10 @@ Tomtom searches motifs in databases.
process tomtom {

tag{name}
publishDir "${out}/esimated_motifs/tomtom/", mode: 'copy'
publishDir "${params.out}/final_esimated_motifs/tomtom/", mode: 'copy'

input:
set name, file (meme) from clustered_meme_for_tomtom
set name, file (meme) from for_tomtom

output:
set name, file ('*.tsv') into tsv_for_filter
@@ -404,8 +431,8 @@ process tomtom {

//Joining channels with meme and tsv files. Filter joined channel on line count.
//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel.
for_filter = clustered_meme_for_filter.join( tsv_for_filter )
for_filter
for_filter2 = for_filter.join( tsv_for_filter )
for_filter2
.filter { name, meme, tsv ->
long count = tsv.readLines().size()
count <= 1
@@ -434,7 +461,7 @@ process check_for_unknown_motifs {
process get_best_motif {
conda "${path_env}"

publishDir "${out}/esimated_motifs/unknown_motifs/", mode: 'copy'
publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy'

input:
set name, file(meme), file(tsv) from meme_for_scan

0 comments on commit 7aaf90c

Please sign in to comment.