Merge branch 'dev' of https://github.molgen.mpg.de/loosolab/masterJLU…

…2018 into dev
loosolab · Dec 10, 2018 · b45f676 · b45f676
2 parents e861fdc + 250ee92
commit b45f676
Show file tree

Hide file tree

Showing 7 changed files with 214 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,8 @@
 
 De novo motif discovery and evaluation based on footprints identified by TOBIAS
 
+For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki)
+
 ## Dependencies
 * [conda](https://conda.io/docs/user-guide/install/linux.html)
 * [Nextflow](https://www.nextflow.io/)
@@ -64,6 +66,7 @@ Optional arguments:
 	--aprox_motif_len INT Motif length (Default: 10)
 	--motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
 	--min_seq_length INT Remove all sequences below this value. (Default: 10)
+
 	Clustering:
 	--global INT Global (=1) or local (=0) alignment. (Default: 0)
 	--identity FLOAT Identity threshold. (Default: 0.8)
@@ -78,11 +81,15 @@ Optional arguments:
 	--interation INT	Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000)
 	--tomtom_treshold float	Threshold for similarity score. (Default: 0.01)
 
+	Moitf clustering:
+	--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
+	--motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)
+
 	Creating GTF:
 	--organism [homo_sapiens | mus_musculus]
 	--tissues
-  
- All arguments can be set in the configuration files.
+
+All arguments can be set in the configuration files.
  ```
 
 

diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R
@@ -1,28 +1,28 @@
-#!/usr/bin/env Rscript
-
-# Splitting BED-files depending on their cluster.
-# The Sequences of each cluster are writen as an FASTA-file.
-# @parameter bedInput <string> BED-file with sequences and cluster-id as column"TEs
-# @parameter prefix <string> prefix for filenames
-# @parameter min_seq <INT> min. number of sequences per cluster
-
-args = commandArgs(trailingOnly = TRUE)
-
-bedInput <- args[1]
-prefix <- args[2] 
-min_seq <- args[3] 
-
-bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
-
-clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
-discard <- lapply(1:length(clusters), function(i){
-  clust <- as.data.frame(clusters[i])
-  print(nrow(clust))
-  if (nrow(clust) >= as.numeric(min_seq) ) {
-    sequences <- as.list(clust[[7]])   # <---- Splate mit Sequenz
-    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
-    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
-  } else {
-    print(paste0("Cluster: ",i," is to small"))
-  }
-})
+#!/usr/bin/env Rscript
+
+# Splitting BED-files depending on their cluster.
+# The Sequences of each cluster are writen as an FASTA-file.
+# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
+# @parameter prefix <string> prefix for filenames
+# @parameter min_seq <INT> min. number of sequences per cluster
+
+args = commandArgs(trailingOnly = TRUE)
+
+bedInput <- args[1]
+prefix <- args[2] 
+min_seq <- args[3] 
+
+bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")
+
+clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
+discard <- lapply(1:length(clusters), function(i){
+  clust <- as.data.frame(clusters[i])
+  print(nrow(clust))
+  if (nrow(clust) >= as.numeric(min_seq) ) {
+    sequences <- as.list(clust[[7]])   # <---- Splate mit Sequenz
+    outfile <- paste0(prefix,"_cluster_",i,".FASTA")
+    seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
+  } else {
+    print(paste0("Cluster: ",i," is to small"))
+  }
+})
diff --git a/bin/compareBed.sh b/bin/compareBed.sh
@@ -96,7 +96,7 @@ if [ $he == true ]
 then
 	echo "This script utilies bedtools to select new footprints from data."
 	echo "Therefore the data is compared with known footprint motifs."
-	echo "The output is a new .bed file with added sequence information."
+	echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)"
 	echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
 	echo "If a parameter is chosen, a value must be provided or an error will occur."
 	echo "--------------------"
@@ -110,7 +110,7 @@ then
 	echo "              "
 	echo "              optional parameter:"
 	echo "              -w   --workdir      the path to directory where temporary files will be created"
-	echo "                                  default is a new directory that is created in the current directory"
+	echo "                                  default is the current directory"
 	echo "              -min --min          minimum size of footprints; default is 10"
 	echo "              -max --max          maximum size of footprints; default is 80"
 	echo "              -o   --output       output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta"
@@ -258,6 +258,7 @@ else
 fi
 
 #4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition 
+# also creates a small output file with information about the comparison
 
 Rscript --vanilla /bin/merge.R $min $max "$workdir" $data
 

diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py
@@ -3,21 +3,24 @@
 def parse_arguments():
     parser = argparse.ArgumentParser()
     parser.add_argument("meme", help="Path to meme file")
-    parser.add_argument("output", help="")
+    parser.add_argument("output", help="Output file")
+    parser.add_argument("num", help="Number of motifs parsed from file")
     args = parser.parse_args()
     return args
 
 
 def main():
     args = parse_arguments()
     out = open(args.output, "w+")
+    number = int(args.num) + 1
+    motif = "MOTIF " + str(number)
     with open(args.meme) as f:
         for line in f:
-            if 'MOTIF  2' in line:
+            if motif in line:
                 break
             out.write(line)
 
 
 if __name__ == "__main__":
     import argparse
-    main()
+    main()
diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R
@@ -0,0 +1,57 @@
+#!/usr/bin/env Rscript
+
+# Merging FASTA-files, which motifs are similar.
+# 
+# @parameter tsv_in <string> Path to TSV file generated by Tomtom.
+#                            The input for Tomtom is a from all clusters merged meme-file.
+# @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
+# @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.
+
+
+args = commandArgs(trailingOnly = TRUE)
+
+tsv_in <- args[1] 
+file_list <- args[2]
+min_weight <- args[3]
+
+files <- unlist(as.list(strsplit(file_list, ",")))
+
+# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
+tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
+query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
+  tail(l,n=1)
+}))
+target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
+  tail(l,n=1)
+}))
+
+# create data.table with only the cluster-numbers
+sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
+# convert from character to numeric values
+sim_not_unique[, query_cluster := as.numeric(query_cluster)]
+sim_not_unique[, target_cluster := as.numeric(target_cluster)]
+
+# remove rows if column 1 is idential to column 2
+edgelist <- sim_not_unique[query_cluster != target_cluster]
+
+# create graph from data.frame
+g <- igraph::graph_from_edgelist(as.matrix(edgelist))
+# converting graph to adjacency matrix
+adj_matrix <- igraph::get.adjacency(g, names = T)
+# generating weighted graph from adjacency matrix 
+g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)
+
+# get subgraphs from graph with edges of weight > min_weight
+s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
+png('motif_clusters.png')
+plot(s1)
+dev.off()
+clust <- igraph::clusters(s1)
+
+# merge FASTA-files depending on the clustered graphs
+a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
+  cl <- as.vector(which(clust$membership %in% c(i)))
+  fasta_list <- paste(files[cl], collapse = " ")
+  name <- paste0("Cluster_",i,".fasta")
+  system(paste("cat",fasta_list,">",name))
+})
diff --git a/config/motif_estimation.config b/config/motif_estimation.config
@@ -9,4 +9,9 @@ params {
 
 	//tomtom
 	tomtom_treshold = 0.01
+
+	//motif Clustering
+	edge_weight = 5
+	motif_similarity_thresh = 0.00001
+	best_motif = 3
 }