Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…2018 into dev
  • Loading branch information
renewiegandt committed Dec 10, 2018
2 parents e861fdc + 250ee92 commit b45f676
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 47 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

De novo motif discovery and evaluation based on footprints identified by TOBIAS

For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki)

## Dependencies
* [conda](https://conda.io/docs/user-guide/install/linux.html)
* [Nextflow](https://www.nextflow.io/)
Expand Down Expand Up @@ -64,6 +66,7 @@ Optional arguments:
--aprox_motif_len INT Motif length (Default: 10)
--motif_occurence FLOAT Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
--min_seq_length INT Remove all sequences below this value. (Default: 10)
Clustering:
--global INT Global (=1) or local (=0) alignment. (Default: 0)
--identity FLOAT Identity threshold. (Default: 0.8)
Expand All @@ -78,11 +81,15 @@ Optional arguments:
--interation INT Number of iterations done by glam2. More Interations: better results, higher runtime. (Default: 10000)
--tomtom_treshold float Threshold for similarity score. (Default: 0.01)
Moitf clustering:
--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 5)
--motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001)
Creating GTF:
--organism [homo_sapiens | mus_musculus]
--tissues
All arguments can be set in the configuration files.
All arguments can be set in the configuration files.
```


Expand Down
56 changes: 28 additions & 28 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
#!/usr/bin/env Rscript

# Splitting BED-files depending on their cluster.
# The Sequences of each cluster are writen as an FASTA-file.
# @parameter bedInput <string> BED-file with sequences and cluster-id as column"TEs
# @parameter prefix <string> prefix for filenames
# @parameter min_seq <INT> min. number of sequences per cluster

args = commandArgs(trailingOnly = TRUE)

bedInput <- args[1]
prefix <- args[2]
min_seq <- args[3]

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
print(nrow(clust))
if (nrow(clust) >= as.numeric(min_seq) ) {
sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
#!/usr/bin/env Rscript

# Splitting BED-files depending on their cluster.
# The Sequences of each cluster are writen as an FASTA-file.
# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
# @parameter prefix <string> prefix for filenames
# @parameter min_seq <INT> min. number of sequences per cluster

args = commandArgs(trailingOnly = TRUE)

bedInput <- args[1]
prefix <- args[2]
min_seq <- args[3]

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
print(nrow(clust))
if (nrow(clust) >= as.numeric(min_seq) ) {
sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
5 changes: 3 additions & 2 deletions bin/compareBed.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ if [ $he == true ]
then
echo "This script utilies bedtools to select new footprints from data."
echo "Therefore the data is compared with known footprint motifs."
echo "The output is a new .bed file with added sequence information."
echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)"
echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences."
echo "If a parameter is chosen, a value must be provided or an error will occur."
echo "--------------------"
Expand All @@ -110,7 +110,7 @@ then
echo " "
echo " optional parameter:"
echo " -w --workdir the path to directory where temporary files will be created"
echo " default is a new directory that is created in the current directory"
echo " default is the current directory"
echo " -min --min minimum size of footprints; default is 10"
echo " -max --max maximum size of footprints; default is 80"
echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta"
Expand Down Expand Up @@ -258,6 +258,7 @@ else
fi

#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition
# also creates a small output file with information about the comparison

Rscript --vanilla /bin/merge.R $min $max "$workdir" $data

Expand Down
9 changes: 6 additions & 3 deletions bin/get_best_motif.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,24 @@
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("meme", help="Path to meme file")
parser.add_argument("output", help="")
parser.add_argument("output", help="Output file")
parser.add_argument("num", help="Number of motifs parsed from file")
args = parser.parse_args()
return args


def main():
args = parse_arguments()
out = open(args.output, "w+")
number = int(args.num) + 1
motif = "MOTIF " + str(number)
with open(args.meme) as f:
for line in f:
if 'MOTIF 2' in line:
if motif in line:
break
out.write(line)


if __name__ == "__main__":
import argparse
main()
main()
57 changes: 57 additions & 0 deletions bin/merge_similar_clusters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env Rscript

# Merging FASTA-files, which motifs are similar.
#
# @parameter tsv_in <string> Path to TSV file generated by Tomtom.
# The input for Tomtom is a from all clusters merged meme-file.
# @parameter file_list <string> Numerically sorted whitespace separated list of absolute fasta-file paths
# @parameter min_weight <INT> Minimum weight of edge allowed in graph clusters.


args = commandArgs(trailingOnly = TRUE)

tsv_in <- args[1]
file_list <- args[2]
min_weight <- args[3]

files <- unlist(as.list(strsplit(file_list, ",")))

# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster.
tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character')
query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){
tail(l,n=1)
}))
target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){
tail(l,n=1)
}))

# create data.table with only the cluster-numbers
sim_not_unique <- data.table::data.table(query_cluster,target_cluster)
# convert from character to numeric values
sim_not_unique[, query_cluster := as.numeric(query_cluster)]
sim_not_unique[, target_cluster := as.numeric(target_cluster)]

# remove rows if column 1 is idential to column 2
edgelist <- sim_not_unique[query_cluster != target_cluster]

# create graph from data.frame
g <- igraph::graph_from_edgelist(as.matrix(edgelist))
# converting graph to adjacency matrix
adj_matrix <- igraph::get.adjacency(g, names = T)
# generating weighted graph from adjacency matrix
g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T)

# get subgraphs from graph with edges of weight > min_weight
s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F)
png('motif_clusters.png')
plot(s1)
dev.off()
clust <- igraph::clusters(s1)

# merge FASTA-files depending on the clustered graphs
a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){
cl <- as.vector(which(clust$membership %in% c(i)))
fasta_list <- paste(files[cl], collapse = " ")
name <- paste0("Cluster_",i,".fasta")
system(paste("cat",fasta_list,">",name))
})
5 changes: 5 additions & 0 deletions config/motif_estimation.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,9 @@ params {

//tomtom
tomtom_treshold = 0.01

//motif Clustering
edge_weight = 5
motif_similarity_thresh = 0.00001
best_motif = 3
}
Loading

0 comments on commit b45f676

Please sign in to comment.