Skip to content

Motif estiamtion #12

Merged
merged 14 commits into from
Dec 20, 2018
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
74 changes: 52 additions & 22 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,28 +1,58 @@
#!/usr/bin/env Rscript
library("optparse")

# Splitting BED-files depending on their cluster.
# The Sequences of each cluster are writen as an FASTA-file.
# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
# @parameter prefix <string> prefix for filenames
# @parameter min_seq <INT> min. number of sequences per cluster
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)

args = commandArgs(trailingOnly = TRUE)
opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster")

bedInput <- args[1]
prefix <- args[2]
min_seq <- args[3]
opt <- parse_args(opt_parser)

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
print(nrow(clust))
if (nrow(clust) >= as.numeric(min_seq) ) {
sequences <- as.list(clust[[10]]) # <---- sequenze column
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
} else {
print(paste0("Cluster: ",i," is to small"))
#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){

if(is.null(bedInput)){
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
}
})

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])

# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
# Get second last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
}

# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
}
54 changes: 46 additions & 8 deletions bin/get_best_motif.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,64 @@
# parses arguments using argparse
# @return args list of all parameters
'''
parses arguments using argparse
@return args list of all parameters
'''
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("meme", help="Path to meme file")
parser = argparse.ArgumentParser(description='A script to convert from GLAM2 output to MEME-format and parsing only the [num] first motifs from file to the output.')
parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
parser.add_argument("output", help="Output file")
parser.add_argument("num", help="Number of motifs parsed from file")
args = parser.parse_args()
return args

# write lines of file till certain line (MOTIF + [num])
'''
The script has to functions:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

two

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed: 8389226

1. Writing lines of file till certain line (MOTIF + [num])
2. Converting GLAM2 output to minimal meme-format
@params meme STING Path to 'meme' file generated from Meme suite
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

STRING

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed: 8389226

@parmas output STING Output file
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

STRING

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed: 8389226

@params num INT Number of motifs parsed from file
@author René Wiegandt
@contact rene.wiegandt(at)mpi-bn.mpg.de
'''
def main():

args = parse_arguments()
out = open(args.output, "w+")

'''
Create pattern where script should stop writing
For Example:
If num == 3, which means that you want the first/best 3 Motifs, the script
should stop writing lines to output if loop reaches line 'MOTIF 4'
'''
number = int(args.num) + 1
motif = "MOTIF " + str(number)
break_header = "MOTIF " + str(number)

# Pattern for motif header
pattern = re.compile("^MOTIF\s{2}(\d)+")
# Init count
count = 0

with open(args.meme) as f:
for line in f:
if motif in line:
## do not write [count] lines after each header -> needed for meme-format
if count > 0:
count-=1
continue
if pattern.match(line):
# if line is a motif header
count = 2
##

if break_header in line:
# line matches breaking_header, e.g. 'MOTIF 4'
break
out.write(line)
else:
out.write(line)


if __name__ == "__main__":
import argparse
import re
main()
2 changes: 1 addition & 1 deletion pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ process bed_to_clustered_fasta {

script:
"""
Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
Rscript ${path_bin}/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq}
"""
}

Expand Down