Skip to content

Motif estiamtion #12

Merged
merged 14 commits into from
Dec 20, 2018
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 52 additions & 22 deletions bin/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,28 +1,58 @@
#!/usr/bin/env Rscript
library("optparse")

# Splitting BED-files depending on their cluster.
# The Sequences of each cluster are writen as an FASTA-file.
# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8
# @parameter prefix <string> prefix for filenames
# @parameter min_seq <INT> min. number of sequences per cluster
option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"),
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer")
)

args = commandArgs(trailingOnly = TRUE)
opt_parser <- OptionParser(option_list = option_list,
description = "Convert BED-file to one FASTA-file per cluster")

bedInput <- args[1]
prefix <- args[2]
min_seq <- args[3]
opt <- parse_args(opt_parser)

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
print(nrow(clust))
if (nrow(clust) >= as.numeric(min_seq) ) {
sequences <- as.list(clust[[10]]) # <---- sequenze column
outfile <- paste0(prefix,"_cluster_",i,".FASTA")
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column
} else {
print(paste0("Cluster: ",i," is to small"))
#' Splitting BED-files depending on their cluster.
#' The Sequences of each cluster are writen as an FASTA-file.
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns:
#' Sequence: second last column; Cluster ID: last column
#' @param prefix <string> prefix for filenames
#' @param min_seq <INT> min. number of sequences per cluster
#'
#' @author René Wiegandt
#' @contact rene.wiegandt(at)mpi-bn.mpg.de
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){

if(is.null(bedInput)){
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.")
}
})

bed <- data.table::fread(bedInput, header = FALSE, sep = "\t")

# Get last column of data.table, which refers to the cluster, as a vector.
cluster_no <- as.vector(bed[[ncol(bed)]])

# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# For each data.frame(cluster) in list clusters:
discard <- lapply(1:length(clusters), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
# Get second last column, which contains the nucleotide sequences
sequences <- as.list(clust[[ncol(clust) - 1]])
# Create filename
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
}
})
}

# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
bed_to_fasta(opt$input, opt$prefix, opt$min_seq)
}
54 changes: 46 additions & 8 deletions bin/get_best_motif.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,64 @@
# parses arguments using argparse
# @return args list of all parameters
'''
parses arguments using argparse
@return args list of all parameters
'''
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("meme", help="Path to meme file")
parser = argparse.ArgumentParser(description='A script to convert from GLAM2 output to MEME-format and parsing only the [num] first motifs from file to the output.')
parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
parser.add_argument("output", help="Output file")
parser.add_argument("num", help="Number of motifs parsed from file")
args = parser.parse_args()
return args

# write lines of file till certain line (MOTIF + [num])
'''
The script has to functions:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

two

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed: 8389226

1. Writing lines of file till certain line (MOTIF + [num])
2. Converting GLAM2 output to minimal meme-format
@params meme STING Path to 'meme' file generated from Meme suite
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

STRING

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed: 8389226

@parmas output STING Output file
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

STRING

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed: 8389226

@params num INT Number of motifs parsed from file

@author René Wiegandt
@contact rene.wiegandt(at)mpi-bn.mpg.de
'''
def main():

args = parse_arguments()
out = open(args.output, "w+")

'''
Create pattern where script should stop writing
For Example:
If num == 3, which means that you want the first/best 3 Motifs, the script
should stop writing lines to output if loop reaches line 'MOTIF 4'
'''
number = int(args.num) + 1
motif = "MOTIF " + str(number)
break_header = "MOTIF " + str(number)

# Pattern for motif header
pattern = re.compile("^MOTIF\s{2}(\d)+")
# Init count
count = 0

with open(args.meme) as f:
for line in f:
if motif in line:
## do not write [count] lines after each header -> needed for meme-format
if count > 0:
count-=1
continue
if pattern.match(line):
# if line is a motif header
count = 2
##

if break_header in line:
# line matches breaking_header, e.g. 'MOTIF 4'
break
out.write(line)
else:
out.write(line)


if __name__ == "__main__":
import argparse
import re
main()
2 changes: 1 addition & 1 deletion pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ process bed_to_clustered_fasta {

script:
"""
Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq}
Rscript ${path_bin}/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq}
"""
}

Expand Down