-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from loosolab/motif_estiamtion
Motif estiamtion
- Loading branch information
Showing
5 changed files
with
145 additions
and
71 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,28 +1,58 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# Splitting BED-files depending on their cluster. | ||
# The Sequences of each cluster are writen as an FASTA-file. | ||
# @parameter bedInput <string> BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 | ||
# @parameter prefix <string> prefix for filenames | ||
# @parameter min_seq <INT> min. number of sequences per cluster | ||
|
||
args = commandArgs(trailingOnly = TRUE) | ||
|
||
bedInput <- args[1] | ||
prefix <- args[2] | ||
min_seq <- args[3] | ||
|
||
bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") | ||
|
||
clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column | ||
discard <- lapply(1:length(clusters), function(i){ | ||
clust <- as.data.frame(clusters[i]) | ||
print(nrow(clust)) | ||
if (nrow(clust) >= as.numeric(min_seq) ) { | ||
sequences <- as.list(clust[[10]]) # <---- sequenze column | ||
outfile <- paste0(prefix,"_cluster_",i,".FASTA") | ||
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column | ||
} else { | ||
print(paste0("Cluster: ",i," is to small")) | ||
} | ||
}) | ||
#!/usr/bin/env Rscript | ||
library("optparse") | ||
|
||
option_list <- list( | ||
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster_id.", metavar = "character"), | ||
make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), | ||
make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") | ||
) | ||
|
||
opt_parser <- OptionParser(option_list = option_list, | ||
description = "Convert BED-file to one FASTA-file per cluster") | ||
|
||
opt <- parse_args(opt_parser) | ||
|
||
#' Splitting BED-files depending on their cluster. | ||
#' The Sequences of each cluster are written as an FASTA-file. | ||
#' @param bedInput <string> BED-file with sequences and cluster-id as last two columns: | ||
#' Sequence: second last column; Cluster ID: last column | ||
#' @param prefix <string> prefix for filenames | ||
#' @param min_seq <INT> min. number of sequences per cluster | ||
#' | ||
#' @author René Wiegandt | ||
#' @contact rene.wiegandt(at)mpi-bn.mpg.de | ||
bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ | ||
|
||
if (is.null(bedInput)) { | ||
stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") | ||
} | ||
|
||
bed <- data.table::fread(bedInput, sep = "\t") | ||
|
||
# Get last column of data.table, which refers to the cluster, as a vector. | ||
cluster_no <- as.vector(bed[[ncol(bed)]]) | ||
|
||
# Split data.table bed on its last column (cluster_no) into list of data.frames | ||
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) | ||
|
||
# For each data.frame(cluster) in list clusters: | ||
discard <- lapply(1:length(clusters), function(i){ | ||
clust <- as.data.frame(clusters[i]) | ||
# Filter data.tables(clusters), which are to small | ||
if (nrow(clust) >= as.numeric(min_seq) ) { | ||
# Get second last column, which contains the nucleotide sequences | ||
sequences <- as.list(clust[[ncol(clust) - 1]]) | ||
# Create filename | ||
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") | ||
# Write fasta file | ||
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) | ||
} else { | ||
print(paste0("Cluster: ",i," is to small")) | ||
} | ||
}) | ||
} | ||
|
||
# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell) | ||
if (!interactive()) { | ||
bed_to_fasta(opt$input, opt$prefix, opt$min_seq) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,64 @@ | ||
# parses arguments using argparse | ||
# @return args list of all parameters | ||
''' | ||
parses arguments using argparse | ||
@return args list of all parameters | ||
''' | ||
def parse_arguments(): | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("meme", help="Path to meme file") | ||
parser = argparse.ArgumentParser(description='A script to convert from GLAM2 output to MEME-format and parsing only the [num] first motifs from file to the output.') | ||
parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2") | ||
parser.add_argument("output", help="Output file") | ||
parser.add_argument("num", help="Number of motifs parsed from file") | ||
args = parser.parse_args() | ||
return args | ||
|
||
# write lines of file till certain line (MOTIF + [num]) | ||
''' | ||
The script has two functions: | ||
1. Writing lines of file till certain line (MOTIF + [num]) | ||
2. Converting GLAM2 output to minimal meme-format | ||
@params meme STRING Path to 'meme' file generated from Meme suite | ||
@parmas output STRING Output file | ||
@params num INT Number of motifs parsed from file | ||
@author René Wiegandt | ||
@contact rene.wiegandt(at)mpi-bn.mpg.de | ||
''' | ||
def main(): | ||
|
||
args = parse_arguments() | ||
out = open(args.output, "w+") | ||
|
||
''' | ||
Create pattern where script should stop writing | ||
For Example: | ||
If num == 3, which means that you want the first/best 3 Motifs, the script | ||
should stop writing lines to output if loop reaches line 'MOTIF 4' | ||
''' | ||
number = int(args.num) + 1 | ||
motif = "MOTIF " + str(number) | ||
break_header = "MOTIF " + str(number) | ||
|
||
# Pattern for motif header | ||
pattern = re.compile("^MOTIF\s{2}(\d)+") | ||
# Init count | ||
count = 0 | ||
|
||
with open(args.meme) as f: | ||
for line in f: | ||
if motif in line: | ||
## do not write [count] lines after each header -> needed for meme-format | ||
if count > 0: | ||
count-=1 | ||
continue | ||
if pattern.match(line): | ||
# if line is a motif header | ||
count = 2 | ||
## | ||
|
||
if break_header in line: | ||
# line matches breaking_header, e.g. 'MOTIF 4' | ||
break | ||
out.write(line) | ||
else: | ||
out.write(line) | ||
|
||
|
||
if __name__ == "__main__": | ||
import argparse | ||
import re | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters