Skip to content

Commit

Permalink
Merge pull request #47 from loosolab/estimation_motifs
Browse files Browse the repository at this point in the history
2.2 creates json file containing motif sequences
  • Loading branch information
HendrikSchultheis authored Jan 10, 2019
2 parents 13a610f + b70a38b commit 428302a
Show file tree
Hide file tree
Showing 6 changed files with 213 additions and 44 deletions.
20 changes: 17 additions & 3 deletions bin/2.2_motif_estimation/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"),
Expand Down Expand Up @@ -36,8 +36,11 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# Get number of all clusters
num_clusters <- length(clusters)

# For each data.frame(cluster) in list clusters:
discard <- lapply(seq_len(length(clusters)), function(i){
discard <- vapply(seq_len(length(clusters)), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
Expand All @@ -47,10 +50,21 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
return(TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
return(FALSE)
}
})
}, FUN.VALUE=logical(1))

# Get number of clusters which contain enought sequences
count_clust <- length(discard[discard == FALSE])

# Write log-file
write(paste0( "------------------------------------------------------------\nfile: ",
bedInput,"\nNumber of all clusters: ", num_clusters, "\nRemoved small clusters ( < ",
min_seq," sequences ): " ,count_clust , "\nRemaining number of clusters: ", num_clusters - count_clust),
file = "bed_to_fasta.log", append = T)
}

# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
Expand Down
27 changes: 17 additions & 10 deletions bin/2.2_motif_estimation/get_best_motif.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python

'''
parses arguments using argparse
@return args list of all parameters
Expand All @@ -7,6 +9,7 @@ def parse_arguments():
parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
parser.add_argument("output", help="Output file")
parser.add_argument("num", help="Number of motifs parsed from file")
parser.add_argument("id", help="Cluster ID")
args = parser.parse_args()
return args

Expand Down Expand Up @@ -35,27 +38,31 @@ def main():
number = int(args.num) + 1
break_header = "MOTIF " + str(number)

# Get cluster_id
cluster_id = args.id

# Pattern for motif header
pattern = re.compile("^MOTIF\s{2}(\d)+")
# Init count
count = 0

with open(args.meme) as f:
for line in f:
## do not write [count] lines after each header -> needed for meme-format
if count > 0:
count-=1
continue
if pattern.match(line):
# if line is a motif header
count = 2
##

if break_header in line:
# line matches breaking_header, e.g. 'MOTIF 4'
break
else:
out.write(line)
## do not write [count] lines after each header -> needed for meme-format
if count > 0:
count-=1
continue
if pattern.match(line):
# if line is a motif header
count = 2
out.write(line.strip('\n').replace(" "," ") + " Cluster_" + cluster_id + '\n')
##
else:
out.write(line)


if __name__ == "__main__":
Expand Down
85 changes: 85 additions & 0 deletions bin/2.2_motif_estimation/get_motif_seq.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env Rscript
if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"),
make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric"),
make_option(opt_str = c("-c", "--cluster_id"), default = "./" , help = "Cluster ID", metavar = "numeric"),
make_option(opt_str = c("-t", "--tmp"), default = "./" , help = "Path for tmp-files. Default = '%default'", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Creating JSON-file with sequence ids which were used to create the best (num) motifs.",
epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

#' Reading files with fread.
#' Only read the first column.
#' @param path Path to file
#' @return first column as vector
read_data <- function(path){

f <- data.table::fread(path, select = 1)
return(f[[1]])
}


#' Creating JSON-file with sequence ids which were used to create the best (num) motifs.
#'
#' @param input Input file.Output txt-file from GLAM2.
#' @param output Output JSON-file
#' @param num Get best (num) motifs.
#'
#' @author René Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
create_seq_json <- function(input, output, num, tmp_path, cluster_id) {

if (!file.exists(input)) {
stop(paste0("Input file does not exists. Please check the given path: ", input))
}

if ( !is.numeric(num)) {
stop("Parameter num needs to be an integer")
}

if (num > 10 || num <= 0 ) {
stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num))
}

if ( !varhandle::check.numeric(cluster_id)) {
stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt"))
}

dir.create(tmp_path, showWarnings = FALSE)

file_dir <- tmp_path

# Split glam.txt file on lines that start with Score:
system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder"))
# Only keep the lines that start with 'f' to get the lines with the sequence ids
system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done"))

# Getting the filepaths of first 3 files with sequence ids
fnames <- file.path(file_dir,dir(file_dir, pattern = "done"))

# Running read_data on files
datalist <- lapply(fnames, read_data)

# Create json file
## naming
names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
## creating json object
json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
## writing file
write(json, file = output )
}

# run function create_seq_json with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
print_help(opt_parser)
} else {
create_seq_json(opt$input, opt$output, opt$num, opt$tmp, opt$cluster_id)
}
}
1 change: 1 addition & 0 deletions bin/2.2_motif_estimation/motif_estimation.nf
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env nextflow

params.bed = ""
params.out = ""
Expand Down
2 changes: 2 additions & 0 deletions masterenv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ dependencies:
- r-stringi
- r-stringr
- r-optparse
- r-RJSONIO
- r-varhandle
- bioconductor-iranges
- moods
- biopython
Expand Down
Loading

0 comments on commit 428302a

Please sign in to comment.