Skip to content

2.2 creates json file containing motif sequences #47

Merged
merged 20 commits into from
Jan 10, 2019
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
5b8bb7a
Merge branch 'dev' into estimation_motifs
renewiegandt Jan 8, 2019
fe5ab42
get_best_motif.py: Added alternative name to best_motif file
renewiegandt Jan 8, 2019
79b7e24
get_best_motif.py: removed whitespace from motif header
renewiegandt Jan 8, 2019
8377340
added script get_motif_seq.R
renewiegandt Jan 8, 2019
3f9e4bc
pipeline.nf: implemented get_motif_seq.R
renewiegandt Jan 8, 2019
49ec389
added r packages: RJSONIO, varhandle to masterenv.yml
renewiegandt Jan 8, 2019
98ab76a
Merge branch 'dev' into estimation_motifs
renewiegandt Jan 8, 2019
d979e16
get_best_motif.py: added parameter cluster id
renewiegandt Jan 9, 2019
ea2e171
get_motif_seq.R: added parameter tmp_path and cluster_id
renewiegandt Jan 9, 2019
8e5049e
pipeline.nf: adjusting to new parameters required by get_best_motif;…
renewiegandt Jan 9, 2019
1439045
Added log file for part 2.2_motif_estimation
renewiegandt Jan 9, 2019
cff01fc
Merge branch 'dev' into estimation_motifs
renewiegandt Jan 9, 2019
0f4bf46
added missing shebangs to 2.2 scripts #44
renewiegandt Jan 9, 2019
c0a462b
pipeline.nf: adjusting to new parameter required by 2.1 scripts
renewiegandt Jan 9, 2019
e62bb36
pipeline.nf: fixed typos
renewiegandt Jan 10, 2019
f3b3db4
get_best_motif.py: Removed additional whitespace in motif header
renewiegandt Jan 10, 2019
2d2f8cb
pipeline.nf: removed code for debuging
renewiegandt Jan 10, 2019
43c55ae
bed_to_fasta.R: changes lapply to vapply
renewiegandt Jan 10, 2019
310702e
Removed import os from get_best_motif.py
renewiegandt Jan 10, 2019
b70a38b
Removed newline from get_motif_seq
renewiegandt Jan 10, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 17 additions & 3 deletions bin/2.2_motif_estimation/bed_to_fasta.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env Rscript
if (!require(optparse)) install.packages("optparse"); library(optparse)
if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"),
Expand Down Expand Up @@ -36,8 +36,11 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
# Split data.table bed on its last column (cluster_no) into list of data.frames
clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE)

# Get number of all clusters
num_clusters <- length(clusters)

# For each data.frame(cluster) in list clusters:
discard <- lapply(seq_len(length(clusters)), function(i){
discard <- vapply(seq_len(length(clusters)), function(i){
clust <- as.data.frame(clusters[i])
# Filter data.tables(clusters), which are to small
if (nrow(clust) >= as.numeric(min_seq) ) {
Expand All @@ -47,10 +50,21 @@ bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){
outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA")
# Write fasta file
seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE)
return(TRUE)
} else {
print(paste0("Cluster: ",i," is to small"))
return(FALSE)
}
})
}, FUN.VALUE=logical(1))

# Get number of clusters which contain enought sequences
count_clust <- length(discard[discard == FALSE])

# Write log-file
write(paste0( "------------------------------------------------------------\nfile: ",
bedInput,"\nNumber of all clusters: ", num_clusters, "\nRemoved small clusters ( < ",
min_seq," sequences ): " ,count_clust , "\nRemaining number of clusters: ", num_clusters - count_clust),
file = "bed_to_fasta.log", append = T)
}

# run function bed_to_fasta with given parameteres if not in interactive context (e.g. run from shell)
Expand Down
28 changes: 18 additions & 10 deletions bin/2.2_motif_estimation/get_best_motif.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python

'''
parses arguments using argparse
@return args list of all parameters
Expand All @@ -7,6 +9,7 @@ def parse_arguments():
parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2")
parser.add_argument("output", help="Output file")
parser.add_argument("num", help="Number of motifs parsed from file")
parser.add_argument("id", help="Cluster ID")
args = parser.parse_args()
return args

Expand Down Expand Up @@ -35,30 +38,35 @@ def main():
number = int(args.num) + 1
break_header = "MOTIF " + str(number)

# Get cluster_id
cluster_id = args.id

# Pattern for motif header
pattern = re.compile("^MOTIF\s{2}(\d)+")
# Init count
count = 0

with open(args.meme) as f:
for line in f:
## do not write [count] lines after each header -> needed for meme-format
if count > 0:
count-=1
continue
if pattern.match(line):
# if line is a motif header
count = 2
##

if break_header in line:
# line matches breaking_header, e.g. 'MOTIF 4'
break
else:
out.write(line)
## do not write [count] lines after each header -> needed for meme-format
if count > 0:
count-=1
continue
if pattern.match(line):
# if line is a motif header
count = 2
out.write(line.strip('\n').replace(" "," ") + " Cluster_" + cluster_id + '\n')
##
else:
out.write(line)


if __name__ == "__main__":
import argparse
import re
import os
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

import os is not used. Should be removed.

main()
86 changes: 86 additions & 0 deletions bin/2.2_motif_estimation/get_motif_seq.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env Rscript
if (!require(optparse, quietly = T)) install.packages("optparse"); library(optparse)

option_list <- list(
make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input file. Output txt-file from GLAM2.", metavar = "character"),
make_option(opt_str = c("-o", "--output"), default = "sequences.json" , help = "Output JSON-file. Default = '%default'", metavar = "character"),
make_option(opt_str = c("-n", "--num"), default = 3 , help = "Get best (num) motifs. Default = '%default'", metavar = "numeric"),
make_option(opt_str = c("-c", "--cluster_id"), default = "./" , help = "Cluster ID", metavar = "numeric"),
make_option(opt_str = c("-t", "--tmp"), default = "./" , help = "Path for tmp-files. Default = '%default'", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list,
description = "Creating JSON-file with sequence ids which were used to create the best (num) motifs.",
epilogue = "Author: Rene Wiegandt <Rene.Wiegandt@mpi-bn.mpg.de>")

opt <- parse_args(opt_parser)

#' Reading files with fread.
#' Only read the first column.
#' @param path Path to file
#' @return first column as vector
read_data <- function(path){
renewiegandt marked this conversation as resolved.
Show resolved Hide resolved

f <- data.table::fread(path, select = 1)
return(f[[1]])
}


#' Creating JSON-file with sequence ids which were used to create the best (num) motifs.
#'
#' @param input Input file.Output txt-file from GLAM2.
#' @param output Output JSON-file
#' @param num Get best (num) motifs.
#'
#' @author René Wiegandt <Rene.Wiegandt(at)mpi-bn.mpg.de>
create_seq_json <- function(input, output, num, tmp_path, cluster_id) {

if (!file.exists(input)) {
stop(paste0("Input file does not exists. Please check the given path: ", input))
}

if ( !is.numeric(num)) {
stop("Parameter num needs to be an integer")
}

if (num > 10 || num <= 0 ) {
stop(paste0("Parameter 'num' needs to be an number between 1 and 10! Your input: ", num))
}

if ( !varhandle::check.numeric(cluster_id)) {
stop(paste0("CLUSTER ID could not be found. Please make sure that your file path contains _[cluster_id] at the end. Found: ", cluster_id,"\n For example: /test_cluster_1/glam.txt"))
}

dir.create(tmp_path, showWarnings = FALSE)

file_dir <- tmp_path

# Split glam.txt file on lines that start with Score:
system(paste0("csplit ", input, " '/^Score:.*/' '{*}' -f ", file_dir, "/f_id_test.pholder"))
# Only keep the lines that start with 'f' to get the lines with the sequence ids
system(paste0("for i in ", file_dir, "/*.pholder0[1-", num, "];do grep \"^f\" $i > \"${i}.done\";done"))

# Getting the filepaths of first 3 files with sequence ids
fnames <- file.path(file_dir,dir(file_dir, pattern = "done"))

# Running read_data on files
datalist <- lapply(fnames, read_data)


Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove second empty line.

# Create json file
## naming
names(datalist) <- paste0(c("Motif_", "Motif_", "Motif_"),seq(1,as.numeric(num),1) , " Cluster_", cluster_id)
## creating json object
json <- RJSONIO::toJSON(datalist, pretty = T , .withNames = T)
## writing file
write(json, file = output )
}

# run function create_seq_json with given parameteres if not in interactive context (e.g. run from shell)
if (!interactive()) {
if (length(commandArgs(trailingOnly = TRUE)) <= 0) {
print_help(opt_parser)
} else {
create_seq_json(opt$input, opt$output, opt$num, opt$tmp, opt$cluster_id)
}
}
1 change: 1 addition & 0 deletions bin/2.2_motif_estimation/motif_estimation.nf
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env nextflow

params.bed = ""
params.out = ""
Expand Down
2 changes: 2 additions & 0 deletions masterenv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ dependencies:
- r-stringi
- r-stringr
- r-optparse
- r-RJSONIO
- r-varhandle
- bioconductor-iranges
- moods
- biopython
Expand Down