diff --git a/reformat_output.R b/reformat_output.R
index 7bd4355..eec906e 100755
--- a/reformat_output.R
+++ b/reformat_output.R
@@ -1,7 +1,8 @@
#!/usr/bin/env Rscript
-## author afust NEU
-## reformat BestperQuery_Hits table to table with all information about one peak in one row
+## author afust
+## reformat the BestperQuery_Hits table so that all information about one peak is in one row
+library(getopt)
## columns given with $3 should be unique and not be seprated by the delimiter
## with the aggregate function the same entries are sepereated by it, so this has to be adjusted
@@ -11,73 +12,92 @@
}
return(entry)
}
-
#Script gets arguments
-args <- commandArgs(TRUE)
-# argument the scipt gets
-# $1 file which should be reformatted
-# $2 key column
-# $3 columns that should be kept
-# $4 delimiter
-# $5 #cores
-if(length(args)>=4 && length(args)<=5){
- print(Sys.time())
- # Process arguments
- df.hits <- read.table(args[1], header=TRUE, comment.char="#", sep="\t",check.names=FALSE, stringsAsFactors = FALSE)
- key <- as.character(args[2])
- keep.cols <- unlist(sapply(unlist(strsplit(args[3],",")), function(t) eval(parse(text=t))))
- delimiter <- as.character(args[4])
- cols<- colnames(df.hits)
- ## create output
- output <- dirname(normalizePath(args[1]))
- filename <- strsplit(basename(args[1]),"[.]")[[1]][1]
- filename <- paste0(filename,"_compact.txt")
- output <- paste(output,filename,sep="/")
- # if multiple cores are present, use them.
- if(length(args)==5){
- cores <- as.numeric(args[5])
- library(snow)
- c <- makeSOCKcluster(rep("localhost",cores))
- ## replace occurence of delimiter in data
- if(delimiter != ";"){
- df.hits[] <- parLapply(c,df.hits, gsub, pattern=delimiter, replacement=";", fixed=TRUE)
- } else {
- df.hits[] <- parLapply(c,df.hits, gsub, pattern=delimiter, replacement=",", fixed=TRUE)
- }
+# third spec column: 0 = flag, takes no value
+# 1 = option requires a value
+# 2 = option value is optional
+options <- matrix(c(
+ 'input', 'i', 1, 'character', 'file which should be reformatted',
+ 'key', 'k', 1, 'character', 'key columns seperated by "," without spaces',
+ 'cols', 'c', 1, 'character', 'columns that should be kept ',
+ 'delimiter', 'd', 2, 'character', 'delimiter [,]',
+ 'threads', 't', 2, 'integer', 'cores to be used for reformatting',
+ 'help', 'h', 0, 'logical','Provides command line help.'
+ ), byrow=TRUE, ncol=5)
+opt <- getopt(options)
+#help
+if (!is.null(opt$help)) {
+ cat(getopt(options, usage=TRUE))
+ q(status=0) }
+#check for mandatory input
+if (is.null(opt$input) || !file.exists(opt$input)) {
+ cat("\nInput file to reformat is missing or not existend\n")
+ q(status=1)
+}
+if (is.null(opt$key)) {
+ cat("\nKey column is missing\n")
+ q(status=1)
+}
+if (is.null(opt$cols)) {
+ cat("\nColumns to keep are missing\n")
+ q(status=1)
+}
+#set defaults
+if (is.null(opt$delimiter)) { opt$delimiter <- "," }
+if (is.null(opt$threads)) { opt$threads <- 1 }
+# Process parameters
+df.hits <- read.table(opt$input, header=TRUE, comment.char="#", sep="\t",check.names=FALSE, stringsAsFactors = FALSE)
+cols <- colnames(df.hits)
+key <- as.character(opt$key)
+keep.cols <- unlist(sapply(unlist(strsplit(opt$cols,",")), function(t) eval(parse(text=t))))
+delimiter <- as.character(opt$delimiter)
+
+## create output
+output <- dirname(normalizePath(opt$input))
+filename <- strsplit(basename(opt$input),"[.]")[[1]][1]
+filename <- paste0(filename,"_compact.txt")
+output <- paste(output,filename,sep="/")
+# if more than one thread was requested, do the replacement on a local snow cluster.
+if(opt$threads > 1){
+ cores <- as.numeric(opt$threads)
+ library(snow)
+ c <- makeSOCKcluster(rep("localhost",cores))
+ ## replace occurrences of the delimiter in the data
+ if(delimiter != ";"){
+ df.hits[] <- parLapply(c,df.hits, gsub, pattern=delimiter, replacement=";", fixed=TRUE)
} else {
- ## replace occurence of delimiter in data
- if(delimiter != ";"){
- df.hits[] <- lapply(df.hits, gsub, pattern=delimiter, replacement=";", fixed=TRUE)
- } else {
- df.hits[] <- lapply(df.hits, gsub, pattern=delimiter, replacement=",", fixed=TRUE)
- }
+ df.hits[] <- parLapply(c,df.hits, gsub, pattern=delimiter, replacement=",", fixed=TRUE)
+ }
+} else {
+ ## replace occurrences of the delimiter in the data
+ if(delimiter != ";"){
+ df.hits[] <- lapply(df.hits, gsub, pattern=delimiter, replacement=";", fixed=TRUE)
+ } else {
+ df.hits[] <- lapply(df.hits, gsub, pattern=delimiter, replacement=",", fixed=TRUE)
}
- # combind data by key column
- df.reformat <- aggregate(.~df.hits[,key], data=df.hits, FUN=paste, collapse=delimiter, na.action=na.pass)
- # remove key column and replace by aggregated column
- df.reformat[,key] <- NULL
- cols.new <- colnames(df.reformat)
- cols.new[1] <- key
- colnames(df.reformat) <- cols.new
- #original column order
- df.reformat <- df.reformat[,cols]
- # transform columns that should be kept
- for(i in 1:length(cols[keep.cols])){
- if(length(args)==5){
- col.unique <- parLapply(c, df.reformat[,keep.cols[i]], .reformat.keep.cols, delimiter)
- df.reformat[,keep.cols[i]] <- unlist(col.unique)
- if(i==length(cols[keep.cols])){
- stopCluster(c)
- }
- } else {
- col.unique <- lapply(df.reformat[,keep.cols[i]], .reformat.keep.cols, delimiter)
- df.reformat[,keep.cols[i]] <- unlist(col.unique)
+}
+# combine data by key column
+df.reformat <- aggregate(.~df.hits[,key], data=df.hits, FUN=paste, collapse=delimiter, na.action=na.pass)
+# remove key column and replace by aggregated column
+df.reformat[,key] <- NULL
+cols.new <- colnames(df.reformat)
+cols.new[1] <- key
+colnames(df.reformat) <- cols.new
+#original column order
+df.reformat <- df.reformat[,cols]
+# transform columns that should be kept
+for(i in 1:length(cols[keep.cols])){
+ if(opt$threads > 1){
+ col.unique <- parLapply(c, df.reformat[,keep.cols[i]], .reformat.keep.cols, delimiter)
+ df.reformat[,keep.cols[i]] <- unlist(col.unique)
+ if(i==length(cols[keep.cols])){
+ stopCluster(c)
}
-
+ } else {
+ col.unique <- lapply(df.reformat[,keep.cols[i]], .reformat.keep.cols, delimiter)
+ df.reformat[,keep.cols[i]] <- unlist(col.unique)
}
- #write to file
- write.table(df.reformat, output, append =FALSE, quote=FALSE,sep='\t', eol='\n',row.names = FALSE, col.names = TRUE)
- } else {
- cat("ERROR wrong usage of script, use like this:\nRscript reformat.R <<#threads>>\n\nFor example: Rscript reformat.R besthits.txt peak_id 1:3,5 ',' 5\nOr: Rscript reformat.R besthits.txt peak_id 1,3,5 '#'\nLast argument is optional, a number of threads can be added.\n")
- }
+}
+#write to file
+write.table(df.reformat, output, append =FALSE, quote=FALSE,sep='\t', eol='\n',row.names = FALSE, col.names = TRUE)
\ No newline at end of file