Updated UROPA to v1.1.2

loosolab · Sep 4, 2017 · 11c0cb4 · 11c0cb4
1 parent 6bfae67
commit 11c0cb4
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 68 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,3 +1,9 @@
+## 1.1.2 (2017-09-04)
+
+- Added proper help for auxillary R scripts (exit code 0)
+- Made call to reformat_output.R multi-threaded
+- Cleaned some code, added citation
+
 ## 1.1.1 (2017-03-08)
 
 - Bug fix for possible older versions of GNU sort, the order of results may have changed now

diff --git a/summary.R b/summary.R
@@ -15,16 +15,36 @@ suppressPackageStartupMessages(library(ggplot2))
 suppressPackageStartupMessages(library(jsonlite))
 suppressPackageStartupMessages(library(gridExtra))
 suppressPackageStartupMessages(library(grid))
+suppressPackageStartupMessages(library(getopt))
+
+options = matrix(c(
+  'finalhits', 'f', 1, 'character', 'file containing the final hits from UROPA.',
+  'config', 'c', 1, 'character', 'file containing the json formatted configuration from the UROPA run.',
+  'output', 'o', 2, 'character', 'file name of output file [summary.pdf].',
+  'besthits', 'b', 2, 'character', 'file containing the best hits from UROPA.',
+  'help', 'h', 0, 'logical','Provides command line help.'
+  ), byrow=TRUE, ncol=5)
+opt = getopt(options)
+
+# End if help is requested
+if (!is.null(opt$help)) {
+  cat(getopt(options, usage=TRUE), file = stdout()); q(status=0)
+}
 
-# script gets arguments
-args <- commandArgs(TRUE)
-# basic information independent if there are 3 or 4 input arguments, used otherwhere as .basic.summary
+# Also exit if the mandatory parameters are not given
+if (is.null(opt$finalhits) | is.null(opt$config)) {
+  cat(getopt(options, usage=TRUE), file = stdout()); q(status=1)
+}
 
+# Default values for options
+if (is.null(opt$output)) {
+  opt$output <- "summary.pdf"
+}
+
+# basic information independent if there are 3 or 4 input arguments, used otherwhere as .basic.summary
 num.features <- 0
 features <- c()
 
-
-
 # reformat every row of the config.query file to string for cover page
 .print.query <- function(row){
 	r <- paste(unname(as.vector(unlist(row))), collapse="   ")
@@ -40,11 +60,12 @@ features <- c()
 		viewport(layout.pos.row = 1, layout.pos.col = 1)
 	} else {
 		viewport(layout.pos.row = 2, layout.pos.col = 1)
-	} 
-} 
+	}
+}
+
 # helper for plot 2 and 5 (counts the occurence of each loci for pie charts)
 .plot.genomic.location.per.feature.helper <- function(f, df.uropa, pie.basic){
-	
+
 	feat <- features[f]
 	df.feature <- subset(df.uropa, df.uropa$feature==feat)
 	unique.loci <- sort(unique(df.feature$genomic_location))
@@ -70,7 +91,7 @@ features <- c()
 	blank_theme <- theme_minimal()+ theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))
 	# title
 	main <- paste0("Genomic location of '",feat, "' across ",pie.basic)
-	pie <- ggplot(df.pie, aes(x="", y=value, fill=location)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + blank_theme + theme(axis.text.x=element_blank()) + geom_text(aes(y = value/length(value) + c(0, cumsum(value)[-length(value)]),  label = rep("",nrow(df.pie))), size=2, nudge_x = 0.7, nudge_y = 0.7) + ggtitle(main) +  theme(plot.title = element_text(size = 10, face = "bold", , vjust=-10))	
+	pie <- ggplot(df.pie, aes(x="", y=value, fill=location)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + blank_theme + theme(axis.text.x=element_blank()) + geom_text(aes(y = value/length(value) + c(0, cumsum(value)[-length(value)]),  label = rep("",nrow(df.pie))), size=2, nudge_x = 0.7, nudge_y = 0.7) + ggtitle(main) +  theme(plot.title = element_text(size = 10, face = "bold", , vjust=-10))
 	print(pie, vp=.define_region(f))
 }
 
@@ -79,9 +100,10 @@ features <- c()
 	# layout of plot
 	# if there is more than one feature, there should two columns and the according number of rows
 	for(f in 1:num.features){
-		.plot.genomic.location.per.feature.helper(f, df.uropa, pie.basic)			
+		.plot.genomic.location.per.feature.helper(f, df.uropa, pie.basic)
 	}
 }
+
 # plot 3 and 6
 .plot.feature.distribution <- function(df.uropa, header){
 	if(num.features>1){
@@ -91,7 +113,7 @@ features <- c()
 			o <- nrow(subset(df.uropa, df.uropa$feature==feat))
 			occurence.features <- c(occurence.features,o)
 		}
-		names(occurence.features) <- features	
+		names(occurence.features) <- features
 		barplot(occurence.features, ylab="occurence",main=header)
 	}
 }
@@ -110,7 +132,7 @@ features <- c()
 	# get infos from config for overview page
 	config <- fromJSON(conf)
 	config.query <- as.data.frame(config$queries)
-	
+
 	config.cols <- colnames(config.query)
 	priority <- config$priority
 
@@ -124,31 +146,31 @@ features <- c()
 	if(grepl("T",priority) && num.queries>1){
 		priority <- paste0(priority, "\nNote: No pairwise comparisons and Chow Ruskey plot (no overlaps)")
 	}
-	# add query to query data frame	
+	# add query to query data frame
 	config.query$query <- paste0("query",sprintf("%02d",0:(nrow(config.query)-1)))
-	config.query <- config.query[,c("query", "feature", "distance", "feature.anchor","internals", "strand","direction","filter.attribute", "attribute.value","show.attributes")]
-	# replaye "start,center,end" position by "any_pos" 
+	config.query <- config.query[,c("query", "feature", "distance", "feature.anchor", "internals", "direction", "filter.attribute", "attribute.value", "show.attributes")]
+	# replaye "start,center,end" position by "any_pos"
 	config.query$feature.anchor <- sapply(config.query$feature.anchor, function(x) if(length(x)==3){return("any_pos")}else{return(x)})
 	mtext("UROPA summary", side=3, line=0,outer=FALSE, cex=2)
 	mtext(paste0("There were ", num.peaks, " peaks in the input bed file,\nUROPA annotated ", anno.peaks, " peaks\n"),side=3, line=-3,outer=FALSE, cex=.7)
 	if(num.queries != nrow(config.query)){
-		mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), side=3, line=-5,outer=FALSE, cex=.7)	
+		mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), side=3, line=-5,outer=FALSE, cex=.7)
 	}
 
 
 
 	mytheme <- ttheme_default(core = list(fg_params=list(cex = 0.5)),colhead = list(fg_params=list(cex = 0.5)),rowhead = list(fg_params=list(cex = 0.5)))
-	config.query <- data.frame(lapply(config.query, as.character), stringsAsFactors=FALSE)	
+	config.query <- data.frame(lapply(config.query, as.character), stringsAsFactors=FALSE)
 	g <- tableGrob(format(config.query), theme=mytheme,rows=NULL)
 	grid.draw(g)
-	
+
 	mtext(paste0("priority: ", priority), cex=.7,side=1, line=-2)
 	input <- paste("Input:",unlist(config$bed),collapse=" ")
 	anno <- paste("Anno:",unlist(config$gtf),collapse=" ")
 	input.anno <- paste(input,anno,sep="\n")
 	mtext(input.anno,cex=.6, side=1, line=0)
 
-	# plot 1 
+	# plot 1
 	# description
 	plot1 <- paste0("1. Distances of annotated peaks in finalhits:",
 		"\n\nThe following density plot displays the distance of anntotated peaks\nto its feature(s) based on the finalhits.\n\n",
@@ -168,9 +190,9 @@ features <- c()
 			} else {
 				y.lim <- median(df.uropa.final[,"distance"]) + 5000
 			}
-		
+
 	}
-	
+
 	density <- subset(df.uropa.final[,c("distance","feature")], (df.uropa.final[,"distance"]<y.lim))
 	dpq <- qplot(distance,data=density, geom="density", color=feature, xlab = "Distance to feature", ylab = "Relative count")
 	print(dpq + ggtitle("Distance to features across finalhits"))
@@ -200,7 +222,7 @@ features <- c()
 		mtext(plot3, cex=1, adj=0, padj=1)
 		.plot.feature.distribution(df.uropa=df.uropa.final,header="Feature distribution across finalhits")
 	}
-	
+
 	return(df.uropa.final)
 }
 
@@ -209,18 +231,18 @@ features <- c()
 # arg 3 = output pdf
 # arg 4 = besthits
 
-if(length(args)==3){
-	## basic summary -- only one query definded
-	df.uropa.final <- .basic.summary(args[1],args[2], args[3])
-	dev.off()
-} else if(length(args)==4){
+## basic summary -- only one query definded
+if (is.null(opt$besthits)) {
+  df.uropa.final <- .basic.summary(opt$finalhits, opt$config, opt$output)
+  dev.off()
+} else
+{
 	suppressPackageStartupMessages(library(VennDiagram))
 	suppressPackageStartupMessages(library(gplots))
 
-
-	## increased summary -- there is more than one query definded
-	df.uropa.final <- .basic.summary(args[1],args[2], args[3])
-	df.uropa.best.per.query <- read.table(args[4], header=TRUE, sep="\t",stringsAsFactors = FALSE)
+  ## increased summary -- there is more than one query definded
+	df.uropa.final <- .basic.summary(opt$finalhits, opt$config, opt$output)
+	df.uropa.best.per.query <- read.table(opt$besthits, header=TRUE, sep="\t",stringsAsFactors = FALSE)
 	num.peaks <- length(unique(df.uropa.final$peak_id))
 	df.uropa.best.per.query[,"distance"] <- as.numeric(df.uropa.best.per.query[,"distance"])
 	df.uropa.best.per.query <- df.uropa.best.per.query[complete.cases(df.uropa.best.per.query),]
@@ -248,12 +270,12 @@ if(length(args)==3){
 	considered.distance <- median.uropa.best + round(max.uropa.best/15)
 	if(considered.distance > max.uropa.best || considered.distance>10000){
 		if(median.uropa.best + 5000 > max.uropa.best){
-			considered.distance <- median.uropa.best 
+			considered.distance <- median.uropa.best
 		} else {
 			considered.distance <- median.uropa.best + 5000
 		}
-		
-	} 
+
+	}
 	df.distance.query <- subset(df.uropa.best.per.query[,c("feature","distance","query")], (df.uropa.best.per.query[,"distance"] < considered.distance))
 	max.distance.query <- round(max(as.numeric(df.distance.query[,"distance"])))
 	bin.width <- round(max.distance.query/20)
@@ -305,7 +327,7 @@ if(length(args)==3){
 	for(q in 1:(num.queries-1)){
 		initial.query <- peaks.per.query[[paste0("query",queries[q])]]
 		if(tmp.num.query < num.queries){
-			for(c in num.queries:tmp.num.query){ 
+			for(c in num.queries:tmp.num.query){
 				if(queries[q] != queries[c]){
 					compare.query <- peaks.per.query[[paste0("query",queries[c])]]
 					venn.object <- venn(list(initial.query, compare.query))
@@ -319,16 +341,16 @@ if(length(args)==3){
 	 					dist <- 0.02
 	 				} else if(anno.compare.query == matches || anno.initial.query ==matches){
 	 					dist <- c(0.02, -0.02, -0.02)
-	 				} 
-	 				venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, matches, anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])), 
-	 					lwd = 2, lty ="solid", col =  c("black", "red","blue"), fill = c("white","red","blue"), cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE, 
+	 				}
+	 				venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, matches, anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])),
+	 					lwd = 2, lty ="solid", col =  c("black", "red","blue"), fill = c("white","red","blue"), cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE,
 	 					cat.cex =1, cat.default.pos= "outer", alpha =  c(0,.4,.4), euler.d=TRUE, scaled=TRUE)
 	 				grid.draw(venn.plot)
 	 				mtext(paste0("Peak based pairwise compare of ", paste0("query",queries[q]), " and ", paste0("query",queries[c])), side=3, line=0,outer=FALSE, cex=1)
 	 				popViewport()
-				}	
+				}
 			}
-		}		
+		}
 		tmp.num.query <- tmp.num.query+1
 	}
 
@@ -344,25 +366,25 @@ if(length(args)==3){
 				"\n\nAdditional Info:\nIt is evalueated whether peaks are annotated,\nbut not whether they are annotated for the same feature")
 			grid.newpage()
 			mtext(plot8, cex=1, adj=0, padj=1)
-			
+
 			num <- venn(peaks.per.query)[,"num"]
 			if(num[length(num)] == 0){
 				num <- num+1
 			}
 			# compute distribution
 			v <- Venn(SetNames=paste0("query",queries), Weight=num)
 			cv <- compute.Venn(v, type="ChowRuskey", doWeights=TRUE)
-	
+
 			# change FaceText size and line thickness
 			gp <- VennThemes(cv, colourAlgorithm = "signature")
 			gp[["FaceText"]] <- lapply(gp[["FaceText"]],function(gps){gps$fontsize<-10;gps})
 			gp[["Set"]] <- lapply(gp[["Set"]],function(gps){gps$lwd<-.8;gps})
-	
+
 			#change label positions
-			SetLabels <- VennGetSetLabels(cv)		
+			SetLabels <- VennGetSetLabels(cv)
 			max.x <- max(VennGetUniverseRange(cv)[,"x"])
 			max.y <- max(VennGetUniverseRange(cv)[,"y"])
-			for(i in 1:nrow(SetLabels)){		
+			for(i in 1:nrow(SetLabels)){
 				SetLabels[i,"x"] <- max.x+2
 				SetLabels[i,"hjust"] <- "right"
 				if(i==1){
@@ -373,7 +395,7 @@ if(length(args)==3){
 			}
 			cv <- VennSetSetLabels(cv,SetLabels)
 
-			# change face label positions - not yet implemented in original package			
+			# change face label positions - not yet implemented in original package
 			facelabel <- as.data.frame(VennGetFaceLabels(cv))
 			face.labels <- nrow(facelabel)
 			for(i in 1:(face.labels-2)){
@@ -384,7 +406,7 @@ if(length(args)==3){
 
 			# draw plot
 			pushViewport(viewport(x=.5, y=0.5, width=.8, height=.7))
-			grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1.5, "npc"), height = unit(1.5, "npc"))	
+			grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1.5, "npc"), height = unit(1.5, "npc"))
 			venn.draw <- plot(cv, gp=gp)
 			grid.draw(venn.draw)
 			mtext("Chow Ruskey comparison of all peaks annotated with UROPA", side=3, line=0,outer=FALSE, cex=1)
@@ -393,6 +415,4 @@ if(length(args)==3){
 		})
 	}
 	dev.off()
-} else {
-	stop("Usage: Rscript summary.R finalhits.txt summary_config.json summary.pdf besthits.txt")
 }
diff --git a/uropa.py b/uropa.py
@@ -4,7 +4,7 @@
 
 @authors: Maria Kondili, Jens Preussner and Annika Fust
 @license: MIT
-@version: 1.1.0
+@version: 1.1.2
 @maintainer: Mario Looso
 @email: mario.looso@mpi-bn.mpg.de
 """
@@ -95,7 +95,7 @@
         "--version",
         help="prints the version and exits",
         action="version",
-        version="%(prog)s 1.1.1")
+        version="%(prog)s 1.1.2")
     args = parser.parse_args()
 
     config = args.input
@@ -338,41 +338,33 @@
     logger.debug("Filenames for output files are: {}, {}". format(allhits_outfile, besthits_outfile))
 
     ovls.finalize_file(allhits_outfile, allhits_partials, header, comments, log=logger)
+
     #
     # Reformat output
     #
     if args.reformat and len(queries) > 1 and not pr:
         logger.info("Reformatting output...")
         R_reform_Best = [
             'reformat_output.R',
+            '-i',
             besthits_outfile,
+            '-k',
             'peak_id',
+            '-c',
             '1:5',
-            ',']
-        reformatBest_out = outdir + "Reformatted_" + \
-            os.path.basename(besthits_outfile)
+            '-d',
+            ',',
+            '-t',
+            str(args.threads)]
         try:
             # creates output of Best and gives name "Reformatted_"
+            logger.debug('Reformat output call is {}'.format(R_reform_Best))
             pr_R = sp.check_output(R_reform_Best)
         except sp.CalledProcessError:
             logger.warning("Reformatted output could not be created with call %s", ' '.join(R_reform_Best))
         except OSError:
             logger.warning("Rscript command not available for summary output.")
 
-        # Add header and write to new file:
-        reformatted_outfile = "Uropa_Reformatted_" + \
-            os.path.basename(besthits_outfile)
-        if os.path.exists(outdir + reformatBest_out):
-            ovls.write_out_file(
-                outdir +
-                reformatted_outfile,
-                reformatBest_out,
-                header_comments,
-                header)
-        if os.path.exists(outdir + reformatted_outfile):
-            logger.info("Reformatted BestHits output: %s", reformatted_outfile)
-            os.remove(outdir + reformatBest_out)
-
     #
     # Visual summary
     #
@@ -384,17 +376,25 @@
         if len(queries) > 1 and not pr and os.path.exists(merged_outfile):
             call = [
                 summary_script,
+                "-f",
                 merged_outfile,
+                "-c",
                 outdir + "summary_config.json",
+                "-o",
                 summary_output,
+                "-b",
                 besthits_outfile]
         else:
             call = [
                 summary_script,
+                "-f",
                 besthits_outfile,
+                "-c",
                 outdir + "summary_config.json",
+                "-o",
                 summary_output]
         try:
+            logger.debug('Summary output call is {}'.format(call))
             sum_pr = sp.check_output(call)
         except sp.CalledProcessError:
             logger.warning("Visualized summary output could not be created from %s.", ' '.join(call))