diff --git a/CHANGES b/CHANGES index eb0c2f7..9c7004c 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,9 @@ +## 1.1.2 (2017-09-04) + +- Added proper help for auxillary R scripts (exit code 0) +- Made call to reformat_output.R multi-threaded +- Cleaned some code, added citation + ## 1.1.1 (2017-03-08) - Bug fix for possible older versions of GNU sort, the order of results may have changed now diff --git a/summary.R b/summary.R index 8f25dff..474e7e4 100755 --- a/summary.R +++ b/summary.R @@ -15,16 +15,36 @@ suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(jsonlite)) suppressPackageStartupMessages(library(gridExtra)) suppressPackageStartupMessages(library(grid)) +suppressPackageStartupMessages(library(getopt)) + +options = matrix(c( + 'finalhits', 'f', 1, 'character', 'file containing the final hits from UROPA.', + 'config', 'c', 1, 'character', 'file containing the json formatted configuration from the UROPA run.', + 'output', 'o', 2, 'character', 'file name of output file [summary.pdf].', + 'besthits', 'b', 2, 'character', 'file containing the best hits from UROPA.', + 'help', 'h', 0, 'logical','Provides command line help.' + ), byrow=TRUE, ncol=5) +opt = getopt(options) + +# End if help is requested +if (!is.null(opt$help)) { + cat(getopt(options, usage=TRUE), file = stdout()); q(status=0) +} -# script gets arguments -args <- commandArgs(TRUE) -# basic information independent if there are 3 or 4 input arguments, used otherwhere as .basic.summary +# Also exit if the mandatory parameters are not given +if (is.null(opt$finalhits) | is.null(opt$config)) { + cat(getopt(options, usage=TRUE), file = stdout()); q(status=1) +} +# Default values for options +if (is.null(opt$output)) { + opt$output <- "summary.pdf" +} + +# basic information independent if there are 3 or 4 input arguments, used otherwhere as .basic.summary num.features <- 0 features <- c() - - # reformat every row of the config.query file to string for cover page .print.query <- function(row){ r <- paste(unname(as.vector(unlist(row))), collapse=" ") @@ -40,11 +60,12 @@ features <- c() viewport(layout.pos.row = 1, layout.pos.col = 1) } else { viewport(layout.pos.row = 2, layout.pos.col = 1) - } -} + } +} + # helper for plot 2 and 5 (counts the occurence of each loci for pie charts) .plot.genomic.location.per.feature.helper <- function(f, df.uropa, pie.basic){ - + feat <- features[f] df.feature <- subset(df.uropa, df.uropa$feature==feat) unique.loci <- sort(unique(df.feature$genomic_location)) @@ -70,7 +91,7 @@ features <- c() blank_theme <- theme_minimal()+ theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold")) # title main <- paste0("Genomic location of '",feat, "' across ",pie.basic) - pie <- ggplot(df.pie, aes(x="", y=value, fill=location)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + blank_theme + theme(axis.text.x=element_blank()) + geom_text(aes(y = value/length(value) + c(0, cumsum(value)[-length(value)]), label = rep("",nrow(df.pie))), size=2, nudge_x = 0.7, nudge_y = 0.7) + ggtitle(main) + theme(plot.title = element_text(size = 10, face = "bold", , vjust=-10)) + pie <- ggplot(df.pie, aes(x="", y=value, fill=location)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + blank_theme + theme(axis.text.x=element_blank()) + geom_text(aes(y = value/length(value) + c(0, cumsum(value)[-length(value)]), label = rep("",nrow(df.pie))), size=2, nudge_x = 0.7, nudge_y = 0.7) + ggtitle(main) + theme(plot.title = element_text(size = 10, face = "bold", , vjust=-10)) print(pie, vp=.define_region(f)) } @@ -79,9 +100,10 @@ features <- c() # layout of plot # if there is more than one feature, there should two columns and the according number of rows for(f in 1:num.features){ - .plot.genomic.location.per.feature.helper(f, df.uropa, pie.basic) + .plot.genomic.location.per.feature.helper(f, df.uropa, pie.basic) } } + # plot 3 and 6 .plot.feature.distribution <- function(df.uropa, header){ if(num.features>1){ @@ -91,7 +113,7 @@ features <- c() o <- nrow(subset(df.uropa, df.uropa$feature==feat)) occurence.features <- c(occurence.features,o) } - names(occurence.features) <- features + names(occurence.features) <- features barplot(occurence.features, ylab="occurence",main=header) } } @@ -110,7 +132,7 @@ features <- c() # get infos from config for overview page config <- fromJSON(conf) config.query <- as.data.frame(config$queries) - + config.cols <- colnames(config.query) priority <- config$priority @@ -124,31 +146,31 @@ features <- c() if(grepl("T",priority) && num.queries>1){ priority <- paste0(priority, "\nNote: No pairwise comparisons and Chow Ruskey plot (no overlaps)") } - # add query to query data frame + # add query to query data frame config.query$query <- paste0("query",sprintf("%02d",0:(nrow(config.query)-1))) - config.query <- config.query[,c("query", "feature", "distance", "feature.anchor","internals", "strand","direction","filter.attribute", "attribute.value","show.attributes")] - # replaye "start,center,end" position by "any_pos" + config.query <- config.query[,c("query", "feature", "distance", "feature.anchor", "internals", "direction", "filter.attribute", "attribute.value", "show.attributes")] + # replaye "start,center,end" position by "any_pos" config.query$feature.anchor <- sapply(config.query$feature.anchor, function(x) if(length(x)==3){return("any_pos")}else{return(x)}) mtext("UROPA summary", side=3, line=0,outer=FALSE, cex=2) mtext(paste0("There were ", num.peaks, " peaks in the input bed file,\nUROPA annotated ", anno.peaks, " peaks\n"),side=3, line=-3,outer=FALSE, cex=.7) if(num.queries != nrow(config.query)){ - mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), side=3, line=-5,outer=FALSE, cex=.7) + mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), side=3, line=-5,outer=FALSE, cex=.7) } mytheme <- ttheme_default(core = list(fg_params=list(cex = 0.5)),colhead = list(fg_params=list(cex = 0.5)),rowhead = list(fg_params=list(cex = 0.5))) - config.query <- data.frame(lapply(config.query, as.character), stringsAsFactors=FALSE) + config.query <- data.frame(lapply(config.query, as.character), stringsAsFactors=FALSE) g <- tableGrob(format(config.query), theme=mytheme,rows=NULL) grid.draw(g) - + mtext(paste0("priority: ", priority), cex=.7,side=1, line=-2) input <- paste("Input:",unlist(config$bed),collapse=" ") anno <- paste("Anno:",unlist(config$gtf),collapse=" ") input.anno <- paste(input,anno,sep="\n") mtext(input.anno,cex=.6, side=1, line=0) - # plot 1 + # plot 1 # description plot1 <- paste0("1. Distances of annotated peaks in finalhits:", "\n\nThe following density plot displays the distance of anntotated peaks\nto its feature(s) based on the finalhits.\n\n", @@ -168,9 +190,9 @@ features <- c() } else { y.lim <- median(df.uropa.final[,"distance"]) + 5000 } - + } - + density <- subset(df.uropa.final[,c("distance","feature")], (df.uropa.final[,"distance"] max.uropa.best || considered.distance>10000){ if(median.uropa.best + 5000 > max.uropa.best){ - considered.distance <- median.uropa.best + considered.distance <- median.uropa.best } else { considered.distance <- median.uropa.best + 5000 } - - } + + } df.distance.query <- subset(df.uropa.best.per.query[,c("feature","distance","query")], (df.uropa.best.per.query[,"distance"] < considered.distance)) max.distance.query <- round(max(as.numeric(df.distance.query[,"distance"]))) bin.width <- round(max.distance.query/20) @@ -305,7 +327,7 @@ if(length(args)==3){ for(q in 1:(num.queries-1)){ initial.query <- peaks.per.query[[paste0("query",queries[q])]] if(tmp.num.query < num.queries){ - for(c in num.queries:tmp.num.query){ + for(c in num.queries:tmp.num.query){ if(queries[q] != queries[c]){ compare.query <- peaks.per.query[[paste0("query",queries[c])]] venn.object <- venn(list(initial.query, compare.query)) @@ -319,16 +341,16 @@ if(length(args)==3){ dist <- 0.02 } else if(anno.compare.query == matches || anno.initial.query ==matches){ dist <- c(0.02, -0.02, -0.02) - } - venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, matches, anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])), - lwd = 2, lty ="solid", col = c("black", "red","blue"), fill = c("white","red","blue"), cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE, + } + venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, matches, anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])), + lwd = 2, lty ="solid", col = c("black", "red","blue"), fill = c("white","red","blue"), cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE, cat.cex =1, cat.default.pos= "outer", alpha = c(0,.4,.4), euler.d=TRUE, scaled=TRUE) grid.draw(venn.plot) mtext(paste0("Peak based pairwise compare of ", paste0("query",queries[q]), " and ", paste0("query",queries[c])), side=3, line=0,outer=FALSE, cex=1) popViewport() - } + } } - } + } tmp.num.query <- tmp.num.query+1 } @@ -344,7 +366,7 @@ if(length(args)==3){ "\n\nAdditional Info:\nIt is evalueated whether peaks are annotated,\nbut not whether they are annotated for the same feature") grid.newpage() mtext(plot8, cex=1, adj=0, padj=1) - + num <- venn(peaks.per.query)[,"num"] if(num[length(num)] == 0){ num <- num+1 @@ -352,17 +374,17 @@ if(length(args)==3){ # compute distribution v <- Venn(SetNames=paste0("query",queries), Weight=num) cv <- compute.Venn(v, type="ChowRuskey", doWeights=TRUE) - + # change FaceText size and line thickness gp <- VennThemes(cv, colourAlgorithm = "signature") gp[["FaceText"]] <- lapply(gp[["FaceText"]],function(gps){gps$fontsize<-10;gps}) gp[["Set"]] <- lapply(gp[["Set"]],function(gps){gps$lwd<-.8;gps}) - + #change label positions - SetLabels <- VennGetSetLabels(cv) + SetLabels <- VennGetSetLabels(cv) max.x <- max(VennGetUniverseRange(cv)[,"x"]) max.y <- max(VennGetUniverseRange(cv)[,"y"]) - for(i in 1:nrow(SetLabels)){ + for(i in 1:nrow(SetLabels)){ SetLabels[i,"x"] <- max.x+2 SetLabels[i,"hjust"] <- "right" if(i==1){ @@ -373,7 +395,7 @@ if(length(args)==3){ } cv <- VennSetSetLabels(cv,SetLabels) - # change face label positions - not yet implemented in original package + # change face label positions - not yet implemented in original package facelabel <- as.data.frame(VennGetFaceLabels(cv)) face.labels <- nrow(facelabel) for(i in 1:(face.labels-2)){ @@ -384,7 +406,7 @@ if(length(args)==3){ # draw plot pushViewport(viewport(x=.5, y=0.5, width=.8, height=.7)) - grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1.5, "npc"), height = unit(1.5, "npc")) + grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1.5, "npc"), height = unit(1.5, "npc")) venn.draw <- plot(cv, gp=gp) grid.draw(venn.draw) mtext("Chow Ruskey comparison of all peaks annotated with UROPA", side=3, line=0,outer=FALSE, cex=1) @@ -393,6 +415,4 @@ if(length(args)==3){ }) } dev.off() -} else { - stop("Usage: Rscript summary.R finalhits.txt summary_config.json summary.pdf besthits.txt") } diff --git a/uropa.py b/uropa.py index ec178d1..8e7c604 100755 --- a/uropa.py +++ b/uropa.py @@ -4,7 +4,7 @@ @authors: Maria Kondili, Jens Preussner and Annika Fust @license: MIT -@version: 1.1.0 +@version: 1.1.2 @maintainer: Mario Looso @email: mario.looso@mpi-bn.mpg.de """ @@ -95,7 +95,7 @@ "--version", help="prints the version and exits", action="version", - version="%(prog)s 1.1.1") + version="%(prog)s 1.1.2") args = parser.parse_args() config = args.input @@ -338,6 +338,7 @@ logger.debug("Filenames for output files are: {}, {}". format(allhits_outfile, besthits_outfile)) ovls.finalize_file(allhits_outfile, allhits_partials, header, comments, log=logger) + # # Reformat output # @@ -345,34 +346,25 @@ logger.info("Reformatting output...") R_reform_Best = [ 'reformat_output.R', + '-i', besthits_outfile, + '-k', 'peak_id', + '-c', '1:5', - ','] - reformatBest_out = outdir + "Reformatted_" + \ - os.path.basename(besthits_outfile) + '-d', + ',', + '-t', + str(args.threads)] try: # creates output of Best and gives name "Reformatted_" + logger.debug('Reformat output call is {}'.format(R_reform_Best)) pr_R = sp.check_output(R_reform_Best) except sp.CalledProcessError: logger.warning("Reformatted output could not be created with call %s", ' '.join(R_reform_Best)) except OSError: logger.warning("Rscript command not available for summary output.") - # Add header and write to new file: - reformatted_outfile = "Uropa_Reformatted_" + \ - os.path.basename(besthits_outfile) - if os.path.exists(outdir + reformatBest_out): - ovls.write_out_file( - outdir + - reformatted_outfile, - reformatBest_out, - header_comments, - header) - if os.path.exists(outdir + reformatted_outfile): - logger.info("Reformatted BestHits output: %s", reformatted_outfile) - os.remove(outdir + reformatBest_out) - # # Visual summary # @@ -384,17 +376,25 @@ if len(queries) > 1 and not pr and os.path.exists(merged_outfile): call = [ summary_script, + "-f", merged_outfile, + "-c", outdir + "summary_config.json", + "-o", summary_output, + "-b", besthits_outfile] else: call = [ summary_script, + "-f", besthits_outfile, + "-c", outdir + "summary_config.json", + "-o", summary_output] try: + logger.debug('Summary output call is {}'.format(call)) sum_pr = sp.check_output(call) except sp.CalledProcessError: logger.warning("Visualized summary output could not be created from %s.", ' '.join(call)) diff --git a/uropa/config.py b/uropa/config.py index a9c99e8..b08949d 100644 --- a/uropa/config.py +++ b/uropa/config.py @@ -16,6 +16,11 @@ def howtoconfig(): HOMER or ChIPpeakAnno, but the advantage of UROPA is, that it can easily be fitted to your requirements. UROPA was developed as an open source analysis pipeline for peaks generated from any peak caller. + Please cite upon usage: + Kondili M, Fust A, Preussner J, Kuenne C, Braun T, and Looso M. + UROPA: a tool for Universal RObust Peak Annotation. + Scientific Reports 7 (2017), doi: 10.1038/s41598-017-02464-y + All parameters and paths to input or output files should be reported in a JSON configuration file. The configuration file should at least contain paths for bed and GTF files: