From 92203ffea6d7a1773547119757764c5472fc315c Mon Sep 17 00:00:00 2001 From: Annika Fust Date: Tue, 5 Sep 2017 15:13:36 +0200 Subject: [PATCH] added upset plot --- summary.R | 202 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 113 insertions(+), 89 deletions(-) diff --git a/summary.R b/summary.R index 474e7e4..4c14948 100755 --- a/summary.R +++ b/summary.R @@ -16,7 +16,7 @@ suppressPackageStartupMessages(library(jsonlite)) suppressPackageStartupMessages(library(gridExtra)) suppressPackageStartupMessages(library(grid)) suppressPackageStartupMessages(library(getopt)) - +suppressPackageStartupMessages(library(tidyr)) options = matrix(c( 'finalhits', 'f', 1, 'character', 'file containing the final hits from UROPA.', 'config', 'c', 1, 'character', 'file containing the json formatted configuration from the UROPA run.', @@ -88,10 +88,16 @@ features <- c() perc <- round(df.pie$value/sum(df.pie$value)*100,1) df.pie$location <- paste0(df.pie$location, " (",perc, "%)") # now background - blank_theme <- theme_minimal()+ theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold")) + blank_theme <- theme_minimal()+ theme(axis.title.x = element_blank(), axis.title.y = element_blank(), + panel.border = element_blank(), panel.grid=element_blank(), + axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold")) # title main <- paste0("Genomic location of '",feat, "' across ",pie.basic) - pie <- ggplot(df.pie, aes(x="", y=value, fill=location)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + blank_theme + theme(axis.text.x=element_blank()) + geom_text(aes(y = value/length(value) + c(0, cumsum(value)[-length(value)]), label = rep("",nrow(df.pie))), size=2, nudge_x = 0.7, nudge_y = 0.7) + ggtitle(main) + theme(plot.title = element_text(size = 10, face = "bold", , vjust=-10)) + pie <- ggplot(df.pie, aes(x="", y=value, fill=location)) + geom_bar(width = 1, stat = "identity") + + coord_polar("y", start=0) + blank_theme + ggtitle(main) + + theme(axis.text.x=element_blank(), plot.title = element_text(size = 10, face = "bold", vjust=-10)) + + geom_text(aes(y = value/length(value) + + c(0, cumsum(value)[-length(value)]), label = rep("",nrow(df.pie))), size=2, nudge_x = 0.7, nudge_y = 0.7) print(pie, vp=.define_region(f)) } @@ -132,14 +138,26 @@ features <- c() # get infos from config for overview page config <- fromJSON(conf) config.query <- as.data.frame(config$queries) - + config.query$query <- 0:(nrow(config.query)-1) config.cols <- colnames(config.query) priority <- config$priority - - # queries of uropa annotation run - queries <- sort.int(as.numeric(unique(df.uropa.final$query))) - num.queries <- length(queries) - queries <- sprintf("%02d", queries) + + + # specify y limit for plots + y.lim <- round(median(df.uropa.final[,"distance"]) + (max(df.uropa.final[,"distance"])/15)) + if(y.lim > max(df.uropa.final[,"distance"]) || y.lim > 10000){ + if(median(df.uropa.final[,"distance"]) + 5000 > max(df.uropa.final[,"distance"])){ + y.lim <- median(df.uropa.final[,"distance"]) + } else { + y.lim <- median(df.uropa.final[,"distance"]) + 5000 + } + + } + + # expand multiple valid annotations to one row each + df.uropa.final <- separate_rows(df.uropa.final, query) # queries of uropa annotation run + num.queries <- max(config.query$query) + queries <- sprintf("%02d", config.query$query) features <<- as.character(unique(df.uropa.final$feature)) num.features <<- length(features) @@ -148,18 +166,22 @@ features <- c() } # add query to query data frame config.query$query <- paste0("query",sprintf("%02d",0:(nrow(config.query)-1))) - config.query <- config.query[,c("query", "feature", "distance", "feature.anchor", "internals", "direction", "filter.attribute", "attribute.value", "show.attributes")] + config.query <- config.query[,c("query", "feature", "distance", "feature.anchor", "internals", "direction", + "filter.attribute", "attribute.value", "show.attributes")] # replaye "start,center,end" position by "any_pos" config.query$feature.anchor <- sapply(config.query$feature.anchor, function(x) if(length(x)==3){return("any_pos")}else{return(x)}) mtext("UROPA summary", side=3, line=0,outer=FALSE, cex=2) - mtext(paste0("There were ", num.peaks, " peaks in the input bed file,\nUROPA annotated ", anno.peaks, " peaks\n"),side=3, line=-3,outer=FALSE, cex=.7) + mtext(paste0("There were ", num.peaks, " peaks in the input bed file,\nUROPA annotated ", anno.peaks, " peaks\n"), + side=3, line=-3,outer=FALSE, cex=.7) if(num.queries != nrow(config.query)){ - mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), side=3, line=-5,outer=FALSE, cex=.7) + mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), + side=3, line=-5,outer=FALSE, cex=.7) } - mytheme <- ttheme_default(core = list(fg_params=list(cex = 0.5)),colhead = list(fg_params=list(cex = 0.5)),rowhead = list(fg_params=list(cex = 0.5))) + mytheme <- ttheme_default(core = list(fg_params=list(cex = 0.5)),colhead = list(fg_params=list(cex = 0.5)), + rowhead = list(fg_params=list(cex = 0.5))) config.query <- data.frame(lapply(config.query, as.character), stringsAsFactors=FALSE) g <- tableGrob(format(config.query), theme=mytheme,rows=NULL) grid.draw(g) @@ -183,16 +205,6 @@ features <- c() mtext(plot1, cex=1, adj=0, padj=1) # plot - y.lim <- round(median(df.uropa.final[,"distance"]) + (max(df.uropa.final[,"distance"])/15)) - if(y.lim > max(df.uropa.final[,"distance"]) || y.lim > 10000){ - if(median(df.uropa.final[,"distance"]) + 5000 > max(df.uropa.final[,"distance"])){ - y.lim <- median(df.uropa.final[,"distance"]) - } else { - y.lim <- median(df.uropa.final[,"distance"]) + 5000 - } - - } - density <- subset(df.uropa.final[,c("distance","feature")], (df.uropa.final[,"distance"] 1){ plot3 <- paste0("3. Allocation of available features in finalhits:", @@ -222,7 +234,6 @@ features <- c() mtext(plot3, cex=1, adj=0, padj=1) .plot.feature.distribution(df.uropa=df.uropa.final,header="Feature distribution across finalhits") } - return(df.uropa.final) } @@ -234,7 +245,7 @@ features <- c() ## basic summary -- only one query definded if (is.null(opt$besthits)) { df.uropa.final <- .basic.summary(opt$finalhits, opt$config, opt$output) - dev.off() + invisible(dev.off()) } else { suppressPackageStartupMessages(library(VennDiagram)) @@ -276,10 +287,12 @@ if (is.null(opt$besthits)) { } } - df.distance.query <- subset(df.uropa.best.per.query[,c("feature","distance","query")], (df.uropa.best.per.query[,"distance"] < considered.distance)) + df.distance.query <- subset(df.uropa.best.per.query[,c("feature","distance","query")], + (df.uropa.best.per.query[,"distance"] < considered.distance)) max.distance.query <- round(max(as.numeric(df.distance.query[,"distance"]))) bin.width <- round(max.distance.query/20) - dpq <- qplot(df.distance.query[,2],data =df.distance.query, facets=query~feature, geom="histogram", binwidth=bin.width, xlab = "Distance to feature", ylab = "Total count") + dpq <- qplot(df.distance.query[,2],data =df.distance.query, facets=query~feature, + geom="histogram", binwidth=bin.width, xlab = "Distance to feature", ylab = "Total count") print(dpq + ggtitle("Distance of query vs. feature across besthits Hits")) @@ -342,9 +355,11 @@ if (is.null(opt$besthits)) { } else if(anno.compare.query == matches || anno.initial.query ==matches){ dist <- c(0.02, -0.02, -0.02) } - venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, matches, anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])), - lwd = 2, lty ="solid", col = c("black", "red","blue"), fill = c("white","red","blue"), cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE, - cat.cex =1, cat.default.pos= "outer", alpha = c(0,.4,.4), euler.d=TRUE, scaled=TRUE) + venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, matches, + anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])), + lwd = 2, lty ="solid", col = c("black", "red","blue"), fill = c("white","red","blue"), + cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE,cat.cex =1, + cat.default.pos= "outer", alpha = c(0,.4,.4), euler.d=TRUE, scaled=TRUE) grid.draw(venn.plot) mtext(paste0("Peak based pairwise compare of ", paste0("query",queries[q]), " and ", paste0("query",queries[c])), side=3, line=0,outer=FALSE, cex=1) popViewport() @@ -356,63 +371,72 @@ if (is.null(opt$besthits)) { # plot 8 # only up to five queries because of Vennerable package support - if(num.queries>2 && num.queries<=5){ - - suppressPackageStartupMessages(library(Vennerable)) - tryCatch({ - plot8 <- paste0("8: Comparison of all specified queries:", - "\n\nThe following Chow Ruskey plot compares all queries\nbased on the besthits.", - "\nIt represents an area-proportional Venn diagram,\nrevealing the distribution of peaks that could be annotated\nper query and works for up to 5 queries.", - "\n\nAdditional Info:\nIt is evalueated whether peaks are annotated,\nbut not whether they are annotated for the same feature") - grid.newpage() - mtext(plot8, cex=1, adj=0, padj=1) - - num <- venn(peaks.per.query)[,"num"] - if(num[length(num)] == 0){ - num <- num+1 - } - # compute distribution - v <- Venn(SetNames=paste0("query",queries), Weight=num) - cv <- compute.Venn(v, type="ChowRuskey", doWeights=TRUE) - - # change FaceText size and line thickness - gp <- VennThemes(cv, colourAlgorithm = "signature") - gp[["FaceText"]] <- lapply(gp[["FaceText"]],function(gps){gps$fontsize<-10;gps}) - gp[["Set"]] <- lapply(gp[["Set"]],function(gps){gps$lwd<-.8;gps}) - - #change label positions - SetLabels <- VennGetSetLabels(cv) - max.x <- max(VennGetUniverseRange(cv)[,"x"]) - max.y <- max(VennGetUniverseRange(cv)[,"y"]) - for(i in 1:nrow(SetLabels)){ - SetLabels[i,"x"] <- max.x+2 - SetLabels[i,"hjust"] <- "right" - if(i==1){ - SetLabels[i,"y"] <- max.y/3 - } else { - SetLabels[i,"y"] <- (as.numeric(SetLabels[i-1,"y"])-2)-(2.5*num.queries) - } - } - cv <- VennSetSetLabels(cv,SetLabels) - - # change face label positions - not yet implemented in original package - facelabel <- as.data.frame(VennGetFaceLabels(cv)) - face.labels <- nrow(facelabel) - for(i in 1:(face.labels-2)){ - facelabel[i,"x"] <- 1.15*facelabel[i,"x"] - facelabel[i,"y"] <- 1.15*facelabel[i,"y"] - } - cv <- VennSetFaceLabels(cv,facelabel) - - # draw plot - pushViewport(viewport(x=.5, y=0.5, width=.8, height=.7)) - grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1.5, "npc"), height = unit(1.5, "npc")) - venn.draw <- plot(cv, gp=gp) - grid.draw(venn.draw) - mtext("Chow Ruskey comparison of all peaks annotated with UROPA", side=3, line=0,outer=FALSE, cex=1) - }, error = function(e){ - cat("\nChowRuskey plot was invalid\n") - }) + if(requireNamespace("Vennerable", quietly = TRUE)){ + if(num.queries>2 && num.queries<=5){ + + suppressPackageStartupMessages(library(Vennerable)) + tryCatch({ + plot8 <- paste0("8: Comparison of all specified queries:", + "\n\nThe following Chow Ruskey plot compares all queries\nbased on the besthits.", + "\nIt represents an area-proportional Venn diagram,\nrevealing the distribution of peaks that could be annotated\nper query and works for up to 5 queries.", + "\n\nAdditional Info:\nIt is evalueated whether peaks are annotated,\nbut not whether they are annotated for the same feature") + grid.newpage() + mtext(plot8, cex=1, adj=0, padj=1) + + num <- venn(peaks.per.query)[,"num"] + if(num[length(num)] == 0){ + num <- num+1 + } + # compute distribution + v <- Venn(SetNames=paste0("query",queries), Weight=num) + cv <- compute.Venn(v, type="ChowRuskey", doWeights=TRUE) + + # change FaceText size and line thickness + gp <- VennThemes(cv, colourAlgorithm = "signature") + gp[["FaceText"]] <- lapply(gp[["FaceText"]],function(gps){gps$fontsize<-10;gps}) + gp[["Set"]] <- lapply(gp[["Set"]],function(gps){gps$lwd<-.8;gps}) + + #change label positions + SetLabels <- VennGetSetLabels(cv) + max.x <- max(VennGetUniverseRange(cv)[,"x"]) + max.y <- max(VennGetUniverseRange(cv)[,"y"]) + for(i in 1:nrow(SetLabels)){ + SetLabels[i,"x"] <- max.x+2 + SetLabels[i,"hjust"] <- "right" + if(i==1){ + SetLabels[i,"y"] <- max.y/3 + } else { + SetLabels[i,"y"] <- (as.numeric(SetLabels[i-1,"y"])-2)-(2.5*num.queries) + } + } + cv <- VennSetSetLabels(cv,SetLabels) + + # change face label positions - not yet implemented in original package + facelabel <- as.data.frame(VennGetFaceLabels(cv)) + face.labels <- nrow(facelabel) + for(i in 1:(face.labels-2)){ + facelabel[i,"x"] <- 1.15*facelabel[i,"x"] + facelabel[i,"y"] <- 1.15*facelabel[i,"y"] + } + cv <- VennSetFaceLabels(cv,facelabel) + + # draw plot + pushViewport(viewport(x=.5, y=0.5, width=.8, height=.7)) + grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1.5, "npc"), height = unit(1.5, "npc")) + venn.draw <- plot(cv, gp=gp) + grid.draw(venn.draw) + mtext("Chow Ruskey comparison of all peaks annotated with UROPA", side=3, line=0,outer=FALSE, cex=1) + }, error = function(e){ + cat("\nChowRuskey plot was invalid do upsetR plot\n") + library(UpSetR,quietly = TRUE) + upset(fromList(peaks.per.query), keep.order=TRUE,nintersects = NA,empty.intersections = "on",number.angles=35,mainbar.y.label = "Peak Intersections", sets.x.label = "Annotated Genes") + #nsets=(ncol(combn(num.queries,2))+num.queries) + }) + } + } else { + library(UpSetR,quietly = TRUE) + upset(fromList(peaks.per.query), keep.order=TRUE, nintersects = NA,empty.intersections = "on",number.angles=35,mainbar.y.label = "Gene Intersections", sets.x.label = "Annotated Genes") + } - dev.off() + invisible(dev.off()) }