Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
UROPA/summary.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
398 lines (354 sloc)
17.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
# plot 1: Distance of features based on finalhits | |
# plot 2: Genomic location of annotated peaks based on finalhits | |
# plot 3 - only if more than one feature is present: Feature distribution across finalhits | |
## following plots only if more than one query is defined | |
# plot 4: Distance of query vs. feature based on best per query hits | |
# plot 5: Genomic location of annotated peaks based on best per query hits | |
# plot 6 - only if more than one feature is present: Feature distribution across best per query hits | |
# plot 7: pairwise comparison of queries with regard to all peaks based on best per query hits | |
# plot 8 - only if three to five queries are defined: Chow-Ruskey diagram based on best per query hits | |
# used packages | |
suppressPackageStartupMessages(library(ggplot2)) | |
suppressPackageStartupMessages(library(jsonlite)) | |
suppressPackageStartupMessages(library(gridExtra)) | |
suppressPackageStartupMessages(library(grid)) | |
# script gets arguments | |
args <- commandArgs(TRUE) | |
# basic information independent if there are 3 or 4 input arguments, used otherwhere as .basic.summary | |
num.features <- 0 | |
features <- c() | |
# reformat every row of the config.query file to string for cover page | |
.print.query <- function(row){ | |
r <- paste(unname(as.vector(unlist(row))), collapse=" ") | |
return(r) | |
} | |
# A helper function to define a region on the layout for the pie charts | |
.define_region <- function(f){ | |
if(f %% 2 != 0 ){ | |
grid.newpage() | |
pushViewport(viewport(layout = grid.layout(2, 1), width=1, height=1)) | |
viewport(layout.pos.row = 1, layout.pos.col = 1) | |
} else { | |
viewport(layout.pos.row = 2, layout.pos.col = 1) | |
} | |
} | |
# helper for plot 2 and 5 (counts the occurence of each loci for pie charts) | |
.plot.genomic.location.per.feature.helper <- function(f, df.uropa, pie.basic){ | |
feat <- features[f] | |
df.feature <- subset(df.uropa, df.uropa$feature==feat) | |
unique.loci <- sort(unique(df.feature$genomic_location)) | |
occurence.loci <- c() | |
for(j in 1:length(unique.loci)){ | |
loci <- as.character(unique.loci[j]) | |
occurence <- as.numeric(length(grep(loci, df.feature$genomic_location))) | |
occurence.loci <- c(occurence.loci, occurence) | |
} | |
df.pie.full <- data.frame(location=unique.loci, value=occurence.loci) | |
min.occurence <- round(nrow(df.feature)/100*0.1) | |
df.pie <- df.pie.full[df.pie.full$value>min.occurence,] | |
df.tmp <- df.pie.full[df.pie.full$value<=min.occurence,] | |
df.others <- data.frame(location="others", value=sum(df.tmp$value)) | |
if(sum(df.tmp$value)>min.occurence){ | |
df.pie <- rbind(df.pie,df.others) | |
} | |
# calculate % oc loci representation | |
perc <- round(df.pie$value/sum(df.pie$value)*100,1) | |
df.pie$location <- paste0(df.pie$location, " (",perc, "%)") | |
# now background | |
blank_theme <- theme_minimal()+ theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold")) | |
# title | |
main <- paste0("Genomic location of '",feat, "' across ",pie.basic) | |
pie <- ggplot(df.pie, aes(x="", y=value, fill=location)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + blank_theme + theme(axis.text.x=element_blank()) + geom_text(aes(y = value/length(value) + c(0, cumsum(value)[-length(value)]), label = rep("",nrow(df.pie))), size=2, nudge_x = 0.7, nudge_y = 0.7) + ggtitle(main) + theme(plot.title = element_text(size = 10, face = "bold", , vjust=-10)) | |
print(pie, vp=.define_region(f)) | |
} | |
# plot 2 and plot 5 | |
.plot.genomic.location.per.feature <- function(df.uropa,pie.basic){ | |
# layout of plot | |
# if there is more than one feature, there should two columns and the according number of rows | |
for(f in 1:num.features){ | |
.plot.genomic.location.per.feature.helper(f, df.uropa, pie.basic) | |
} | |
} | |
# plot 3 and 6 | |
.plot.feature.distribution <- function(df.uropa, header){ | |
if(num.features>1){ | |
occurence.features <- c() | |
for(f in 1:num.features){ | |
feat <- features[f] | |
o <- nrow(subset(df.uropa, df.uropa$feature==feat)) | |
occurence.features <- c(occurence.features,o) | |
} | |
names(occurence.features) <- features | |
barplot(occurence.features, ylab="occurence",main=header) | |
} | |
} | |
# load finalhits file, create coverpage, and calculate basic plots | |
.basic.summary <- function(final.hits, conf, out){ | |
pdf(file=out) | |
plot.new() | |
df.uropa.final <- read.table(final.hits, header=TRUE, sep="\t",stringsAsFactors = FALSE) | |
# number of peaks annoteted with uropa run | |
num.peaks <- length(unique(df.uropa.final$peak_id)) | |
# stats is based on annoted peaks -> remove na rows | |
df.uropa.final[,"distance"] <- as.numeric(df.uropa.final[,"distance"]) | |
df.uropa.final <- df.uropa.final[complete.cases(df.uropa.final),] | |
anno.peaks <- nrow(df.uropa.final) | |
# get infos from config for overview page | |
config <- fromJSON(conf) | |
config.query <- as.data.frame(config$queries) | |
config.cols <- colnames(config.query) | |
priority <- config$priority | |
# queries of uropa annotation run | |
queries <- sort.int(as.numeric(unique(df.uropa.final$query))) | |
num.queries <- length(queries) | |
queries <- sprintf("%02d", queries) | |
features <<- as.character(unique(df.uropa.final$feature)) | |
num.features <<- length(features) | |
if(grepl("T",priority) && num.queries>1){ | |
priority <- paste0(priority, "\nNote: No pairwise comparisons and Chow Ruskey plot (no overlaps)") | |
} | |
# add query to query data frame | |
config.query$query <- paste0("query",sprintf("%02d",0:(nrow(config.query)-1))) | |
config.query <- config.query[,c("query", "feature", "distance", "feature.anchor","internals", "strand","direction","filter.attribute", "attribute.value","show.attributes")] | |
# replaye "start,center,end" position by "any_pos" | |
config.query$feature.anchor <- sapply(config.query$feature.anchor, function(x) if(length(x)==3){return("any_pos")}else{return(x)}) | |
mtext("UROPA summary", side=3, line=0,outer=FALSE, cex=2) | |
mtext(paste0("There were ", num.peaks, " peaks in the input bed file,\nUROPA annotated ", anno.peaks, " peaks\n"),side=3, line=-3,outer=FALSE, cex=.7) | |
if(num.queries != nrow(config.query)){ | |
mtext(paste("Only queries", paste(queries, collapse=","), "are represented in the FinalHits", sep=" "), side=3, line=-5,outer=FALSE, cex=.7) | |
} | |
mytheme <- ttheme_default(core = list(fg_params=list(cex = 0.5)),colhead = list(fg_params=list(cex = 0.5)),rowhead = list(fg_params=list(cex = 0.5))) | |
config.query <- data.frame(lapply(config.query, as.character), stringsAsFactors=FALSE) | |
g <- tableGrob(format(config.query), theme=mytheme,rows=NULL) | |
grid.draw(g) | |
mtext(paste0("priority: ", priority), cex=.7,side=1, line=-2) | |
input <- paste("Input:",unlist(config$bed),collapse=" ") | |
anno <- paste("Anno:",unlist(config$gtf),collapse=" ") | |
input.anno <- paste(input,anno,sep="\n") | |
mtext(input.anno,cex=.6, side=1, line=0) | |
# plot 1 | |
# description | |
plot1 <- paste0("1. Distances of annotated peaks in finalhits:", | |
"\n\nThe following density plot displays the distance of anntotated peaks\nto its feature(s) based on the finalhits.\n\n", | |
"Additional Info:\nThis is independent of the number of queries,\nall features present in the finalhits are displayed.", | |
"\n\n\nNote on output files:\n", | |
"\nallhits: All candidate features resulting from any query\n(1 peak : x queries : y annotations)", | |
"\n\nbesthits: All candidate features resulting from any query\n(1 peak : x queries : 1 annotation)", | |
"\n\nfinalhits: Only the one best feature among all queries\n(1 peak : 1 query : 1 annotation)") | |
grid.newpage() | |
mtext(plot1, cex=1, adj=0, padj=1) | |
# plot | |
y.lim <- round(median(df.uropa.final[,"distance"]) + (max(df.uropa.final[,"distance"])/15)) | |
if(y.lim > max(df.uropa.final[,"distance"]) || y.lim > 10000){ | |
if(median(df.uropa.final[,"distance"]) + 5000 > max(df.uropa.final[,"distance"])){ | |
y.lim <- median(df.uropa.final[,"distance"]) | |
} else { | |
y.lim <- median(df.uropa.final[,"distance"]) + 5000 | |
} | |
} | |
density <- subset(df.uropa.final[,c("distance","feature")], (df.uropa.final[,"distance"]<y.lim)) | |
dpq <- qplot(distance,data=density, geom="density", color=feature, xlab = "Distance to feature", ylab = "Relative count") | |
print(dpq + ggtitle("Distance to features across finalhits")) | |
# plot 2 | |
# description | |
plot2 <- paste0("2. Relative locations of annotated peaks to features in finalhits:", | |
"\n\nThe following pie chart(s) illustrate the relativ location of the peaks\nin relation the respective annotated feature as represented in the finalhits", | |
"\nThe best feature found amoung all of the queries is used for this plot.", | |
"\n\nAdditional Info:\nThis is independent of the number of queries,\nall represented features of the finalhits are displayed.") | |
grid.newpage() | |
mtext(plot2 ,cex=1, adj=0, padj=1) | |
# plot | |
.plot.genomic.location.per.feature(df.uropa.final, "finalhits") | |
# plot 3 | |
# plot | |
if(num.features > 1){ | |
plot3 <- paste0("3. Allocation of available features in finalhits:", | |
"\n\nBar plot displaying the occurrence of the different features if there is", | |
"\nmore than one feature assigned for peak annotation based on the finalhits.", | |
"\nThe best annotation found amoung all of the queries is used for this plot.", | |
"\n\nAdditional Info:\nThis is independent of the number of queries;", | |
"\nall features present in the finalhits are displayed.", | |
"\n\nIf only one feature is present, this plot will be skipped.") | |
grid.newpage() | |
mtext(plot3, cex=1, adj=0, padj=1) | |
.plot.feature.distribution(df.uropa=df.uropa.final,header="Feature distribution across finalhits") | |
} | |
return(df.uropa.final) | |
} | |
# arg 1 = finalhits | |
# arg 2 = summery config | |
# arg 3 = output pdf | |
# arg 4 = besthits | |
if(length(args)==3){ | |
## basic summary -- only one query definded | |
df.uropa.final <- .basic.summary(args[1],args[2], args[3]) | |
dev.off() | |
} else if(length(args)==4){ | |
suppressPackageStartupMessages(library(VennDiagram)) | |
suppressPackageStartupMessages(library(gplots)) | |
## increased summary -- there is more than one query definded | |
df.uropa.final <- .basic.summary(args[1],args[2], args[3]) | |
df.uropa.best.per.query <- read.table(args[4], header=TRUE, sep="\t",stringsAsFactors = FALSE) | |
num.peaks <- length(unique(df.uropa.final$peak_id)) | |
df.uropa.best.per.query[,"distance"] <- as.numeric(df.uropa.best.per.query[,"distance"]) | |
df.uropa.best.per.query <- df.uropa.best.per.query[complete.cases(df.uropa.best.per.query),] | |
features <<- as.character(unique(df.uropa.best.per.query$feature)) | |
num.features <<- length(features) | |
queries <- sort.int(as.numeric(unique(df.uropa.best.per.query$query))) | |
queries <- sprintf("%02d", queries) | |
num.queries <- length(queries) | |
# plot 4 | |
# description | |
plot4 <- paste0("4. Distances of annotated peaks seperated for features and queries in besthits:", | |
"\n\nThe distribution of the distances per feature per query", | |
"\nis displayed in histograms based on the besthits.", | |
"\n\nAdditional Info:\nThis is dependent on the number of queries;", | |
"\nall features present in any query are displayed.") | |
grid.newpage() | |
mtext(plot4, cex=1, adj=0, padj=1) | |
# plot | |
# get max distance to calculate binwidth | |
dist <- df.uropa.best.per.query[,"distance"] | |
median.uropa.best <- median(dist) | |
max.uropa.best <- max(dist) | |
considered.distance <- median.uropa.best + round(max.uropa.best/15) | |
if(considered.distance > max.uropa.best || considered.distance>10000){ | |
if(median.uropa.best + 5000 > max.uropa.best){ | |
considered.distance <- median.uropa.best | |
} else { | |
considered.distance <- median.uropa.best + 5000 | |
} | |
} | |
df.distance.query <- subset(df.uropa.best.per.query[,c("feature","distance","query")], (df.uropa.best.per.query[,"distance"] < considered.distance)) | |
max.distance.query <- round(max(as.numeric(df.distance.query[,"distance"]))) | |
bin.width <- round(max.distance.query/20) | |
dpq <- qplot(df.distance.query[,2],data =df.distance.query, facets=query~feature, geom="histogram", binwidth=bin.width, xlab = "Distance to feature", ylab = "Total count") | |
print(dpq + ggtitle("Distance of query vs. feature across besthits Hits")) | |
# plot 5 | |
# description | |
plot5 <- paste0("5. Relative locations of annotated peaks in besthits:", | |
"\n\nThe following pie chart(s) illustrate the relativ location of\nthe peaks in relation the respective annotated feature as represented in the besthits.\n\n", | |
"Additional Info:\nThis is dependent on the number of queries;\nall features present in the besthits are displayed.") | |
grid.newpage() | |
mtext(plot5, cex=1, adj=0, padj=1) | |
# plot | |
.plot.genomic.location.per.feature(df.uropa.best.per.query, "besthits Hits") | |
# plot 6 | |
if(num.features > 1){ | |
plot6 <- paste0("6. Allocation of available featurs in besthits:", | |
"\n\nBar plot displaying the occurrence of the different features if there is more", | |
"\nthan one feature assigned for peak annotation based on the besthits.", | |
"\nThe best annotation found in each query is used for this plot", | |
"\n\nAdditional Info:\nThis is independent of the number of queries,\nall features present in the besthits are displayed.", | |
"\n\nIf only one feature is present,", | |
"\nthis plot will be skipped.") | |
grid.newpage() | |
mtext(plot6, cex=1, adj=0, padj=1) | |
.plot.feature.distribution(df.uropa=df.uropa.best.per.query,header="Feature distribution across besthits Hits") | |
} | |
# plot 7 | |
# description | |
plot7 <- paste0("7: Pairwise comparisons of query annotations:", | |
"\n\nThe following venn diagrams display peak based pairwise comparisons\namong all queries based on the besthits.", | |
"\n\nAdditional Info:\nIf only one query is present,", | |
"\nthis plot will be skipped.") | |
grid.newpage() | |
mtext(plot7, cex=1, adj=0, padj=1) | |
# get annotated peaks unique for each query | |
peaks.per.query <- list() | |
for(q in 1:num.queries){ | |
peaks.per.query[[q]] <- unique(df.uropa.best.per.query[as.numeric(df.uropa.best.per.query$query)==as.numeric(queries[q]),"peak_id"]) | |
names(peaks.per.query)[[q]] <- paste0("query",queries[q]) | |
} | |
# number of pairwise compares: (n-1)n/2 with n = # queries | |
# iterater for pairwise compare | |
tmp.num.query <- 1 | |
all.peaks <- unique(df.uropa.best.per.query$peak_id) | |
for(q in 1:(num.queries-1)){ | |
initial.query <- peaks.per.query[[paste0("query",queries[q])]] | |
if(tmp.num.query < num.queries){ | |
for(c in num.queries:tmp.num.query){ | |
if(queries[q] != queries[c]){ | |
compare.query <- peaks.per.query[[paste0("query",queries[c])]] | |
venn.object <- venn(list(initial.query, compare.query)) | |
pushViewport(viewport(x=.5, y=0.5, width=.72, height=.72)) | |
grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1, "npc"), height = unit(1, "npc")) | |
anno.initial.query <- as.numeric(venn.object["01","num"])+ as.numeric(venn.object["11","num"]) | |
anno.compare.query <- as.numeric(venn.object["10","num"])+ as.numeric(venn.object["11","num"]) | |
matches <- as.numeric(venn.object["11","num"]) | |
#grid.rect(fill="white") | |
if(anno.initial.query != matches && anno.compare.query != matches){ | |
dist <- 0.02 | |
} else if(anno.compare.query == matches || anno.initial.query ==matches){ | |
dist <- c(0.02, -0.02, -0.02) | |
} | |
venn.plot <- draw.triple.venn(num.peaks, anno.initial.query, anno.compare.query, anno.initial.query, matches, anno.compare.query, matches, category =c("all",paste0("query",queries[q]),paste0("query",queries[c])), | |
lwd = 2, lty ="solid", col = c("black", "red","blue"), fill = c("white","red","blue"), cex=1, cat.pos = c(0,0,0), cat.dist = dist, reverse = FALSE, | |
cat.cex =1, cat.default.pos= "outer", alpha = c(0,.4,.4), euler.d=TRUE, scaled=TRUE) | |
grid.draw(venn.plot) | |
mtext(paste0("Peak based pairwise compare of ", paste0("query",queries[q]), " and ", paste0("query",queries[c])), side=3, line=0,outer=FALSE, cex=1) | |
popViewport() | |
} | |
} | |
} | |
tmp.num.query <- tmp.num.query+1 | |
} | |
# plot 8 | |
# only up to five queries because of Vennerable package support | |
if(num.queries>2 && num.queries<=5){ | |
suppressPackageStartupMessages(library(Vennerable)) | |
tryCatch({ | |
plot8 <- paste0("8: Comparison of all specified queries:", | |
"\n\nThe following Chow Ruskey plot compares all queries\nbased on the besthits.", | |
"\nIt represents an area-proportional Venn diagram,\nrevealing the distribution of peaks that could be annotated\nper query and works for up to 5 queries.", | |
"\n\nAdditional Info:\nIt is evalueated whether peaks are annotated,\nbut not whether they are annotated for the same feature") | |
grid.newpage() | |
mtext(plot8, cex=1, adj=0, padj=1) | |
num <- venn(peaks.per.query)[,"num"] | |
if(num[length(num)] == 0){ | |
num <- num+1 | |
} | |
# compute distribution | |
v <- Venn(SetNames=paste0("query",queries), Weight=num) | |
cv <- compute.Venn(v, type="ChowRuskey", doWeights=TRUE) | |
# change FaceText size and line thickness | |
gp <- VennThemes(cv, colourAlgorithm = "signature") | |
gp[["FaceText"]] <- lapply(gp[["FaceText"]],function(gps){gps$fontsize<-10;gps}) | |
gp[["Set"]] <- lapply(gp[["Set"]],function(gps){gps$lwd<-.8;gps}) | |
#change label positions | |
SetLabels <- VennGetSetLabels(cv) | |
max.x <- max(VennGetUniverseRange(cv)[,"x"]) | |
max.y <- max(VennGetUniverseRange(cv)[,"y"]) | |
for(i in 1:nrow(SetLabels)){ | |
SetLabels[i,"x"] <- max.x+2 | |
SetLabels[i,"hjust"] <- "right" | |
if(i==1){ | |
SetLabels[i,"y"] <- max.y/3 | |
} else { | |
SetLabels[i,"y"] <- (as.numeric(SetLabels[i-1,"y"])-2)-(2.5*num.queries) | |
} | |
} | |
cv <- VennSetSetLabels(cv,SetLabels) | |
# change face label positions - not yet implemented in original package | |
facelabel <- as.data.frame(VennGetFaceLabels(cv)) | |
face.labels <- nrow(facelabel) | |
for(i in 1:(face.labels-2)){ | |
facelabel[i,"x"] <- 1.15*facelabel[i,"x"] | |
facelabel[i,"y"] <- 1.15*facelabel[i,"y"] | |
} | |
cv <- VennSetFaceLabels(cv,facelabel) | |
# draw plot | |
pushViewport(viewport(x=.5, y=0.5, width=.8, height=.7)) | |
grid.rect(gp=gpar(fill="white",lty = "blank"),width = unit(1.5, "npc"), height = unit(1.5, "npc")) | |
venn.draw <- plot(cv, gp=gp) | |
grid.draw(venn.draw) | |
mtext("Chow Ruskey comparison of all peaks annotated with UROPA", side=3, line=0,outer=FALSE, cex=1) | |
}, error = function(e){ | |
cat("\nChowRuskey plot was invalid\n") | |
}) | |
} | |
dev.off() | |
} else { | |
stop("Usage: Rscript summary.R finalhits.txt summary_config.json summary.pdf besthits.txt") | |
} |