Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#!/usr/bin/env Rscript
##################################################################
## ##
## RobusTAD.R ##
## Rola Dali, Guillaume Bourque, Mathieu Blanchette ##
## Feb 2018 ##
## v1.0 ##
## ##
##################################################################
## The following script produces TAD Boundary scores for each bin on a chromosome.
## Input: interaction frequency matrix for a chromosome
## Output: 2 files: I- file with TAD Boundary scores: Right Boundary scores, Left Boundary scores and Final combined score (max(R, L)).
## II- file with TAD boundary calls identified by looking for local maxima above the set threshold
####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ settings:
options(scipen=999)
library("optparse")
####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ functions:
leftRightBoundaryScores <- function(data, minW, maxW, minRatio) {
## function that calculates left and right boundary scores for each bin
s = dim(data)[1]
datanorm <- matrix(0,dim(data)[1],dim(data)[2])
if(status == "raw"){
## normalize if data is raw, skip otherwise
print("... ... Normalizing Data")
sumrow <- matrix(0,dim(data)[1],1)
for (i in 1:s) {
sumrow[i,1] = sum(data[i,])
}
ssum = sum(sumrow)
for (i in 1:s) {
for (j in 1:s) {
datanorm[i,j]=data[i,j]/(sumrow[i,1]*sumrow[j,1])*ssum;
}
}
} else {
print("... ... skipping normalization step!")
datanorm <- data
}
print("... ... Calculating Boundary Scores")
leftBoundaryScore=matrix(1,s);
rightBoundaryScore=matrix(1,s);
for (i in (maxW+2):(s-maxW-2)) {
rightScore=0;
leftScore =0;
leftBoundaryScore[i+1,1]=-99999;
rightBoundaryScore[i+1,1]=-99999;
diag=matrix(NA,maxW,maxW);
for (d in 1:maxW) {
maxiPossible = (d^3 + 3 * d^2 + 2* d)/6;
for (j in 1:d) {
diag[d,j] = datanorm[i-j+1,i-j+1+d];
}
for (del in 1:d) {
diff = sum(datanorm[i-d,i-d+del]/(diag[del,]+0.001)>minRatio, na.rm=T) - sum(datanorm[i-d,i-d+del]/(diag[del,]+0.001)<1.0/minRatio, na.rm=T)
rightScore = rightScore + diff;
}
if (d>=minW) {
z = rightScore/sqrt(maxiPossible);
# note: the score calculated pertains to the presence of a boundary on the right end of bin i. However it is more convenient to report scores relating to the left end of bins, so we associate this score to bin i+1 instead of i.
if (z>rightBoundaryScore[i+1,1]) {
rightBoundaryScore[i+1,1]=z;
}
}
for (del in 1:d) {
diff = sum(datanorm[i+1+d-del,i+1+d]/(diag[del,]+0.001)>minRatio, na.rm=T) - sum(datanorm[i+1+d-del,i+1+d]/(diag[del,]+0.001)<1.0/minRatio, na.rm=T)
leftScore = leftScore + diff;
}
if (d>=minW) {
z = leftScore/sqrt(maxiPossible);
# note: the score calculated pertains to the presence of a boundary on the right end of bin i. However it is more convenient to report scores relating to the left end of bins, so we associate this score to bin i+1 instead of i.
if (z>leftBoundaryScore[i+1,1]) leftBoundaryScore[i+1,1]=z;
}
}
}
return(cbind(leftBoundaryScore,rightBoundaryScore));
}
localpeaks <- function(score){
## locates peaks across the TAD score profile
index <- c()
for (i in 3:(length(score)-2)){
if(score[i] > score[i-2] && score[i] > score[i-1] && score[i] > score[i+1] && score[i] > score[i+2]){
index <- c(index, i)
}
}
return(index)
}
TADBoundcalls <- function(boundaries, threshold){
## calls TAD Boundaries using a threshold T by locating the maxima that are above the threshold
BoundIndexR <- localpeaks(boundaries[,3])
BoundIndexL <- localpeaks(boundaries[,2])
BoundIndex <- unique(sort(c(BoundIndexR, BoundIndexL)))
Boundmaxima <- boundaries[BoundIndex,]
TADcalls <- Boundmaxima[Boundmaxima[,4] >= threshold,]
return(TADcalls)
}
####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ main:
## parse command line arguments:
option_list = list(
make_option(c("-i", "--input"), type="character", default=NULL,
help="Interaction Frequency Matrix. Must be a square matrix: number of columns = number of rows"),
make_option(c("-H", "--header"), action = "store_true", default=TRUE,
help="include -H if input contains a header/column names"),
make_option(c("-n", "--norm"), action = "store", default ="raw",
help="indicates if IF matrix is raw or normalized [default = %default]; [options: {raw, norm}]"),
make_option(c("-o", "--outDir"), type="character", default=".",
help="output directory name"),
make_option(c("-b", "--binsize"), type="integer", default=50,
help="binsize or resolution used in Hi-C analysis in kb [default = %default]"),
make_option(c("-r", "--minRatio"), type="double", default=1.5,
help="minimum ratio of Within to Across IF values to contribute to boundary score calculation [default = %default]"),
make_option(c("-w", "--minWin"), type="integer", default=250,
help="minimum window around the bin used to calculate the TAD score in kb [default = %default]"),
make_option(c("-W", "--maxWin"), type="integer", default=500,
help="maximum window around the bin used to calculate the TAD score in kb [default = %default]"),
make_option(c("-T", "--threshold"), type="double", default=0.2,
help="data percentile of TAD scores used to calculate threshold in order to call significant TAD boundaries. [default = %default]; [options: 0-1];\n\t\tthe lower the threshold, the more stringent the TAD calls")
);
description = "\n\n==============================================================================================================================================================\n\nRobusTAD calculates TAD Boundary scores for each bin on a chromosome.\n\tInput: interaction frequency matrix for a chromosome.\n\tOutput: 2 files: \n\t\t\tI- file with TAD boundary scores (BoundaryScores_*): contains Right Boundary scores, Left Boundary scores and Final combined score (max(R, L)).\n\t\t\tII- file with TAD boundary calls (TADBoundaryCalls_*) identified by looking for local maxima above the set threshod.\n\nUsage: RobusTAD.R -i InputMatrix [options]\n\n==============================================================================================================================================================\n\n"
epilogue = "\n\n==============================================================================================================================================================\n\nRobusTAD is available under a GPL liscence and comes with no warranties @ https://github.com/rdali/RobusTAD\n\n==============================================================================================================================================================\n\n"
opt_parser = OptionParser(option_list=option_list, description = description, epilogue=epilogue);
opt = parse_args(opt_parser);
if (is.null(opt$input)){
print_help(opt_parser)
stop("Error! missing arguments... Interaction frequency matrix is required!\n", call.=FALSE)
}
mymatrix <- opt$input
minRatio <- as.numeric(opt$minRatio)
binSize <- as.numeric(opt$binsize)
minWin <- as.numeric(opt$minWin)
maxWin <- as.numeric(opt$maxWin)
perc <- as.numeric(opt$threshold)
outDir <- opt$outDir
status <- opt$norm
header <- opt$header
if(!file.exists(mymatrix)){
stop ("Error! provided matrix file does not exist")
}
##create outDir if it doesnt exist:
if (!file.exists(outDir)) {
dir.create(outDir)
}
if(status != "raw" & status != "norm" ){
stop ("Error! -n must be set to 'raw' or 'norm'")
}
if(perc < 0 | perc > 1){
stop ("Error! --threshold should be set to a value between 0 and 1!")
}
minW <- as.integer(minWin/binSize)
maxW <- as.integer(maxWin/binSize)
## extract base name of matrix to use in output files:
prefix <- basename(mymatrix)
prefix <- substr(prefix, 1, nchar(prefix)-4)
## read in matrix
if(header){
data <- as.matrix(read.table(mymatrix, header = TRUE, sep="\t", row.names = 1, as.is=TRUE))
} else {
data <- as.matrix(read.table(mymatrix, header = FALSE, sep="\t", as.is=TRUE))
}
## check that input file is a square matrix:
if(nrow(data) != ncol(data)){
stop ("Error! provided matrix is not a square matrix!")
}
boundaries = leftRightBoundaryScores(data, minW, maxW, minRatio)
rownames(boundaries) <- rownames(data)
colnames(boundaries) <- c("LeftBoundaryScore", "RightBoundaryScore")
## scale data to produce z-scores:
boundaries <- scale(boundaries, center = T, scale = T)
## calculate combined TAD score:
TADscore <- apply(boundaries, 1, max)
boundaries <- data.frame(boundaries, TADscore)
coordinates <- rownames(boundaries)
boundaries <- data.frame(boundaries, coordinates)
boundaries <- boundaries[,c(4,1,2,3)]
## call TAD boundaries:
print("... ... Calling Significant TAD Boundaries")
threshold <- quantile(boundaries$TADscore, na.rm = T, probs = (1-perc))[[1]]
TADcalls <- TADBoundcalls(boundaries, threshold)
## Save files:
write.table(boundaries, file.path(outDir, paste0("BoundaryScores_", prefix, "_binSize",binSize ,"_minW", minWin, "_maxW", maxWin,"_minRatio", minRatio,".txt")), quote = F, row.names=F)
write.table(TADcalls, file.path(outDir, paste0("TADBoundaryCalls_", prefix, "_binSize",binSize ,"_minW", minWin, "_maxW", maxWin,"_minRatio", minRatio,"_threshold",perc, ".txt")), quote = F, row.names=F)
print("... ... RobusTAD run is complete!")