diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..508df78 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.Rproj.user +.Rhistory diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..63122c4 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,18 @@ +Package: DPM +Type: Package +Title: Distance Precision Matrix +Version: 0.1.0 +Author: Mahsa Ghanbari, Julia lasserre +Maintainer: Mahsa Ghanbari +Description: It computes DPM and regularized DPM +License: MIT + file LICENSE +Imports: + corpcor +Suggests: + mvtnorm, + pcalg, + qpgraph, + reshape2 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.0.1 diff --git a/DPM.Rproj b/DPM.Rproj new file mode 100644 index 0000000..f0d6187 --- /dev/null +++ b/DPM.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace,vignette diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..f59fae7 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,10 @@ +# Generated by roxygen2: do not edit by hand + +export(Dcenter) +export(dpm) +export(fisher_test) +export(get_link) +export(nldata) +export(rdata) +export(reg.dpm) +import(corpcor) diff --git a/R/Dcenter.R b/R/Dcenter.R new file mode 100644 index 0000000..1d03a62 --- /dev/null +++ b/R/Dcenter.R @@ -0,0 +1,28 @@ +#' Function to compute double-centred distance matrices +#' +#' @param x is a vector containing the values of a variable +#' +#' @return Dcentered vactor for x +#' +#' @examples +#' Dcenter(c(1,2,3,4)) +#' +#' @export + +############################################################ +### Function to compute double-centred distance matrices ### +############################################################ + +Dcenter <- function(x) { + # x is a vector containing the values of a variable + + n <- length(x) # number of points in dimension of x + + # compute distance matrix a + a <- outer(x, x, function(x1, x2) abs(x1 - x2)) + + # normalise a + ahat <- a - matrix(rowMeans(a), nrow = n, ncol = n) - matrix(colMeans(a), nrow = n, ncol = n, byrow = TRUE) + mean(a) + + return(ahat) +} diff --git a/R/dpm.R b/R/dpm.R new file mode 100644 index 0000000..56acd57 --- /dev/null +++ b/R/dpm.R @@ -0,0 +1,35 @@ +#' Function to compute DPM +#' +#' @param data is a matrix containing samples in rows, variables in columns +#' +#' @return a matrix with score for every edge +#' +#' @import corpcor +#' +#' @examples +#' ## Generate non-linear data +#' data <- nldata(100,0.2)$data +#' +#'## run DPM +#' dpm(data) +#' +#' @export + +############################### +### Function to compute DPM ### +############################### + +dpm <- function(data) { + # data is a matrix containing samples in rows, variables in columns + + # compute the double-centred vectors of each variable and concatenate them into the columns of matrix d + d <- apply(as.matrix(data), 2, Dcenter) + + # compute the precision matrix of d (cor2pcor is a function from corpcor Library) + res <- corpcor::cor2pcor(cov(d)) + + return(res) +} + + + diff --git a/R/fisher_test.R b/R/fisher_test.R new file mode 100644 index 0000000..f27faf6 --- /dev/null +++ b/R/fisher_test.R @@ -0,0 +1,37 @@ +#' Function to compute fisher z transformation pvalues +#' +#' @param p a +#' @param alpha a +#' @param nsamp is the number of required samples +#' @param nnodes is the number of nodes in the simulated graph +#' +#' @return a list contains the pvalues (pval) and correspending decisions of having a significant link (isnotnull) for all links +#' +#' @examples +#' ## Generate non-linear data +#' data <- nldata(100,0.2)$data +#' +#'## Run regularized DPM +#' res <- reg.dpm(data) +#' +#' ## Compute pvalues +#' fisher_test(res, 0.05,) +#' +#' @export + +########################################################### +### Function to compute fisher z transformation pvalues ### +########################################################### + +fisher_test <- function(p, alpha, nsamp, nnodes){ + z = sqrt(nsamp-(nnodes-2)-3) * 0.5 * abs(log((1 + p) / (1 - p))) + pval = 2 * pnorm(-z) + pval_tr = qnorm(1-alpha/2) + isnotnull = (z > pval_tr ) + temp = exp( 2 * pval_tr / (sqrt(nsamp-(nnodes-2)-3)) ) + threshold = (temp -1)/(temp + 1) + return(list(pval=pval, isnotnull=isnotnull,threshold=threshold)) +} + + + diff --git a/R/get_link.R b/R/get_link.R new file mode 100644 index 0000000..5b0908b --- /dev/null +++ b/R/get_link.R @@ -0,0 +1,49 @@ +#' Converts the score matrix (obtained from DPM or reg-DPM) to a sorted list of non-directed links (links with higher score first). +#' +#' @param score_matrix score matrix obtained from DPM or regDPM. +#' @param top_ranked number of top links with higher score to report. The default value is NULL, i.e. all the links are reported. +#' @param threshold only links with a score above the threshold are reported. The default value is 0, i.e. all the links are reported. +#' +#' @return List of links in a data frame. Each line of the data frame corresponds to a link. The first and second columns are the corresponding nodes of the link and the third column is the score of the link. +#' +#' @examples +#'## Generate linear data with 10 nodes and 100 samples +#' data <- rdata(nnodes=10,nsamp=100)$data +#' rownames(data) <- paste("Sample", 1:100, sep="") +#' colnames(data) <- paste("Node", 1:10, sep="") +#' +#' ## Run reg_DPM +#' score_mat <- reg.dpm(data) +#' +#' ## Get ranking of link +#' links <- get_link(score_mat) +#' head(linkList) +#' +#' ## Get only top ranked links +#' links <- get_link(score_mat,top_ranked=10) +#' +#' ## Get only the links with a score higher than a threshold +#' links <- get_link(score_mat,threshold = 0.2) +#' @export +#' +#' +#################################################################################### +### Function to converts the score matrix to a sorted list of non-directed links#### +#################################################################################### + +get_link <- function(score_matrix, top_ranked=NULL, threshold=0) { + diag(score_matrix) <- NA + score_matrix[upper.tri(score_matrix)] <- NA + linkList <- reshape2::melt(abs(score_matrix), na.rm=TRUE) + colnames(linkList) <- c("Node1", "Node2", "score") + linkList <- linkList[linkList$score>=threshold,] + linkList <- linkList[order(linkList$score, decreasing=TRUE),] + + if(!is.null(top_ranked)) { + linkList <- linkList[1:min(nrow(linkList), top_ranked),] + } + + rownames(linkList) <- NULL + + return(linkList) +} diff --git a/R/nldata.R b/R/nldata.R new file mode 100644 index 0000000..bc2f4bb --- /dev/null +++ b/R/nldata.R @@ -0,0 +1,69 @@ +#' Function to generate simulated nonlinear data +#' +#' @param nsamp is the number of required samples +#' @param sigma is the standard deviation of the noise added to the generated data +#' +#' @return a list of a matrix containing samples in rows, variables in columns and the adjacency matrix of the network +#' +#' @examples +#' ## Generate non-linear data with 100 samples and the standard deviation of 0.2 for the noise added to the generated data +#' dataplus <- nldata(nsamp=100,sigma=0.2) +#' +#' ## data +#' data <- dataplus$data +#' +#' ## gold standard +#' gs <- dataplus$gs +#' +#' +#' @export + + +####################################################### +### Function to generate simulated non-linear data #### +####################################################### + +nldata <- function(nsamp,sigma) { + # nsamp is the number of required samples + # sigma is the standard deviation of the noise added to the generated data + + # graph used to generate non-linear data + gs <- matrix(c(1,0,1,1,0,0,0,0,0,0,0, + 0,1,0,0,1,0,0,0,0,0,1, + 1,0,1,0,0,1,1,0,0,0,0, + 1,0,0,1,0,0,0,1,0,0,0, + 0,1,0,0,1,0,0,1,1,0,0, + 0,0,1,0,0,1,1,0,0,0,0, + 0,0,1,0,0,1,1,0,0,0,0, + 0,0,0,1,1,0,0,1,0,1,0, + 0,0,0,0,1,0,0,0,1,0,1, + 0,0,0,0,0,0,0,1,0,1,0, + 0,1,0,0,0,0,0,0,1,0,1), nrow=11, ncol=11) + + # build data containers and fill the first two nodes with random data + netdata = data.frame(A = runif(nsamp,-2,2), + B = runif(nsamp,-2,2), + C = rep(0, nsamp), + D = rep(0, nsamp), + E = rep(0, nsamp), + F = rep(0, nsamp), + G = rep(0, nsamp), + H = rep(0, nsamp), + I = rep(0, nsamp), + J = rep(0, nsamp), + K = rep(0, nsamp)) + + # build the rest of the data using non linear equations + netdata$C = 5 * netdata$A^2 -3 + rnorm(nsamp, 0, sigma) + netdata$D = 3 * netdata$A^3 - 5*netdata$A + 5 + rnorm(nsamp, 0, sigma) + netdata$E = (2*netdata$B+0.5)^2 - 4 + rnorm(nsamp, 0, sigma) + netdata$F = 2 * log(abs(netdata$C-6)) + 1 + rnorm(nsamp, 0, sigma) + netdata$G = 0.2 * (netdata$C-6)^2 + 0.4 * abs(netdata$F)^(1/2) - 14 + rnorm(nsamp, 0, sigma) + netdata$H = 0.4 * (netdata$D/2)^2 - 0.3 * (netdata$E/5)^3 - 1.5 * netdata$E - 10 + rnorm(nsamp, 0, sigma) + netdata$I = 0.4 * (netdata$E-5)^(2) - 0.5 * (netdata$E-5) - 20 + rnorm(nsamp, 0, sigma) + netdata$J = 0.2 * (netdata$H/10)^4 + 1.3 * (netdata$H/10)^3 + 0.4 *(netdata$H/10)^2 - 5 + rnorm(nsamp, 0, sigma) + netdata$K = 0.6 * (netdata$I/5)^2 - 0.8 * netdata$B^(3) - 3 + rnorm(nsamp, 0, sigma) + + return(list(data=as.matrix(netdata),gs=gs)) +} + diff --git a/R/rdata.R b/R/rdata.R new file mode 100644 index 0000000..526fd73 --- /dev/null +++ b/R/rdata.R @@ -0,0 +1,71 @@ +#' Function to generate simulated nonlinear data +#' +#' @param nnodes is the number of nodes in the simulated graph +#' @param nsamp is the number of required samples +#' @param prob is the probability +#' +#' @return a matrix containing samples in rows, variables in columns +#' +#' @examples +#' ## Generate linear data with 5 nodes and 100 samples +#' dataplus <- rdata(nnodes=5,nsamp=100) +#' +#' ##data +#' data <- dataplus$data +#' +#' ## undirected gold standard +#' gs <- dataplus$adj +#' +#' ## directed gold standard +#' gs <- dataplus$dag +#' +#' @export + +################################################### +### Function to generate simulated linear data #### +################################################### + +rdata <- function(nnodes, nsamp, prob=0.2) { + # nnodes is the number of nodes in the simulated graph + # nsamp is the number of required samples + + # required libraries + #library("qpgraph") + #library("pcalg") + + # create a random graph + g <- pcalg::randomDAG(nnodes, prob=prob, lB=1, uB=1) + + # add edges if necessary, so that all nodes are connected to at least one other node + a <- as(g,"matrix") + if (sum(a[1,2:nnodes]) == 0) { # if the first node has no children + a[1,sample(2:nnodes,1)] = 1 # add a child + } + for (i in 2:(nnodes-1)) { + if ((sum(a[i,(i+1):nnodes]) == 0) & (sum(a[1:(i-1),i]) == 0)) { # if neither any parents nor any children + if (runif(1) <= (nnodes-i)/(nnodes-1)) { # draw a coin + a[i,sample((i+1):nnodes,1)] = 1 # heads, add a child + } else { + a[sample(1:(i-1)),i] = 1 # tails, add a parent + } + } + } + if (sum(a[1:(nnodes-1),nnodes]) == 0) { # if the last node has no parents + a[sample(1:(nnodes-1),1),nnodes] = 1 # add a parent + } + g = pcalg::getGraph(a) + g = as(g,"graphNEL") + + # create the undirected graph + undirected_g <- a + t(a) + + # create a covariance matrix based on this graph + Sigma<-qpgraph::qpG2Sigma(undirected_g, rho=0.5) + + # generate Gaussian data with 0 mean using the covariance matrix + data<-mvtnorm::rmvnorm(nsamp,mean = rep(0, nrow(Sigma)),sigma=as.matrix(Sigma)) + colnames(data)<-as.character(1:nnodes) + + return(list(data=data, covmat=Sigma, adj=undirected_g, dag=g)) +} + diff --git a/R/reg.dpm.R b/R/reg.dpm.R new file mode 100644 index 0000000..2d411ae --- /dev/null +++ b/R/reg.dpm.R @@ -0,0 +1,33 @@ +#' Function to compute regularized DPM +#' +#' @param data is a matrix containing samples in rows, variables in columns +#' +#' @return a matrix with score for every edge +#' +#' @import corpcor +#' +#' @examples +#' ## Generate non-linear data with 100 samples and the standard deviation of 0.2 of the noise added to the generated data +#' data <- nldata(nsamp=100,sigma=0.2)$data +#' +#'## run regularized DPM +#' reg.dpm(data) +#' +#' @export + + +########################################### +### Function to compute regularized DPM ### +########################################### + +reg.dpm <- function(data) { + # data is a matrix containing samples in rows, variables in columns + + # compute the double-centred vectors of each variable and concatenate them into the columns of matrix d + d <- apply(as.matrix(data), 2, Dcenter) + + # compute the regularised precision matrix of d + res <- corpcor::pcor.shrink(d) + + return(res) +} diff --git a/man/Dcenter.Rd b/man/Dcenter.Rd new file mode 100644 index 0000000..b080b8c --- /dev/null +++ b/man/Dcenter.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Dcenter.R +\name{Dcenter} +\alias{Dcenter} +\title{Function to compute double-centred distance matrices} +\usage{ +Dcenter(x) +} +\arguments{ +\item{x}{is a vector containing the values of a variable} +} +\value{ +Dcentered vactor for x +} +\description{ +Function to compute double-centred distance matrices +} +\examples{ +Dcenter(c(1,2,3,4)) + +} diff --git a/man/dpm.Rd b/man/dpm.Rd new file mode 100644 index 0000000..2741b56 --- /dev/null +++ b/man/dpm.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dpm.R +\name{dpm} +\alias{dpm} +\title{Function to compute DPM} +\usage{ +dpm(data) +} +\arguments{ +\item{data}{is a matrix containing samples in rows, variables in columns} +} +\value{ +a matrix with score for every edge +} +\description{ +Function to compute DPM +} +\examples{ +## Generate non-linear data +data <- nldata(100,0.2)$data + +## run DPM +dpm(data) + +} diff --git a/man/fisher_test.Rd b/man/fisher_test.Rd new file mode 100644 index 0000000..4d29372 --- /dev/null +++ b/man/fisher_test.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fisher_test.R +\name{fisher_test} +\alias{fisher_test} +\title{Function to compute fisher z transformation pvalues} +\usage{ +fisher_test(p, alpha, nsamp, nnodes) +} +\arguments{ +\item{p}{a} + +\item{alpha}{a} + +\item{nsamp}{is the number of required samples} + +\item{nnodes}{is the number of nodes in the simulated graph} +} +\value{ +a list contains the pvalues (pval) and correspending decisions of having a significant link (isnotnull) for all links +} +\description{ +Function to compute fisher z transformation pvalues +} +\examples{ +## Generate non-linear data +data <- nldata(100,0.2)$data + +## Run regularized DPM +res <- reg.dpm(data) + +## Compute pvalues +fisher_test(res, 0.05,) + +} diff --git a/man/get_link.Rd b/man/get_link.Rd new file mode 100644 index 0000000..2e85f34 --- /dev/null +++ b/man/get_link.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_link.R +\name{get_link} +\alias{get_link} +\title{Converts the score matrix (obtained from DPM or reg-DPM) to a sorted list of non-directed links (links with higher score first).} +\usage{ +get_link(score_matrix, top_ranked = NULL, threshold = 0) +} +\arguments{ +\item{score_matrix}{score matrix obtained from DPM or regDPM.} + +\item{top_ranked}{number of top links with higher score to report. The default value is NULL, i.e. all the links are reported.} + +\item{threshold}{only links with a score above the threshold are reported. The default value is 0, i.e. all the links are reported.} +} +\value{ +List of links in a data frame. Each line of the data frame corresponds to a link. The first and second columns are the corresponding nodes of the link and the third column is the score of the link. +} +\description{ +Converts the score matrix (obtained from DPM or reg-DPM) to a sorted list of non-directed links (links with higher score first). +} +\examples{ +## Generate linear data with 10 nodes and 100 samples +data <- rdata(nnodes=10,nsamp=100)$data +rownames(data) <- paste("Sample", 1:100, sep="") +colnames(data) <- paste("Node", 1:10, sep="") + +## Run reg_DPM +score_mat <- reg.dpm(data) + +## Get ranking of link +links <- get_link(score_mat) +head(linkList) + +## Get only top ranked links +links <- get_link(score_mat,top_ranked=10) + +## Get only the links with a score higher than a threshold +links <- get_link(score_mat,threshold = 0.2) +} diff --git a/man/nldata.Rd b/man/nldata.Rd new file mode 100644 index 0000000..f797fda --- /dev/null +++ b/man/nldata.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nldata.R +\name{nldata} +\alias{nldata} +\title{Function to generate simulated nonlinear data} +\usage{ +nldata(nsamp, sigma) +} +\arguments{ +\item{nsamp}{is the number of required samples} + +\item{sigma}{is the standard deviation of the noise added to the generated data} +} +\value{ +a list of a matrix containing samples in rows, variables in columns and the adjacency matrix of the network +} +\description{ +Function to generate simulated nonlinear data +} +\examples{ +## Generate non-linear data with 100 samples and the standard deviation of 0.2 for the noise added to the generated data +dataplus <- nldata(nsamp=100,sigma=0.2) + +## data +data <- dataplus$data + +## gold standard +gs <- dataplus$gs + + +} diff --git a/man/rdata.Rd b/man/rdata.Rd new file mode 100644 index 0000000..1d80191 --- /dev/null +++ b/man/rdata.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rdata.R +\name{rdata} +\alias{rdata} +\title{Function to generate simulated nonlinear data} +\usage{ +rdata(nnodes, nsamp, prob = 0.2) +} +\arguments{ +\item{nnodes}{is the number of nodes in the simulated graph} + +\item{nsamp}{is the number of required samples} + +\item{prob}{is the probability} +} +\value{ +a matrix containing samples in rows, variables in columns +} +\description{ +Function to generate simulated nonlinear data +} +\examples{ +## Generate linear data with 5 nodes and 100 samples +dataplus <- rdata(nnodes=5,nsamp=100) + +##data +data <- dataplus$data + +## undirected gold standard +gs <- dataplus$adj + +## directed gold standard +gs <- dataplus$dag + +} diff --git a/man/reg.dpm.Rd b/man/reg.dpm.Rd new file mode 100644 index 0000000..ff5040a --- /dev/null +++ b/man/reg.dpm.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reg.dpm.R +\name{reg.dpm} +\alias{reg.dpm} +\title{Function to compute regularized DPM} +\usage{ +reg.dpm(data) +} +\arguments{ +\item{data}{is a matrix containing samples in rows, variables in columns} +} +\value{ +a matrix with score for every edge +} +\description{ +Function to compute regularized DPM +} +\examples{ +## Generate non-linear data with 100 samples and the standard deviation of 0.2 of the noise added to the generated data +data <- nldata(nsamp=100,sigma=0.2)$data + +## run regularized DPM +reg.dpm(data) + +}