Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
coTRaCTE/.Rapp.history
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
461 lines (461 sloc)
20.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
expr.dat | |
factors.expr <- factors | |
for (expr.dat in expres.list[1:2,1]){# | |
gene.expr <- read.delim(file.path("expression/H1-hESC",expr.dat),header=TRUE,as.is=TRUE)# | |
# define the gene id without the numbers after . to be matched with the matrix2gene ids# | |
gene.expr$gene_id_simpl <- gsub("\\..*","",gene.expr$gene_id)# | |
# remove genes with weird ENSEMBL IDs# | |
#gene.expr <- gene.expr[grep("^ENSG",gene.expr$gene_id),]# | |
factors.expr <- merge(x=factors, y=gene.expr[,c("gene_id_simpl","TPM")], by.x="ensembl_id", by.y="gene_id_simpl",all.x=TRUE,suffixes=c("",sub(".tsv","",expr.dat)))# | |
} | |
head(factors.expr) | |
expr.dat=expres.list[5,1] | |
gene.expr <- read.delim(file.path("expression/H1-hESC",expr.dat),header=TRUE,as.is=TRUE)# | |
# define the gene id without the numbers after . to be matched with the matrix2gene ids# | |
gene.expr$gene_id_simpl <- gsub("\\..*","",gene.expr$gene_id)# | |
# remove genes with weird ENSEMBL IDs# | |
#gene.expr <- gene.expr[grep("^ENSG",gene.expr$gene_id),]# | |
factors.expr <- merge(x=factors.expr, y=gene.expr[,c("gene_id_simpl","TPM")], by.x="ensembl_id", by.y="gene_id_simpl",all.x=TRUE,suffixes=c("",sub(".tsv","",expr.dat))) | |
head(factors.expr) | |
dim(factors.expr) | |
dim(factors) | |
paste0(".",sub(".tsv","",expr.dat)) | |
factors.expr <- factors# | |
for (expr.dat in expres.list[,1]){# | |
gene.expr <- read.delim(file.path("expression/H1-hESC",expr.dat),header=TRUE,as.is=TRUE)# | |
# define the gene id without the numbers after . to be matched with the matrix2gene ids# | |
gene.expr$gene_id_simpl <- gsub("\\..*","",gene.expr$gene_id)# | |
# remove genes with weird ENSEMBL IDs# | |
#gene.expr <- gene.expr[grep("^ENSG",gene.expr$gene_id),]# | |
factors.expr <- merge(x=factors.expr, y=gene.expr[,c("gene_id_simpl","TPM")], by.x="ensembl_id", by.y="gene_id_simpl", all.x=TRUE, suffixes=c("",paste0(".",sub(".tsv","",expr.dat))))# | |
} | |
dim(factors.expr) | |
head(factors.expr) | |
gene.expr=NULL | |
write.table("candidates/encode/ESC_expression_matrices_1e-10.dat",sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE) | |
write.table(factors.expr,"candidates/encode/ESC_expression_matrices_1e-10.dat",sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE) | |
head(cand) | |
rownames(factors.expr) <- factors.expr$x | |
f1 <- cand[i,TF1] | |
i=1 | |
f1 <- cand[i,TF1] | |
f1 <- cand[i,"TF1"] | |
f2 <- cand[i,"TF2"] | |
match(f1,factors.expr$x) | |
f1 | |
f1 <- as.character(cand[i,"TF1"]) | |
match(f1,factors.expr$x) | |
f1 | |
i=3 | |
f1 <- as.character(cand[i,"TF1"]) | |
match(f1,factors.expr$x) | |
factors.expr[301,] | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x) | |
id.f1 | |
id.f2 | |
j1=id.f1 | |
j2=id.f2 | |
factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))] | |
cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))])) | |
plot(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))])) | |
cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))[1,2] | |
cand$cor <- c() | |
for (i in 1:nrow(cand)){# | |
f1 <- as.character(cand[i,"TF1"])# | |
f2 <- as.character(cand[i,"TF2"])# | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x)# | |
if (!is.na(id.f1)&!is.na(id.f2)){# | |
cr <- c()# | |
for (j1 in 1:length(id.f1)){# | |
for (j2 in 1:length(id.f2)){# | |
cr <- c(cr,cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))[1,2])# | |
}# | |
}# | |
cand[i,"cor"] <- mean(cr,na.rm=TRUE) # | |
}# | |
} | |
cand$cor | |
i=2 | |
f1 <- as.character(cand[i,"TF1"])# | |
f2 <- as.character(cand[i,"TF2"])# | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x) | |
f1 | |
f2 | |
id.f1 | |
id.f2 | |
(!is.na(id.f1)&!is.na(id.f2)) | |
cr <- c() | |
cr <- c()# | |
for (j1 in 1:length(id.f1)){# | |
for (j2 in 1:length(id.f2)){# | |
cr <- c(cr,cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))[1,2])# | |
print(cr)# | |
}# | |
} | |
cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))[1,2] | |
cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))])) | |
factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))] | |
j1 | |
cr <- c()# | |
for (j1 in (id.f1)){# | |
for (j2 in (id.f2)){# | |
cr <- c(cr,cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))[1,2])# | |
print(cr)# | |
}# | |
} | |
cand$cor <- c()# | |
for (i in 1:nrow(cand)){# | |
f1 <- as.character(cand[i,"TF1"])# | |
f2 <- as.character(cand[i,"TF2"])# | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x)# | |
if (!is.na(id.f1)&!is.na(id.f2)){# | |
cr <- c()# | |
for (j1 in (id.f1)){# | |
for (j2 in (id.f2)){# | |
cr <- c(cr,cor(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))[1,2])# | |
print(cr)# | |
}# | |
}# | |
cand[i,"cor"] <- mean(cr,na.rm=TRUE) # | |
}# | |
} | |
warnings() | |
cand$cor | |
i=127 | |
f1 <- as.character(cand[i,"TF1"])# | |
f2 <- as.character(cand[i,"TF2"])# | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x) | |
f1 | |
f2 | |
id.f1 | |
id.f2 | |
cr <- c() | |
j1=id.f1 | |
j2=id.f2 | |
factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))] | |
hist(cand$cor) | |
cand[cand$cor<(-0.5),] | |
subset(cand,(!is.na(cand$cor)&(cand$cor<(-0.5))] | |
subset(cand,(!is.na(cand$cor)&(cand$cor<(-0.5)) | |
) | |
) | |
i=8 | |
f1 <- as.character(cand[i,"TF1"])# | |
f2 <- as.character(cand[i,"TF2"])# | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x) | |
f1 | |
f2 | |
id.f1 | |
id.f2 | |
j1=id.f1 | |
j2=id.f2 | |
factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))] | |
plot(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]),pch=16) | |
factors.expr[c(j1,j2)] | |
factors.expr[c(j1,j2),] | |
i=537 | |
f1 <- as.character(cand[i,"TF1"])# | |
f2 <- as.character(cand[i,"TF2"])# | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x) | |
id.f1 | |
id.f2 | |
j1=id.f1 | |
j2=id.f2 | |
factors.expr[c(j1,j2), | |
] | |
plot(t(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))])) | |
plot(t(log2(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))) | |
j1 | |
plot(t(log2(factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))])), pch=16, xlab=factors.expr[j1,"factor_name"],ylab=factors.expr[j2,"factor_name"]) | |
1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))] | |
log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]) | |
sel.dat <- t(log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))# | |
crs <- c(crs,cor(sel.dat))# | |
plot(sel.dat, pch=16, xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main="tis")# | |
text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1),round(cr,3)) | |
crs <- c() | |
sel.dat <- t(log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))# | |
crs <- c(crs,cor(sel.dat)[1,2]) | |
plot(sel.dat, pch=16, xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main="tis")# | |
text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1),round(cor(sel.dat)[1,2],3)) | |
plot(sel.dat, pch=16, xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main="tis")# | |
text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1), paste("cor =",round(cor(sel.dat)[1,2],3))) | |
plot(sel.dat, pch=16, col="ryalblue", xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main="tis")# | |
text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1), paste("cor =",round(cor(sel.dat)[1,2],3))) | |
plot(sel.dat, pch=16, col="royalblue", xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main="tis")# | |
text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1), paste("cor =",round(cor(sel.dat)[1,2],3))) | |
paste0(tis,factors.expr[c(j1,j2),"factor_name"] ) | |
tis="ESC" | |
paste(factors.expr[c(j1,j2),"factor_name"],sep=" and " ) | |
plot(sel.dat, pch=16, col="royalblue", xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main=paste(factors.expr[c(j1,j2),"factor_name"]),sub=tis)# | |
text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1), paste("cor =",round(cor(sel.dat)[1,2],3))) | |
paste0("candidates/encode/expression/tis_",factors.expr[j1,"factor_name"],"_",factors.expr[j2,"factor_name"],".pdf") | |
paste0("candidates/encode/expression/", tis, "_", factors.expr[j1,"factor_name"], "_", factors.expr[j2,"factor_name"], ".pdf") | |
pdf(paste0("candidates/encode/expression/", tis, "_", factors.expr[j1,"factor_name"], "_", factors.expr[j2,"factor_name"], ".pdf"))# | |
plot(sel.dat, pch=16, col="royalblue", xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main=paste(factors.expr[c(j1,j2),"factor_name"]),sub=tis)# | |
text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1), paste("cor =",round(cor(sel.dat)[1,2],3))) # | |
dev.off() | |
cand$cor <- c()# | |
for (i in 1:nrow(cand)){# | |
f1 <- as.character(cand[i,"TF1"])# | |
f2 <- as.character(cand[i,"TF2"])# | |
id.f1 <- match(f1,factors.expr$x)# | |
id.f2 <- match(f2,factors.expr$x)# | |
if (!is.na(id.f1)&!is.na(id.f2)){# | |
crs <- c()# | |
for (j1 in (id.f1)){# | |
for (j2 in (id.f2)){# | |
sel.dat <- t(log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))# | |
crs <- c(crs,cor(sel.dat)[1,2])# | |
# pdf(paste0("candidates/encode/expression/", tis, "_", factors.expr[j1,"factor_name"], "_", factors.expr[j2,"factor_name"], ".pdf"))# | |
# plot(sel.dat, pch=16, col="royalblue", xlab=factors.expr[j1,"factor_name"], ylab=factors.expr[j2,"factor_name"], main=paste(factors.expr[c(j1,j2),"factor_name"]),sub=tis)# | |
# text(x=mean(sel.dat[,1]),y=max(sel.dat[,2]-0.1), paste("cor =",round(cor(sel.dat)[1,2],3))) # | |
# dev.off() # | |
}# | |
}# | |
cand[i,"cor"] <- mean(crs,na.rm=TRUE) # | |
}# | |
} | |
cand$cor | |
write.csv(cand,"candidates/encode/ESC_matrices_1e-10_expr.csv",row.names=FALSE,quote=FALSE) | |
subset(cand, !is.na(cand$cor)&(cand$cor<-0.2)) | |
subset(cand, !is.na(cand$cor)&(cand$cor<(-0.2))) | |
dim(matrix2gene) | |
dim(factors) | |
head(matrix2gene) | |
ids <- sample(1:nrow(matrix2gene),200)# | |
random.factors <- matrix2gene[ids,] | |
factors.expr <- random.factors | |
for (expr.dat in expres.list[,1]){# | |
gene.expr <- read.delim(file.path("expression/H1-hESC",expr.dat),header=TRUE,as.is=TRUE)# | |
# define the gene id without the numbers after . to be matched with the matrix2gene ids# | |
gene.expr$gene_id_simpl <- gsub("\\..*","",gene.expr$gene_id)# | |
# remove genes with weird ENSEMBL IDs# | |
#gene.expr <- gene.expr[grep("^ENSG",gene.expr$gene_id),]# | |
factors.expr <- merge(x=factors.expr, y=gene.expr[,c("gene_id_simpl","TPM")], by.x="ensembl_id", by.y="gene_id_simpl", all.x=TRUE, suffixes=c("",paste0(".",sub(".tsv","",expr.dat))))# | |
} | |
write.table(factors.expr,"candidates/encode/ESC_expression_randommatrices.dat",sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE) | |
head(factors.expr) | |
j1=1 | |
j2=3 | |
sel.dat <- t(log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))])) | |
cr <- cor(sel.dat)[1,2] | |
data.frame(factors.expr[j1,c("matrix_id", "factor_name")],factors.expr[j2,c("matrix_id", "factor_name")],cr) | |
random.cors <- data.frame() | |
random.cors <- rbind(random.cors,data.frame(factors.expr[j1,c("matrix_id", "factor_name")],factors.expr[j2,c("matrix_id", "factor_name")],cr)) | |
random.cors | |
random.cors <- data.frame()# | |
for (j1 in 1:(nrow(factors.expr)-1)){# | |
for (j2 in (j1+1):nrow(factors.expr)){ # | |
sel.dat <- t(log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))# | |
cr <- cor(sel.dat)[1,2]# | |
random.cors <- rbind(random.cors,data.frame(factors.expr[j1,c("matrix_id", "factor_name")],factors.expr[j2,c("matrix_id", "factor_name")],cr)) # | |
}# | |
} | |
random.cors | |
setwd("/Volumes/AJAS_DATA/Documents/TF_analysis/") | |
write.csv(random.cors,"candidates/encode/random_correlations_expr.csv", row.names=FALSE, quote=FALSE)) | |
write.csv(random.cors,"candidates/encode/random_correlations_expr.csv", row.names=FALSE, quote=FALSE) | |
head(random.cors) | |
head(cand) | |
colnames(random.cor)<-c("TF1","TF.name1","TF2","TF.name2","cor") | |
colnames(random.cors)<-c("TF1","TF.name1","TF2","TF.name2","cor") | |
write.csv(random.cors,"candidates/encode/random_correlations_expr.csv", row.names=FALSE, quote=FALSE) | |
dat <- rbind(cand[,c("TF1","TF.name1","TF2","TF.name2","cor")],random.cors) | |
dat$type <- c(rep(tis,nrow(cand)),rep("random",nrow(random.cors))) | |
dat$type <- as.factor(dat$type) | |
ggplot(dat, aes(x=cor, color=type, fill=type)) + # | |
geom_histogram(aes(y=..density..), alpha=0.5, # | |
position="identity")+# | |
geom_density(alpha=.2) | |
library(ggplot2) | |
ggplot(dat, aes(x=cor, color=type, fill=type)) + # | |
geom_histogram(aes(y=..density..), alpha=0.5, # | |
position="identity")+# | |
geom_density(alpha=.2) | |
dim(dat) | |
nrow(cand) | |
nrow(random.cors) | |
200*199/2 | |
ggplot(dat, aes(x=cor, color=type, fill=type)) + | |
geom_density(alpha=.2) | |
factors.expr | |
dim(factors) | |
head(factors) | |
head(factors.expr) | |
gene.expr<-NULL | |
factors.expr <- read.delim("candidates/encode/ESC_expression_matrices_1e-10.dat", sep="\t", header=TRUE,row.names=FALSE,as.is=TRUE) | |
factors.expr <- read.delim("candidates/encode/ESC_expression_matrices_1e-10.dat", sep="\t", header=TRUE,as.is=TRUE) | |
head(factors.expr) | |
dim(factors.expr) | |
random.cors <- data.frame()# | |
for (j1 in 1:(nrow(factors.expr)-1)){# | |
for (j2 in (j1+1):nrow(factors.expr)){ # | |
sel.dat <- t(log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))# | |
cr <- cor(sel.dat)[1,2]# | |
random.cors <- rbind(random.cors,data.frame(factors.expr[j1,c("matrix_id", "factor_name")],factors.expr[j2,c("matrix_id", "factor_name")],cr)) # | |
}# | |
} # | |
# | |
colnames(random.cors)<-c("TF1","TF.name1","TF2","TF.name2","cor")# | |
write.csv(random.cors,"candidates/encode/random_correlations_expr.csv", row.names=FALSE, quote=FALSE) | |
random.cors <- data.frame()# | |
for (j1 in 1:(nrow(factors.expr)-1)){# | |
for (j2 in (j1+1):nrow(factors.expr)){ # | |
sel.dat <- t(log2(1+factors.expr[c(j1,j2),grep("^TPM",colnames(factors.expr))]))# | |
cr <- cor(sel.dat)[1,2]# | |
random.cors <- rbind(random.cors,data.frame(factors.expr[j1,c("x", "factor_name")],factors.expr[j2,c("x", "factor_name")],cr)) # | |
}# | |
} # | |
# | |
colnames(random.cors)<-c("TF1","TF.name1","TF2","TF.name2","cor")# | |
write.csv(random.cors,"candidates/encode/random_correlations_expr.csv", row.names=FALSE, quote=FALSE) | |
subset(cand, !is.na(cand$cor)&(cand$cor<(-0.2))) | |
subset(cand, !is.na(cand$cor)&(cand$cor<(-0.5))) | |
mat1 ="OCT4_01" mat2="FOXD3_01" | |
mat1 ="OCT4_01" | |
mat2="FOXD3_01" | |
dat <- rbind(cand[,c("TF1","TF.name1","TF2","TF.name2","cor")],random.cors)# | |
dat$type <- c(rep(tis,nrow(cand)),rep("random",nrow(random.cors)))# | |
dat$type <- as.factor(dat$type) | |
ggplot(dat, aes(x=cor, color=type, fill=type)) + # | |
geom_histogram(aes(y=..density..), alpha=0.5, # | |
position="identity")+# | |
geom_density(alpha=.2) | |
ggplot(dat, aes(x=cor, color=type, fill=type)) + # | |
#geom_histogram(aes(y=..density..), alpha=0.5, position="identity")+# | |
geom_density(alpha=.2) | |
ggplot(dat, aes(x=cor, fill=type)) + # | |
#geom_histogram(aes(y=..density..), alpha=0.5, position="identity")+# | |
geom_density(alpha=.2) | |
ggplot(dat, aes(x=cor, color=type, fill=type)) + # | |
#geom_histogram(aes(y=..density..), alpha=0.5, position="identity")+# | |
geom_density(alpha=.2) +# | |
ggtitle("Gene expression correlations between TF pairs") +# | |
theme_light() | |
dat$type <- c(rep(paste0(tis,"-pairs"),nrow(cand)),rep("all pairs",nrow(random.cors))) | |
dat$type <- as.factor(dat$type) | |
ggplot(dat, aes(x=cor, color=type, fill=type)) + # | |
#geom_histogram(aes(y=..density..), alpha=0.5, position="identity")+# | |
geom_density(alpha=.2) +# | |
ggtitle("Gene expression correlations between TF pairs") +# | |
theme_light() | |
gp <- ggplot(dat, aes(x=cor, color=type, fill=type)) + # | |
#geom_histogram(aes(y=..density..), alpha=0.5, position="identity")+# | |
geom_density(alpha=.2) +# | |
ggtitle("Gene expression correlations between TF pairs") +# | |
theme_light() | |
pdf("candidates/encode/ESC_geneexpr_correlation_hist.pdf")# | |
plot(gp)# | |
dev.off() | |
setwd("~/Documents/GitHub/coTRaCTE/") | |
source("scripts/DHS.functions.github.R") | |
if (!require(devtools)) install.packages("devtools",repos="http://cran.rstudio.com/", quiet = TRUE)# | |
library(devtools, quietly = TRUE)# | |
# | |
#if (!require(tRap)) install_github("matthuska/tRap", quiet = TRUE)# | |
#library(tRap, quietly = TRUE) # TRAP library for counting binding affinities# | |
# | |
if (!require(gplots)) install.packages("gplots",repos="http://cran.rstudio.com/", quiet = TRUE)# | |
library(gplots, quietly = TRUE)# | |
# | |
if (!require(optparse)) install.packages("optparse", repos = "http://cran.rstudio.com/", quiet = TRUE)# | |
library(optparse, quietly = TRUE) | |
option_list = list(# | |
make_option(c("-l", "--lof.matrices"), type="character", default= "data/list_matrices.dat", # | |
help = "list of all matrices for which the enrichment should be calculated", metavar="character"),# | |
make_option(c("-a", "--affinity.dir"), type="character", default="data/affinity", # | |
help="directory where the affinity files (for each matrix separately) are saved, each tissue in separate directory"),# | |
make_option(c("-t", "--tissues"), type="character", default = "data/cell_types.dat", # | |
help="textfile with all tissues analyzed", metavar="character"),# | |
make_option(c("-k", "--top.ranked"), type="integer", default=1000, # | |
help="top ranked DHSs considered to be bound by the TF"),# | |
make_option(c("-n", "--top.dhs"), type="integer", default=5000, # | |
help="top most cell-type specific CTS-DHSs and global DHSs considered"),# | |
make_option(c("-o", "--out.dir"), type="character", default="results/tf.pairs/", # | |
help="directory where to save the results ")# | |
); # | |
# | |
opt_parser = OptionParser(option_list=option_list);# | |
opt = parse_args(opt_parser); | |
opt$tissues | |
tissues <- read.delim(opt$tissues,as.is=TRUE,header=FALSE)# | |
tissues <- tissues[,1]# | |
tissues <- tissues[tissues!="global"] # in case global in the tissue list, remove it from calculations | |
matrices.names <- read.delim(opt$lof.matrices, header=TRUE, as.is=TRUE)# | |
matrices <- matrices.names[,1] | |
matrices | |
grep(mat1,matrices,value=TRUE) | |
mat2 <- grep(mat2,matrices,value=TRUE) | |
mat2 | |
mat1 <- grep(mat1,matrices,value=TRUE) | |
k <- opt$top.ranked# | |
n <- opt$top.dhs | |
k | |
n | |
mat1 ="OCT4_01" # | |
mat2="FOXD3_01" | |
matrix1 <- grep(mat1,matrices,value=TRUE)# | |
matrix2 <- grep(mat2,matrices,value=TRUE) | |
aff.dir=opt$affinity.dir | |
aff1 <- select.dhs(n=n, tis=tis, aff.dir=aff.dir, matrix=matrix1) | |
matrix1 ="OCT4_01" # | |
matrix2="FOXD3_01" | |
aff1 <- select.dhs(n=n, tis=tis, aff.dir=aff.dir, matrix=matrix1) | |
aff2 <- select.dhs(n=n, tis=tis, aff.dir=aff.dir, matrix=matrix2) | |
head(aff1) | |
aff.merged = merge(aff1,aff2,by="dhs",suffixes=c("1","2"),sort=FALSE,all=TRUE) | |
head(aff.merged) | |
aff.sel <- subset(aff.merged, (aff.merged$rank1<=k)&(aff.merged$rank2<=k)) | |
aff.sel | |
dim(aff.sel) | |
aff.sel <- subset(aff.sel, aff.sel$tissue1==tis) | |
dim(aff.sel) | |
bed1 <- strsplit(aff.sel$dhs,split=":") | |
head(bed1) | |
unlist(bed1) | |
seq(1,nrow(bed1),by=2) | |
bed1 <- unlist(bed1) | |
bed1[seq(1,length(bed1),by=2)] | |
bed2 <- strsplit(bed1[seq(2,length(bed1),by=2)],split="-") | |
bed2 | |
bed2 <- strsplit(bed1[seq(2,length(bed1),by=2)],split="-")# | |
bed2 <- unlist(bed2) | |
bed.file <- data.frame(chr=bed1[seq(1,length(bed1),by=2)], start=as.numeric(bed2[seq(1,length(bed2),by=2)]), as.numeric(bed2[seq(2,length(bed2),by=2)])) | |
head(bed.file) | |
bed.file <- data.frame(chr=bed1[seq(1,length(bed1),by=2)], start=as.numeric(bed2[seq(1,length(bed2),by=2)]),end= as.numeric(bed2[seq(2,length(bed2),by=2)])) | |
head(bed.file) | |
write.table(bed.file,"expression/H1-hESC/OCT4_FOXD3_shareddhs_ESC.bed", sep="\t", quote=FALSE, col.names=FALSE, row.names=FALSE) | |
write.table(bed.file,"/Volumes/AJAS_DATA/TF_analysis/expression/H1-hESC/OCT4_FOXD3_shareddhs_ESC.bed", sep="\t", quote=FALSE, col.names=FALSE, row.names=FALSE) | |
write.table(bed.file,"/Volumes/AJAS_DATA/Documents/TF_analysis/expression/H1-hESC/OCT4_FOXD3_shareddhs_ESC.bed", sep="\t", quote=FALSE, col.names=FALSE, row.names=FALSE) | |
bed.file <- data.frame(chr=bed1[seq(1,length(bed1),by=2)], start=as.numeric(bed2[seq(1,length(bed2),by=2)])-1,end= as.numeric(bed2[seq(2,length(bed2),by=2)])) | |
head(bed.file) | |
write.table(bed.file,"/Volumes/AJAS_DATA/Documents/TF_analysis/expression/H1-hESC/OCT4_FOXD3_shareddhs_ESC.bed", sep="\t", quote=FALSE, col.names=FALSE, row.names=FALSE) | |
all.int = read.csv(paste("/Volumes/AJAS_DATA/Documents/TF_analysis/candidates/encode/candidates_all_",l1,"_",l2,"tresh_",tr.const,".csv",sep=""),as.is=TRUE) | |
dim(all.int) | |
head(all.int) | |
general.int <- all.int[all.int$freq>30,] | |
vyber.list.names | |
write.csv(general.int,file=paste0("/Volumes/AJAS_DATA/Documents/TF_analysis/candidates/encode/general.candidates_",l1,"_",l2,"tresh_",tr.const,".csv",sep=""),quote=FALSE,col.names=TRUE,row.names=FALSE) | |
head(matrix2gene) | |
factors <- unique(c(general.int$TF.name1,general.int$TF.name2)) | |
length(factors) | |
factors | |
gen.factors <- unique(c(general.int$TF.name1,general.int$TF.name2))# | |
length(gen.factors) | |
gen.factors <- merge(gen.factors,matrix2gene,by.x=1,by.y="factor_name") | |
dim(gen.factors) | |
gen.factors | |
tf1 <- general.int$TF.name1[i]# | |
tf2 <- general.int$TF.name2[i]# | |
unique(gen.factors[gen.factors$x==tf1,"ensembl_id"])# | |
unique(gen.factors[gen.factors$x==tf2,"ensembl_id"])# | |
id.f1 <- match(tf1,gen.factors$x)# | |
id.f2 <- match(tf2,gen.factors$x) |