Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
fe016fb5af
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
729 lines (614 sloc) 31.5 KB
#make sure these files are in your working directory:
#chrysemys_eggnog_pruned.txt
#pogona_eggnog_pruned.txt
#gallus_eggnog_pruned.txt
#homo_eggnog_pruned.txt
#mus_eggnog_pruned.txt
####################################METHOD1#####################
####################################Specificity Correlation#####
#scale to gene expression average
SpecificityCorrelation = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
#Step1: Load eggnog tables
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="mouse") {
eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
DEgenesSpecies1 = toupper(DEgenesSpecies1)
rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
}
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="mouse") {
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
DEgenesSpecies2 = toupper(DEgenesSpecies2)
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
}
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog1) = eggnog1[1,]
eggnog1 = eggnog1[2:nrow(eggnog1),]
colnames(eggnog2) = eggnog2[1,]
eggnog2 = eggnog2[2:nrow(eggnog2),]
#Step2: Take union of DEgenes for analysis
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
#Step3: Prune Expression Tables & Remove rows with no expression
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
Sp1 = Sp1[rowSums (Sp1)!=0,]
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
Sp2 = Sp2[rowSums (Sp2)!=0,]
#Step4: Replace Gene names with Eggnog name
Sp1[,ncol(Sp1)+1] = rownames(Sp1)
colnames(Sp1)[ncol(Sp1)] = species1
Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
rownames(Sp1) = Sp1$eggnog
Sp1 = Sp1[,3:ncol(Sp1)]
Sp2[,ncol(Sp2)+1] = rownames(Sp2)
colnames(Sp2)[ncol(Sp2)] = species2
Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
rownames(Sp2) = Sp2$eggnog
Sp2 = Sp2[,3:ncol(Sp2)]
#Step5: Scale Expression Tables by gene average
avg = rowMeans(Sp1)
Sp1 = sweep(Sp1,1,avg,"/")
rm(avg)
avg = rowMeans(Sp2)
Sp2 = sweep(Sp2,1,avg,"/")
rm(avg)
#Step6: Merge Expression Tables
geTable = merge(Sp1,Sp2, by='row.names', all=F)
rownames(geTable) = geTable$Row.names
geTable = geTable[,2:ncol(geTable)]
#Step7: Correlation
Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
rownames(Corr.Coeff.Table) = colnames(geTable)
colnames(Corr.Coeff.Table) = colnames(geTable)
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(p.value.table) = colnames(geTable)
colnames(p.value.table) = colnames(geTable)
a = combn(1:ncol(geTable),2)
n4BHcorrect = ncol(geTable)*ncol(geTable)
for (i in 1:ncol(a)){
cell_type1 = colnames(geTable)[a[1,i]]
cell_type2 = colnames(geTable)[a[2,i]]
b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
rm(list =c('cell_type1', 'cell_type2','b'))
}
neg.log10.p = -log10(p.value.table)
list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value','DEgenes_union','DEgenes_in_analysis','nDEgenes_union','nDEgenes_in_analysis',paste('nDEgenes',species1,sep='_'),paste('nDEgenes',species2,sep='_'))
return(list.to.return)
}
#Specificity Using Intersect of Gene Lists
SpIntersect = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
#Step1: Load eggnog tables
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="mouse") {
eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
DEgenesSpecies1 = toupper(DEgenesSpecies1)
rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
}
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="mouse") {
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
DEgenesSpecies2 = toupper(DEgenesSpecies2)
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
}
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog1) = eggnog1[1,]
eggnog1 = eggnog1[2:nrow(eggnog1),]
colnames(eggnog2) = eggnog2[1,]
eggnog2 = eggnog2[2:nrow(eggnog2),]
#Step2: Take union of DEgenes for analysis
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
nDESp1 = length(DEgenesSpecies1)
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
nDESp2 = length(DEgenesSpecies2)
DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
#Step3: Prune Expression Tables & Remove rows with no expression
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
Sp1 = Sp1[rowSums (Sp1)!=0,]
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
Sp2 = Sp2[rowSums (Sp2)!=0,]
#Step4: Replace Gene names with Eggnog name
Sp1[,ncol(Sp1)+1] = rownames(Sp1)
colnames(Sp1)[ncol(Sp1)] = species1
Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
rownames(Sp1) = Sp1$eggnog
Sp1 = Sp1[,3:ncol(Sp1)]
Sp2[,ncol(Sp2)+1] = rownames(Sp2)
colnames(Sp2)[ncol(Sp2)] = species2
Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
rownames(Sp2) = Sp2$eggnog
Sp2 = Sp2[,3:ncol(Sp2)]
#Step5: Scale Expression Tables by gene average
avg = rowMeans(Sp1)
Sp1 = sweep(Sp1,1,avg,"/")
rm(avg)
avg = rowMeans(Sp2)
Sp2 = sweep(Sp2,1,avg,"/")
rm(avg)
#Step6: Merge Expression Tables
geTable = merge(Sp1,Sp2, by='row.names', all=F)
rownames(geTable) = geTable$Row.names
geTable = geTable[,2:ncol(geTable)]
#Step7: Correlation
Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
rownames(Corr.Coeff.Table) = colnames(geTable)
colnames(Corr.Coeff.Table) = colnames(geTable)
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(p.value.table) = colnames(geTable)
colnames(p.value.table) = colnames(geTable)
a = combn(1:ncol(geTable),2)
n4BHcorrect = ncol(geTable)*ncol(geTable)
for (i in 1:ncol(a)){
cell_type1 = colnames(geTable)[a[1,i]]
cell_type2 = colnames(geTable)[a[2,i]]
b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
rm(list =c('cell_type1', 'cell_type2','b'))
}
neg.log10.p = -log10(p.value.table)
list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis',paste('nDEgenes',species1,sep='_'),paste('nDEgenes',species2,sep='_'))
return(list.to.return)
}
####################################METHOD2#####################
####################################Rank Correlation#####
#scale to gene expression average
RankCorrelation = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
#Step1: Load eggnog tables
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="mouse") {
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
DEgenesSpecies2 = toupper(DEgenesSpecies2)
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
}
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog1) = eggnog1[1,]
eggnog1 = eggnog1[2:nrow(eggnog1),]
colnames(eggnog2) = eggnog2[1,]
eggnog2 = eggnog2[2:nrow(eggnog2),]
#Step2: Take union of DEgenes for analysis
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
#Step3: Prune Expression Tables
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
#Step4: Replace Gene names with Eggnog name
Sp1[,ncol(Sp1)+1] = rownames(Sp1)
colnames(Sp1)[ncol(Sp1)] = species1
Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
rownames(Sp1) = Sp1$eggnog
Sp1 = Sp1[,3:ncol(Sp1)]
Sp2[,ncol(Sp2)+1] = rownames(Sp2)
colnames(Sp2)[ncol(Sp2)] = species2
Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
rownames(Sp2) = Sp2$eggnog
Sp2 = Sp2[,3:ncol(Sp2)]
#Step5: Merge Expression Tables
geTable = merge(Sp1,Sp2, by='row.names', all=F)
rownames(geTable) = geTable$Row.names
geTable = geTable[,2:ncol(geTable)]
#Step6: Correlation
rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
rownames(rho.table) = colnames(geTable)
colnames(rho.table) = colnames(geTable)
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(p.value.table) = colnames(geTable)
colnames(p.value.table) = colnames(geTable)
a = combn(1:ncol(geTable),2)
n4BHcorrect = ncol(geTable)*ncol(geTable)
for (i in 1:ncol(a)){
cell_type1 = colnames(geTable)[a[1,i]]
cell_type2 = colnames(geTable)[a[2,i]]
b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='spearman'))
rho.table[a[1,i],a[2,i]] = b$estimate
rho.table[a[2,i],a[1,i]] = b$estimate
BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
rm(list =c('cell_type1', 'cell_type2','b'))
}
neg.log10.p = -log10(p.value.table)
list.to.return = list(rho.table,p.value.table,neg.log10.p)
names(list.to.return) = c('rho','p.value','neg.log10.p.value')
return(list.to.return)
}
####################Rank Correlation All Genes######
RankCorrelationAll = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken')){
#Step1: Load eggnog tables
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="mouse") {
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
}
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog1) = eggnog1[1,]
eggnog1 = eggnog1[2:nrow(eggnog1),]
colnames(eggnog2) = eggnog2[1,]
eggnog2 = eggnog2[2:nrow(eggnog2),]
#Step2: Replace Gene names with Eggnog name
Sp1 = ExpressionTableSpecies1
Sp1[,ncol(Sp1)+1] = rownames(Sp1)
colnames(Sp1)[ncol(Sp1)] = species1
Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
rownames(Sp1) = Sp1$eggnog
Sp1 = Sp1[,3:ncol(Sp1)]
Sp2 = ExpressionTableSpecies2
Sp2[,ncol(Sp2)+1] = rownames(Sp2)
colnames(Sp2)[ncol(Sp2)] = species2
Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
rownames(Sp2) = Sp2$eggnog
Sp2 = Sp2[,3:ncol(Sp2)]
#Step3: Merge Expression Tables
geTable = merge(Sp1,Sp2, by='row.names', all=F)
rownames(geTable) = geTable$Row.names
geTable = geTable[,2:ncol(geTable)]
#Step4: Correlation
rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
rownames(rho.table) = colnames(geTable)
colnames(rho.table) = colnames(geTable)
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(p.value.table) = colnames(geTable)
colnames(p.value.table) = colnames(geTable)
a = combn(1:ncol(geTable),2)
n4BHcorrect = ncol(geTable)*ncol(geTable)
for (i in 1:ncol(a)){
cell_type1 = colnames(geTable)[a[1,i]]
cell_type2 = colnames(geTable)[a[2,i]]
b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='spearman'))
rho.table[a[1,i],a[2,i]] = b$estimate
rho.table[a[2,i],a[1,i]] = b$estimate
BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
rm(list =c('cell_type1', 'cell_type2','b'))
}
neg.log10.p = -log10(p.value.table)
list.to.return = list(rho.table,p.value.table,neg.log10.p)
names(list.to.return) = c('rho','p.value','neg.log10.p.value')
return(list.to.return)
}
########################METHOD 3#######
########################hypergeometric distribution#####
#Step 1: generate list of markers for cell types
#requires FindMarkers.MAST function
Marker_List = function(SeuratObject,clusters,min.pct=0.4,thresh.use=log(1.5),fdr=0.05){
#step1a generate pairwise DEgenes by MAST
DEgenes = list()
combinations = t(combn(as.character(clusters),2))
colnames(combinations) <- c("ident.1","ident.2")
pb <- txtProgressBar(1, 100, style=3)
for (i in 1:nrow(combinations)){
a = combinations[i,1]
b = combinations[i,2]
suppressMessages(diff.genes <- FindMarkers.MAST(SeuratObject,a,b,min.pct,thresh.use))
#matrix from FindMarkers.MAST only contains genes that have a fdr<0.05
DEgenes[[paste("id1",a,"id2",b, sep="_")]] = rownames(diff.genes[diff.genes$avg_diff>0 & diff.genes$fdr<fdr,])
DEgenes[[paste("id1",b,"id2",a, sep="_")]] = rownames(diff.genes[diff.genes$avg_diff<0 & diff.genes$fdr<fdr,])
rm(list = c('diff.genes','a','b'))
setTxtProgressBar(pb, (i*100)/nrow(combinations))
}
#step1b generate markers based on union of markers for a celltype
markers = list()
for (i in 1:length(clusters)) {
sub.list = DEgenes[grep(paste('id1_',clusters[i], "_",sep=""),names(DEgenes))]
gene.union = vector()
for (j in 1:length(sub.list)){
gene.union = union(gene.union,sub.list[[j]])
}
markers[[as.character(clusters)[i]]] = gene.union
rm(list=c("sub.list","gene.union"))
}
return(list(DEgenes,markers))
}
#Step 2: replace markers with eggnog names
# and generate table with number of markers and number of eggnog markers per cell type
Eggnog_list_replace = function(genelist_list, species = c('turtle','lizard','mouse','human','chicken')){
#Step1: Load eggnog tables
if (species=="turtle") eggnog = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species=="lizard") eggnog = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species=="mouse") {
eggnog = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog[2:nrow(eggnog),1] = toupper(eggnog[2:nrow(eggnog),1])
for (i in 1:length(genelist_list)){
genelist_list[[i]] = toupper(genelist_list[[i]])
}
}
if (species=="human") eggnog = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species=="chicken") eggnog = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog) = eggnog[1,]
eggnog = eggnog[2:nrow(eggnog),]
#Step2 replace names
eggnog_list = list()
for (i in 1:length(genelist_list)) {
eggnog_list[[i]] = eggnog[eggnog[,1] %in% genelist_list[[i]],2]
names(eggnog_list)[i] = names(genelist_list)[i]
}
#Step3 generate summary table to append to eggnog_list
summary.matrix = cbind(sapply(genelist_list,length),sapply(eggnog_list,length))
colnames(summary.matrix) = c('nMarkers','nMarkers_eggnog')
rownames(summary.matrix) = names(genelist_list)
eggnog_list[['summary']] = summary.matrix
return(eggnog_list)
}
#Step 3: p values for overlap
hygecdf_matrix = function(eggnog.list1, sp1='tur', eggnog.list2, sp2='liz'){
names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
summaries = list(eggnog.list1[[paste(sp1,'summary', sep="_")]], eggnog.list2[[paste(sp2,'summary', sep="_")]])
eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
gene.list = c(eggnog.list1, eggnog.list2)
hyg = matrix(nrow=length(gene.list),ncol=length(gene.list))
colnames(hyg) = c(names(gene.list))
rownames(hyg) = c(names(gene.list))
JSC = matrix(nrow=length(gene.list),ncol=length(gene.list))
colnames(JSC) = c(names(gene.list))
rownames(JSC) = c(names(gene.list))
nGenes = vector()
for(i in 1:length(gene.list)){
nGenes = union(nGenes,gene.list[[i]])
}
nGenes = length(nGenes)
#nGenes = 10089 #number of turtle v lizard eggnog genes
for (i in 1:length(gene.list)){
m = length(gene.list[[i]])
n = nGenes - m
n4BHcorrect = nrow(hyg)*ncol(hyg)
for (j in 1:length(gene.list)){
k = length(gene.list[[i]])
q = length(intersect(gene.list[[i]],gene.list[[j]]))
p.value = phyper(q,m,n,k, lower.tail=F)
#BH correct p values in hyg
p.value = p.adjust(p.value, method='BH',n=n4BHcorrect)
hyg[i,j] = p.value
u = length(union(gene.list[[i]],gene.list[[j]]))
JSC[i,j] = q/u
rm(list=c('k','q','u'))
}
rm(m)
}
negLog10 = -log10(hyg)
return.list = list(hyg,negLog10,JSC,summaries)
names(return.list) = c('hypergeometric_distribution','negative_log10','JSC','summaries')
return(return.list)
}
#Just Calculate Overlap
overlap = function(eggnog.list1, sp1='sp1', eggnog.list2, sp2='sp2'){
names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
gene.list = c(eggnog.list1, eggnog.list2)
overlap.matrix = matrix(nrow=length(gene.list),ncol=length(gene.list))
colnames(overlap.matrix) = c(names(gene.list))
rownames(overlap.matrix) = c(names(gene.list))
a = combn(1:ncol(overlap.matrix),2)
for (i in 1:ncol(a)){
cell1 = a[1,i]
cell2 = a[2,i]
nGenes = length(intersect(gene.list[[cell1]],gene.list[[cell2]]))
overlap.matrix[cell1,cell2] = nGenes
overlap.matrix[cell2,cell1] = nGenes
}
for(i in 1:nrow(overlap.matrix)){
overlap.matrix[i,i] = length(gene.list[[i]])
}
return(overlap.matrix)
}
#Fake Binarization
fake.binary = function(eggnog.list1, sp1='sp1', eggnog.list2, sp2='sp2'){
names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
gene.list = c(eggnog.list1, eggnog.list2)
#step1: create list of markers for analysis
markers.sp1 = vector()
for(i in 1:length(eggnog.list1)){
markers.sp1 = union(markers.sp1,eggnog.list1[[i]])
}
markers.sp2 = vector()
for(i in 1:length(eggnog.list2)){
markers.sp2 = union(markers.sp2,eggnog.list2[[i]])
}
#markers = union(markers.sp1,markers.sp2)
markers = intersect(markers.sp1,markers.sp2)
#step2: create matrix of presence/absence of markers
expression.matrix = data.frame(row.names=markers)
for (i in 1:length(gene.list)){
expression.matrix[rownames(expression.matrix) %in% gene.list[[i]],names(gene.list)[i]] = 1
expression.matrix[!(rownames(expression.matrix) %in% gene.list[[i]]),names(gene.list)[i]] = 0
}
#step3: correlation (pearson)
Corr.Coeff.Table = diag(x = 1, length(gene.list), length(gene.list))
rownames(Corr.Coeff.Table) = names(gene.list)
colnames(Corr.Coeff.Table) = names(gene.list)
p.value.table = matrix(ncol=length(gene.list), nrow = length(gene.list))
rownames(p.value.table) = names(gene.list)
colnames(p.value.table) = names(gene.list)
a = combn(1:length(gene.list),2)
n4BHcorrect = length(gene.list)*length(gene.list)
for (i in 1:ncol(a)){
cell_type1 = names(gene.list)[a[1,i]]
cell_type2 = names(gene.list)[a[2,i]]
b = suppressWarnings(cor.test(expression.matrix[,cell_type1], expression.matrix[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
rm(list =c('cell_type1', 'cell_type2','b'))
}
neg.log10.p = -log10(p.value.table)
#return
list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,expression.matrix)
names(list.to.return) = c('correlation_coeff','p_values','neg_log10_p','expression_table')
return(list.to.return)
}
##########################Old Scaling Methods###########
#scaling.method options
#'minmax' sets min at 0 and max at 1
#'max' expression as fraction of max
#'yao' expression according to Yao et al Cell Stem Cell 2017
ScaledCorrelation = function(scaling.method = c('minmax','max','yao'),ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
#Step1: Load eggnog tables
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="mouse") {
eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
DEgenesSpecies1 = toupper(DEgenesSpecies1)
rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
}
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="mouse") {
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
DEgenesSpecies2 = toupper(DEgenesSpecies2)
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
}
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog1) = eggnog1[1,]
eggnog1 = eggnog1[2:nrow(eggnog1),]
colnames(eggnog2) = eggnog2[1,]
eggnog2 = eggnog2[2:nrow(eggnog2),]
#Step2: Take union of DEgenes for analysis
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
#Step3: Prune Expression Tables & Remove rows with no expression
ExpressionTableSpecies1 = ExpressionTableSpecies1[rowSums(ExpressionTableSpecies1)!=0,]
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
ExpressionTableSpecies2 = ExpressionTableSpecies2[rowSums(ExpressionTableSpecies2)!=0,]
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
#Step4: Replace Gene names with Eggnog name
Sp1[,ncol(Sp1)+1] = rownames(Sp1)
colnames(Sp1)[ncol(Sp1)] = species1
Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
rownames(Sp1) = Sp1$eggnog
Sp1 = Sp1[,3:ncol(Sp1)]
Sp2[,ncol(Sp2)+1] = rownames(Sp2)
colnames(Sp2)[ncol(Sp2)] = species2
Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
rownames(Sp2) = Sp2$eggnog
Sp2 = Sp2[,3:ncol(Sp2)]
#Step5: Scale Expression Tables by scaling.method
if(scaling.method=='minmax'){
gene_min = apply(Sp1, 1, min)
Sp1 = sweep(Sp1,1,gene_min,"-")
gene_max = apply(Sp1, 1, max)
Sp1 = sweep(Sp1,1,gene_max,"/")
rm(list=c('gene_min','gene_max'))
gene_min = apply(Sp2, 1, min)
Sp2 = sweep(Sp2,1,gene_min,"-")
gene_max = apply(Sp2, 1, max)
Sp2 = sweep(Sp2,1,gene_max,"/")
rm(list=c('gene_min','gene_max'))
}
if(scaling.method=='max'){
gene_max = apply(Sp1, 1, max)
Sp1 = sweep(Sp1,1,gene_max,"/")
rm(gene_max)
gene_max = apply(Sp2, 1, max)
Sp2 = sweep(Sp2,1,gene_max,"/")
rm(gene_max)
}
if(scaling.method=='yao'){
gene_max = apply(Sp1, 1, max)
gene_min = apply(Sp1, 1, min)
#Determine median for entire dataset, not just pruned data!!!
med = apply(ExpressionTableSpecies1,1,max)
med = median(med)
for (i in 1:nrow(Sp1)) {
if (gene_max[i]>med){
Sp1[i,] = sapply(Sp1[i,],function(x)(x-gene_min[i])/gene_max[i])
}
else {
Sp1[i,] = sapply(Sp1[i,],function(x)(x-gene_min[i])/med)
}
}
rm(list=c('gene_max','gene_min','med'))
gene_max = apply(Sp2, 1, max)
gene_min = apply(Sp2, 1, min)
med = apply(ExpressionTableSpecies2,1,max)
med = median(med)
for (i in 1:nrow(Sp2)) {
if (gene_max[i]>med){
Sp2[i,] = sapply(Sp2[i,],function(x)(x-gene_min[i])/gene_max[i])
}
else {
Sp2[i,] = sapply(Sp2[i,],function(x)(x-gene_min[i])/med)
}
}
rm(list=c('gene_max','gene_min','med'))
}
#Step6: Merge Expression Tables
geTable = merge(Sp1,Sp2, by='row.names', all=F)
rownames(geTable) = geTable$Row.names
geTable = geTable[,2:ncol(geTable)]
#Step7: Correlation
Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
rownames(Corr.Coeff.Table) = colnames(geTable)
colnames(Corr.Coeff.Table) = colnames(geTable)
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(p.value.table) = colnames(geTable)
colnames(p.value.table) = colnames(geTable)
a = combn(1:ncol(geTable),2)
n4BHcorrect = ncol(geTable)*ncol(geTable)
for (i in 1:ncol(a)){
cell_type1 = colnames(geTable)[a[1,i]]
cell_type2 = colnames(geTable)[a[2,i]]
b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
rm(list =c('cell_type1', 'cell_type2','b'))
}
neg.log10.p = -log10(p.value.table)
list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p)
names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value')
return(list.to.return)
}