Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
f03cddfadf
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
346 lines (288 sloc) 14.8 KB
################# General Correlation (choose method), Permutation Test, all genes ################
# requires:
# tables with the gene functional annotations generated by eggnog
# input:
# ExpressionTableSpecies1 - must be a data.frame
# species1 = c('turtle','lizard','mouse')
# DEgenesSpecies1
# ExpressionTableSpecies2 - must be a data.frame
# species1 = c('turtle','lizard','mouse','human','chicken')
# DEgenesSpecies2
# nPermutations = 1000
# genes.use = c('intersect','union','species1','species2')
# corr.method = c('spearman', 'pearson') etc.
# returns:
# list with
# corr.coeff
# shuffled_correlation_score_means
# p.value
# negative_log10_p.value
# DEgenes_intersect
# DEgenes_in_analysis
# nDEgenes_intersect
# nDEgenes_in_analysis
# nDEgenes_Sp1
# nDEgenes_Sp2
SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){
#Step1: Load eggnog tables
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="mouse") {
eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
DEgenesSpecies1 = toupper(DEgenesSpecies1)
rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
}
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="mouse") {
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
DEgenesSpecies2 = toupper(DEgenesSpecies2)
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
}
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog1) = eggnog1[1,]
eggnog1 = eggnog1[2:nrow(eggnog1),]
colnames(eggnog2) = eggnog2[1,]
eggnog2 = eggnog2[2:nrow(eggnog2),]
#Step2: Take intersect, union, species1 or species2 of DEgenes for analysis
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
nDESp1 = length(DEgenesSpecies1)
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
nDESp2 = length(DEgenesSpecies2)
#genes.use = c('intersect','union','species1','species2')
if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
if (genes.use=='species1') DEgenes = DEgenesSpecies1
if (genes.use=='species2') DEgenes = DEgenesSpecies2
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
#Step3: Prune Expression Tables & Remove rows with no expression
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
Sp1 = Sp1[rowSums (Sp1)!=0,]
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
Sp2 = Sp2[rowSums (Sp2)!=0,]
#Step4: Replace Gene names with Eggnog name
Sp1[,ncol(Sp1)+1] = rownames(Sp1)
colnames(Sp1)[ncol(Sp1)] = species1
Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
rownames(Sp1) = Sp1$eggnog
Sp1 = Sp1[,3:ncol(Sp1)]
Sp2[,ncol(Sp2)+1] = rownames(Sp2)
colnames(Sp2)[ncol(Sp2)] = species2
Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
rownames(Sp2) = Sp2$eggnog
Sp2 = Sp2[,3:ncol(Sp2)]
#Step5: Scale Expression Tables by gene average
avg = rowMeans(Sp1)
Sp1 = sweep(Sp1,1,avg,"/")
rm(avg)
avg = rowMeans(Sp2)
Sp2 = sweep(Sp2,1,avg,"/")
rm(avg)
#Step6: Merge Expression Tables
geTable = merge(Sp1,Sp2, by='row.names', all=F)
rownames(geTable) = geTable$Row.names
geTable = geTable[,2:ncol(geTable)]
#Step7: Correlation
#7a: Correlation
Corr.Coeff.Table = cor(geTable,method=corr.method)
#7b: Shuffle data
shuffled.cor.list = list()
pb <- txtProgressBar(1, 100, style=3)
for (i in 1:nPermutations){
shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
shuffled = cbind(t(shuffled),t(shuffled2))
shuffled.cor = cor(shuffled,method=corr.method)
shuffled.cor.list[[i]] = shuffled.cor
rm(list=c('shuffled','shuffled2','shuffled.cor'))
if ((i %% 100) ==0){
setTxtProgressBar(pb, (i*100)/nPermutations)
}
}
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(p.value.table) = colnames(geTable)
colnames(p.value.table) = colnames(geTable)
shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(shuffled.mean.table) = colnames(geTable)
colnames(shuffled.mean.table) = colnames(geTable)
a = combn(1:ncol(geTable),2)
for (i in 1:ncol(a)){
cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
p.value.table[a[1,i],a[2,i]] = p.value
p.value.table[a[2,i],a[1,i]] = p.value
rm(list=c('cor.scores','p.value'))
setTxtProgressBar(pb, (i*100)/ncol(a))
}
neg.log10.p = -log10(p.value.table)
#step8 "Overlap in Markers"
#for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells
#from above a = combn(1:ncol(geTable),2)
marker.overlap.list = list()
for (i in 1:ncol(a)){
datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]])
markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,])
marker.overlap.list[[i]] = markers
names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_')
rm(list=c('datasubset','markers'))
}
list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list)
names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers')
return(list.to.return)
}
############ General Correlation (choose method), Permutation Test, transcription factors #############
# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563 (transcriptional activator activity) and GO:0016564 (transcriptional repressor activity).
# requires:
# tables with the gene functional annotations generated by eggnog
# list of transcription factors
# input and output as above
SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){
#Step1: Load eggnog tables
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species1=="mouse") {
eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
DEgenesSpecies1 = toupper(DEgenesSpecies1)
rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
}
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="mouse") {
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
DEgenesSpecies2 = toupper(DEgenesSpecies2)
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
}
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
colnames(eggnog1) = eggnog1[1,]
eggnog1 = eggnog1[2:nrow(eggnog1),]
colnames(eggnog2) = eggnog2[1,]
eggnog2 = eggnog2[2:nrow(eggnog2),]
#Step2: Take intersect, union, species1 or species2 of DEgenes for analysis
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
nDESp1 = length(DEgenesSpecies1)
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
nDESp2 = length(DEgenesSpecies2)
#genes.use = c('intersect','union','species1','species2')
if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
if (genes.use=='species1') DEgenes = DEgenesSpecies1
if (genes.use=='species2') DEgenes = DEgenesSpecies2
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
#Step3: Prune Expression Tables & Remove rows with no expression
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
Sp1 = Sp1[rowSums (Sp1)!=0,]
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
Sp2 = Sp2[rowSums (Sp2)!=0,]
#Step4: Replace Gene names with Eggnog name
Sp1[,ncol(Sp1)+1] = rownames(Sp1)
colnames(Sp1)[ncol(Sp1)] = species1
Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
rownames(Sp1) = Sp1$eggnog
Sp1 = Sp1[,3:ncol(Sp1)]
Sp2[,ncol(Sp2)+1] = rownames(Sp2)
colnames(Sp2)[ncol(Sp2)] = species2
Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
rownames(Sp2) = Sp2$eggnog
Sp2 = Sp2[,3:ncol(Sp2)]
# Step5: Take only transcription factors
TFs_list <- read.table("170712_TFs_list_ensembl.txt")
TFs_list <- as.character(TFs_list[,1])
Sp1 <- Sp1[rownames(Sp1) %in% TFs_list,]
Sp2 <- Sp2[rownames(Sp2) %in% TFs_list,]
#Step6: Scale Expression Tables by gene average
avg = rowMeans(Sp1)
Sp1 = sweep(Sp1,1,avg,"/")
rm(avg)
avg = rowMeans(Sp2)
Sp2 = sweep(Sp2,1,avg,"/")
rm(avg)
#Step7: Merge Expression Tables
geTable = merge(Sp1,Sp2, by='row.names', all=F)
rownames(geTable) = geTable$Row.names
geTable = geTable[,2:ncol(geTable)]
#Step8: Correlation
#8a: Correlation
Corr.Coeff.Table = cor(geTable,method=corr.method)
#8b: Shuffle data
shuffled.cor.list = list()
pb <- txtProgressBar(1, 100, style=3)
for (i in 1:nPermutations){
shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
shuffled = cbind(t(shuffled),t(shuffled2))
shuffled.cor = cor(shuffled,method=corr.method)
shuffled.cor.list[[i]] = shuffled.cor
rm(list=c('shuffled','shuffled2','shuffled.cor'))
if ((i %% 100) ==0){
setTxtProgressBar(pb, (i*100)/nPermutations)
}
}
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(p.value.table) = colnames(geTable)
colnames(p.value.table) = colnames(geTable)
shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
rownames(shuffled.mean.table) = colnames(geTable)
colnames(shuffled.mean.table) = colnames(geTable)
a = combn(1:ncol(geTable),2)
for (i in 1:ncol(a)){
cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
p.value.table[a[1,i],a[2,i]] = p.value
p.value.table[a[2,i],a[1,i]] = p.value
rm(list=c('cor.scores','p.value'))
setTxtProgressBar(pb, (i*100)/ncol(a))
}
neg.log10.p = -log10(p.value.table)
#step9 "Overlap in Markers"
#for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells
#from above a = combn(1:ncol(geTable),2)
marker.overlap.list = list()
for (i in 1:ncol(a)){
datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]])
markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,])
marker.overlap.list[[i]] = markers
names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_')
rm(list=c('datasubset','markers'))
}
list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list)
names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers')
return(list.to.return)
}
################# NoisyGenes function ################
# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy, and can be removed before running comparative analysis
# clusters.use = character vectors of the clusters ids to include in the analysis
# Example for comparative analysis:
# object_noisy_genes <- NoisyGenes(object, min.pct=0.2, clusters.use=as.character(levels(object@ident)))
# object_avg_expr <- AverageExpression(object)
# ExpressionTableSpecies1 <- object_avg_expr[!rownames(object_avg_expr) %in% object_noisy_genes,]
NoisyGenes <- function(object, min.pct=0.2, clusters.use) {
clusters <- clusters.use
genes.use = rownames(object@data)
object.raw.data <- as.matrix(object@raw.data)
pct.matrix = matrix(data=NA, nrow=length(genes.use), ncol=length(clusters))
rownames(pct.matrix) <- genes.use
colnames(pct.matrix) <- clusters
thresh.min=0
for (i in clusters){
cells.cluster <- names(object@ident[object@ident==i])
data.cluster <- object.raw.data[,colnames(object.raw.data) %in% cells.cluster]
pct.cluster <- round(apply(object.raw.data[genes.use, cells.cluster, drop = F],1,function(x)return(length(x[x>thresh.min])/length(x))),3)
pct.matrix[,i] <- pct.cluster
}
pct.max <- rowMaxs(pct.matrix)
names(pct.max) <- genes.use
noisy.genes <- names(pct.max[pct.max < min.pct])
return(noisy.genes)
}