This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
ReptilePallium/Comparative_analysis/Comparative_functions.R
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
346 lines (288 sloc)
14.8 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################# General Correlation (choose method), Permutation Test, all genes ################ | |
# requires: | |
# tables with the gene functional annotations generated by eggnog | |
# input: | |
# ExpressionTableSpecies1 - must be a data.frame | |
# species1 = c('turtle','lizard','mouse') | |
# DEgenesSpecies1 | |
# ExpressionTableSpecies2 - must be a data.frame | |
# species1 = c('turtle','lizard','mouse','human','chicken') | |
# DEgenesSpecies2 | |
# nPermutations = 1000 | |
# genes.use = c('intersect','union','species1','species2') | |
# corr.method = c('spearman', 'pearson') etc. | |
# returns: | |
# list with | |
# corr.coeff | |
# shuffled_correlation_score_means | |
# p.value | |
# negative_log10_p.value | |
# DEgenes_intersect | |
# DEgenes_in_analysis | |
# nDEgenes_intersect | |
# nDEgenes_in_analysis | |
# nDEgenes_Sp1 | |
# nDEgenes_Sp2 | |
SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){ | |
#Step1: Load eggnog tables | |
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species1=="mouse") { | |
eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1]) | |
DEgenesSpecies1 = toupper(DEgenesSpecies1) | |
rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1)) | |
} | |
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species2=="mouse") { | |
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1]) | |
DEgenesSpecies2 = toupper(DEgenesSpecies2) | |
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2)) | |
} | |
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
colnames(eggnog1) = eggnog1[1,] | |
eggnog1 = eggnog1[2:nrow(eggnog1),] | |
colnames(eggnog2) = eggnog2[1,] | |
eggnog2 = eggnog2[2:nrow(eggnog2),] | |
#Step2: Take intersect, union, species1 or species2 of DEgenes for analysis | |
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2] | |
nDESp1 = length(DEgenesSpecies1) | |
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2] | |
nDESp2 = length(DEgenesSpecies2) | |
#genes.use = c('intersect','union','species1','species2') | |
if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2) | |
if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2) | |
if (genes.use=='species1') DEgenes = DEgenesSpecies1 | |
if (genes.use=='species2') DEgenes = DEgenesSpecies2 | |
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1] | |
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1] | |
#Step3: Prune Expression Tables & Remove rows with no expression | |
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,] | |
Sp1 = Sp1[rowSums (Sp1)!=0,] | |
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,] | |
Sp2 = Sp2[rowSums (Sp2)!=0,] | |
#Step4: Replace Gene names with Eggnog name | |
Sp1[,ncol(Sp1)+1] = rownames(Sp1) | |
colnames(Sp1)[ncol(Sp1)] = species1 | |
Sp1 = merge(eggnog1, Sp1, by =species1, all = F) | |
rownames(Sp1) = Sp1$eggnog | |
Sp1 = Sp1[,3:ncol(Sp1)] | |
Sp2[,ncol(Sp2)+1] = rownames(Sp2) | |
colnames(Sp2)[ncol(Sp2)] = species2 | |
Sp2 = merge(eggnog2, Sp2, by =species2, all = F) | |
rownames(Sp2) = Sp2$eggnog | |
Sp2 = Sp2[,3:ncol(Sp2)] | |
#Step5: Scale Expression Tables by gene average | |
avg = rowMeans(Sp1) | |
Sp1 = sweep(Sp1,1,avg,"/") | |
rm(avg) | |
avg = rowMeans(Sp2) | |
Sp2 = sweep(Sp2,1,avg,"/") | |
rm(avg) | |
#Step6: Merge Expression Tables | |
geTable = merge(Sp1,Sp2, by='row.names', all=F) | |
rownames(geTable) = geTable$Row.names | |
geTable = geTable[,2:ncol(geTable)] | |
#Step7: Correlation | |
#7a: Correlation | |
Corr.Coeff.Table = cor(geTable,method=corr.method) | |
#7b: Shuffle data | |
shuffled.cor.list = list() | |
pb <- txtProgressBar(1, 100, style=3) | |
for (i in 1:nPermutations){ | |
shuffled = apply(geTable[,1:ncol(Sp1)],1,sample) | |
shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample) | |
shuffled = cbind(t(shuffled),t(shuffled2)) | |
shuffled.cor = cor(shuffled,method=corr.method) | |
shuffled.cor.list[[i]] = shuffled.cor | |
rm(list=c('shuffled','shuffled2','shuffled.cor')) | |
if ((i %% 100) ==0){ | |
setTxtProgressBar(pb, (i*100)/nPermutations) | |
} | |
} | |
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable)) | |
rownames(p.value.table) = colnames(geTable) | |
colnames(p.value.table) = colnames(geTable) | |
shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable)) | |
rownames(shuffled.mean.table) = colnames(geTable) | |
colnames(shuffled.mean.table) = colnames(geTable) | |
a = combn(1:ncol(geTable),2) | |
for (i in 1:ncol(a)){ | |
cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i]) | |
shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores) | |
shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores) | |
p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]])) | |
p.value.table[a[1,i],a[2,i]] = p.value | |
p.value.table[a[2,i],a[1,i]] = p.value | |
rm(list=c('cor.scores','p.value')) | |
setTxtProgressBar(pb, (i*100)/ncol(a)) | |
} | |
neg.log10.p = -log10(p.value.table) | |
#step8 "Overlap in Markers" | |
#for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells | |
#from above a = combn(1:ncol(geTable),2) | |
marker.overlap.list = list() | |
for (i in 1:ncol(a)){ | |
datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]]) | |
markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,]) | |
marker.overlap.list[[i]] = markers | |
names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_') | |
rm(list=c('datasubset','markers')) | |
} | |
list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list) | |
names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers') | |
return(list.to.return) | |
} | |
############ General Correlation (choose method), Permutation Test, transcription factors ############# | |
# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563 (transcriptional activator activity) and GO:0016564 (transcriptional repressor activity). | |
# requires: | |
# tables with the gene functional annotations generated by eggnog | |
# list of transcription factors | |
# input and output as above | |
SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){ | |
#Step1: Load eggnog tables | |
if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species1=="mouse") { | |
eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1]) | |
DEgenesSpecies1 = toupper(DEgenesSpecies1) | |
rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1)) | |
} | |
if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species2=="mouse") { | |
eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1]) | |
DEgenesSpecies2 = toupper(DEgenesSpecies2) | |
rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2)) | |
} | |
if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F) | |
colnames(eggnog1) = eggnog1[1,] | |
eggnog1 = eggnog1[2:nrow(eggnog1),] | |
colnames(eggnog2) = eggnog2[1,] | |
eggnog2 = eggnog2[2:nrow(eggnog2),] | |
#Step2: Take intersect, union, species1 or species2 of DEgenes for analysis | |
DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2] | |
nDESp1 = length(DEgenesSpecies1) | |
DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2] | |
nDESp2 = length(DEgenesSpecies2) | |
#genes.use = c('intersect','union','species1','species2') | |
if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2) | |
if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2) | |
if (genes.use=='species1') DEgenes = DEgenesSpecies1 | |
if (genes.use=='species2') DEgenes = DEgenesSpecies2 | |
DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1] | |
DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1] | |
#Step3: Prune Expression Tables & Remove rows with no expression | |
Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,] | |
Sp1 = Sp1[rowSums (Sp1)!=0,] | |
Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,] | |
Sp2 = Sp2[rowSums (Sp2)!=0,] | |
#Step4: Replace Gene names with Eggnog name | |
Sp1[,ncol(Sp1)+1] = rownames(Sp1) | |
colnames(Sp1)[ncol(Sp1)] = species1 | |
Sp1 = merge(eggnog1, Sp1, by =species1, all = F) | |
rownames(Sp1) = Sp1$eggnog | |
Sp1 = Sp1[,3:ncol(Sp1)] | |
Sp2[,ncol(Sp2)+1] = rownames(Sp2) | |
colnames(Sp2)[ncol(Sp2)] = species2 | |
Sp2 = merge(eggnog2, Sp2, by =species2, all = F) | |
rownames(Sp2) = Sp2$eggnog | |
Sp2 = Sp2[,3:ncol(Sp2)] | |
# Step5: Take only transcription factors | |
TFs_list <- read.table("170712_TFs_list_ensembl.txt") | |
TFs_list <- as.character(TFs_list[,1]) | |
Sp1 <- Sp1[rownames(Sp1) %in% TFs_list,] | |
Sp2 <- Sp2[rownames(Sp2) %in% TFs_list,] | |
#Step6: Scale Expression Tables by gene average | |
avg = rowMeans(Sp1) | |
Sp1 = sweep(Sp1,1,avg,"/") | |
rm(avg) | |
avg = rowMeans(Sp2) | |
Sp2 = sweep(Sp2,1,avg,"/") | |
rm(avg) | |
#Step7: Merge Expression Tables | |
geTable = merge(Sp1,Sp2, by='row.names', all=F) | |
rownames(geTable) = geTable$Row.names | |
geTable = geTable[,2:ncol(geTable)] | |
#Step8: Correlation | |
#8a: Correlation | |
Corr.Coeff.Table = cor(geTable,method=corr.method) | |
#8b: Shuffle data | |
shuffled.cor.list = list() | |
pb <- txtProgressBar(1, 100, style=3) | |
for (i in 1:nPermutations){ | |
shuffled = apply(geTable[,1:ncol(Sp1)],1,sample) | |
shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample) | |
shuffled = cbind(t(shuffled),t(shuffled2)) | |
shuffled.cor = cor(shuffled,method=corr.method) | |
shuffled.cor.list[[i]] = shuffled.cor | |
rm(list=c('shuffled','shuffled2','shuffled.cor')) | |
if ((i %% 100) ==0){ | |
setTxtProgressBar(pb, (i*100)/nPermutations) | |
} | |
} | |
p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable)) | |
rownames(p.value.table) = colnames(geTable) | |
colnames(p.value.table) = colnames(geTable) | |
shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable)) | |
rownames(shuffled.mean.table) = colnames(geTable) | |
colnames(shuffled.mean.table) = colnames(geTable) | |
a = combn(1:ncol(geTable),2) | |
for (i in 1:ncol(a)){ | |
cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i]) | |
shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores) | |
shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores) | |
p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]])) | |
p.value.table[a[1,i],a[2,i]] = p.value | |
p.value.table[a[2,i],a[1,i]] = p.value | |
rm(list=c('cor.scores','p.value')) | |
setTxtProgressBar(pb, (i*100)/ncol(a)) | |
} | |
neg.log10.p = -log10(p.value.table) | |
#step9 "Overlap in Markers" | |
#for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells | |
#from above a = combn(1:ncol(geTable),2) | |
marker.overlap.list = list() | |
for (i in 1:ncol(a)){ | |
datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]]) | |
markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,]) | |
marker.overlap.list[[i]] = markers | |
names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_') | |
rm(list=c('datasubset','markers')) | |
} | |
list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list) | |
names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers') | |
return(list.to.return) | |
} | |
################# NoisyGenes function ################ | |
# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy, and can be removed before running comparative analysis | |
# clusters.use = character vectors of the clusters ids to include in the analysis | |
# Example for comparative analysis: | |
# object_noisy_genes <- NoisyGenes(object, min.pct=0.2, clusters.use=as.character(levels(object@ident))) | |
# object_avg_expr <- AverageExpression(object) | |
# ExpressionTableSpecies1 <- object_avg_expr[!rownames(object_avg_expr) %in% object_noisy_genes,] | |
NoisyGenes <- function(object, min.pct=0.2, clusters.use) { | |
clusters <- clusters.use | |
genes.use = rownames(object@data) | |
object.raw.data <- as.matrix(object@raw.data) | |
pct.matrix = matrix(data=NA, nrow=length(genes.use), ncol=length(clusters)) | |
rownames(pct.matrix) <- genes.use | |
colnames(pct.matrix) <- clusters | |
thresh.min=0 | |
for (i in clusters){ | |
cells.cluster <- names(object@ident[object@ident==i]) | |
data.cluster <- object.raw.data[,colnames(object.raw.data) %in% cells.cluster] | |
pct.cluster <- round(apply(object.raw.data[genes.use, cells.cluster, drop = F],1,function(x)return(length(x[x>thresh.min])/length(x))),3) | |
pct.matrix[,i] <- pct.cluster | |
} | |
pct.max <- rowMaxs(pct.matrix) | |
names(pct.max) <- genes.use | |
noisy.genes <- names(pct.max[pct.max < min.pct]) | |
return(noisy.genes) | |
} | |