final updates

MPIBR · May 3, 2018 · f03cddf · f03cddf
1 parent fe016fb
commit f03cddf
Show file tree

Hide file tree

Showing 13 changed files with 965 additions and 2,463 deletions.
diff --git a/Comparative_analysis/ComparativeAnalysis_CleanFunctions.R b/Comparative_analysis/ComparativeAnalysis_CleanFunctions.R
diff --git a/...rative_analysis/Comparative_Permutation.R → Comparative_analysis/Comparative_functions.R b/...rative_analysis/Comparative_Permutation.R → Comparative_analysis/Comparative_functions.R
@@ -1,311 +1,32 @@
-#######################Specificity Correlation, Permutation Test################
-##########################Intersect of DEgene lists#############################
 
-#input:
-#ExpressionTableSpecies1 - must be a data.frame
-#species1 = c('turtle','lizard','mouse')
-#DEgenesSpecies1
-#ExpressionTableSpecies2 - must be a data.frame
-#species1 = c('turtle','lizard','mouse','human','chicken')
-#DEgenesSpecies2
-#nPermutations = 1000
-
-#returns:
-#list with
-#corr.coeff
-#shuffled_correlation_score_means
-#p.value
-#negative_log10_p.value
-#DEgenes_intersect
-#DEgenes_in_analysis
-#nDEgenes_intersect
-#nDEgenes_in_analysis
-#nDEgenes_Sp1
-#nDEgenes_Sp2
-
-SpIntPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000){
-
-  #Step1:  Load eggnog tables     
-  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="mouse") {
-    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
-    DEgenesSpecies1 = toupper(DEgenesSpecies1)
-    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
-  }
-  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="mouse") {
-    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-    DEgenesSpecies2 = toupper(DEgenesSpecies2)
-    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-  }
-  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-
-  colnames(eggnog1) = eggnog1[1,]
-  eggnog1 = eggnog1[2:nrow(eggnog1),]
-  colnames(eggnog2) = eggnog2[1,]
-  eggnog2 = eggnog2[2:nrow(eggnog2),]
-
-  #Step2: Take intersect of DEgenes for analysis
-  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-  nDESp1 = length(DEgenesSpecies1)
-  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-  nDESp2 = length(DEgenesSpecies2)
-  DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
-  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-
-  #Step3: Prune Expression Tables & Remove rows with no expression
-  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-  Sp1 = Sp1[rowSums (Sp1)!=0,]
-  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-  Sp2 = Sp2[rowSums (Sp2)!=0,]
-
-  #Step4: Replace Gene names with Eggnog name
-  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-  colnames(Sp1)[ncol(Sp1)] = species1
-  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-  rownames(Sp1) = Sp1$eggnog
-  Sp1 = Sp1[,3:ncol(Sp1)]
-
-  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-  colnames(Sp2)[ncol(Sp2)] = species2
-  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-  rownames(Sp2) = Sp2$eggnog
-  Sp2 = Sp2[,3:ncol(Sp2)]
-
-  #Step5: Scale Expression Tables by gene average
-  avg = rowMeans(Sp1)
-  Sp1 = sweep(Sp1,1,avg,"/")
-  rm(avg)
-  avg = rowMeans(Sp2)
-  Sp2 = sweep(Sp2,1,avg,"/")
-  rm(avg)
-
-  #Step6: Merge Expression Tables
-  geTable = merge(Sp1,Sp2, by='row.names', all=F)
-  rownames(geTable) = geTable$Row.names
-  geTable = geTable[,2:ncol(geTable)]
-
-  #Step7:  Correlation
-  #7a:  Pearson correlation
-  Corr.Coeff.Table = cor(geTable,method='pearson')
-
-  #7b:  Shuffle data
-  shuffled.cor.list = list()
-  pb   <- txtProgressBar(1, 100, style=3)
-
-  for (i in 1:nPermutations){
-    shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
-    shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
-    shuffled = cbind(t(shuffled),t(shuffled2))
-    shuffled.cor = cor(shuffled,method='pearson')
-    shuffled.cor.list[[i]] = shuffled.cor
-    rm(list=c('shuffled','shuffled2','shuffled.cor'))
-    if ((i %% 100) ==0){
-      setTxtProgressBar(pb, (i*100)/nPermutations)
-    }
-  }
-
-  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(p.value.table) = colnames(geTable)
-  colnames(p.value.table) = colnames(geTable)
-
-  shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(shuffled.mean.table) = colnames(geTable)
-  colnames(shuffled.mean.table) = colnames(geTable)
-
-  a = combn(1:ncol(geTable),2)
-  pb2   <- txtProgressBar(1, 100, style=3)
-  for (i in 1:ncol(a)){
-    cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
-    shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
-    shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
-    p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
-    p.value.table[a[1,i],a[2,i]] = p.value
-    p.value.table[a[2,i],a[1,i]] = p.value
-    rm(list=c('cor.scores','p.value'))
-    setTxtProgressBar(pb2, (i*100)/ncol(a))
-  }
-
-  neg.log10.p = -log10(p.value.table)
-
-  list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
-  names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2')
-  return(list.to.return)
-
-}
-
-##########################Union of DEgene lists#############################
-
-#input:
-#ExpressionTableSpecies1 - must be a data.frame
-#species1 = c('turtle','lizard','mouse')
-#DEgenesSpecies1
-#ExpressionTableSpecies2 - must be a data.frame
-#species1 = c('turtle','lizard','mouse','human','chicken')
-#DEgenesSpecies2
-#nPermutations = 1000
-
-#returns:
-#list with
-#corr.coeff
-#shuffled_correlation_score_means
-#p.value
-#negative_log10_p.value
-#DEgenes_intersect
-#DEgenes_in_analysis
-#nDEgenes_intersect
-#nDEgenes_in_analysis
-#nDEgenes_Sp1
-#nDEgenes_Sp2
-
-SpUniPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000){
-
-  #Step1:  Load eggnog tables     
-  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="mouse") {
-    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
-    DEgenesSpecies1 = toupper(DEgenesSpecies1)
-    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
-  }
-  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="mouse") {
-    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-    DEgenesSpecies2 = toupper(DEgenesSpecies2)
-    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-  }
-  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-
-  colnames(eggnog1) = eggnog1[1,]
-  eggnog1 = eggnog1[2:nrow(eggnog1),]
-  colnames(eggnog2) = eggnog2[1,]
-  eggnog2 = eggnog2[2:nrow(eggnog2),]
-
-  #Step2: Take union of DEgenes for analysis
-  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-  nDESp1 = length(DEgenesSpecies1)
-  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-  nDESp2 = length(DEgenesSpecies2)
-  DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
-  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-
-  #Step3: Prune Expression Tables & Remove rows with no expression
-  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-  Sp1 = Sp1[rowSums (Sp1)!=0,]
-  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-  Sp2 = Sp2[rowSums (Sp2)!=0,]
-
-  #Step4: Replace Gene names with Eggnog name
-  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-  colnames(Sp1)[ncol(Sp1)] = species1
-  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-  rownames(Sp1) = Sp1$eggnog
-  Sp1 = Sp1[,3:ncol(Sp1)]
-
-  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-  colnames(Sp2)[ncol(Sp2)] = species2
-  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-  rownames(Sp2) = Sp2$eggnog
-  Sp2 = Sp2[,3:ncol(Sp2)]
-
-  #Step5: Scale Expression Tables by gene average
-  avg = rowMeans(Sp1)
-  Sp1 = sweep(Sp1,1,avg,"/")
-  rm(avg)
-  avg = rowMeans(Sp2)
-  Sp2 = sweep(Sp2,1,avg,"/")
-  rm(avg)
-
-  #Step6: Merge Expression Tables
-  geTable = merge(Sp1,Sp2, by='row.names', all=F)
-  rownames(geTable) = geTable$Row.names
-  geTable = geTable[,2:ncol(geTable)]
-
-  #Step7:  Correlation
-  #7a:  Pearson correlation
-  Corr.Coeff.Table = cor(geTable,method='pearson')
-
-  #7b:  Shuffle data
-  shuffled.cor.list = list()
-  pb   <- txtProgressBar(1, 100, style=3)
-
-  for (i in 1:nPermutations){
-    shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
-    shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
-    shuffled = cbind(t(shuffled),t(shuffled2))
-    shuffled.cor = cor(shuffled,method='pearson')
-    shuffled.cor.list[[i]] = shuffled.cor
-    rm(list=c('shuffled','shuffled2','shuffled.cor'))
-    if ((i %% 100) ==0){
-      setTxtProgressBar(pb, (i*100)/nPermutations)
-    }
-  }
-
-  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(p.value.table) = colnames(geTable)
-  colnames(p.value.table) = colnames(geTable)
-
-  shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(shuffled.mean.table) = colnames(geTable)
-  colnames(shuffled.mean.table) = colnames(geTable)
-
-  a = combn(1:ncol(geTable),2)
-  pb2   <- txtProgressBar(1, 100, style=3)
-  for (i in 1:ncol(a)){
-    cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
-    shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
-    shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
-    p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
-    p.value.table[a[1,i],a[2,i]] = p.value
-    p.value.table[a[2,i],a[1,i]] = p.value
-    rm(list=c('cor.scores','p.value'))
-    setTxtProgressBar(pb2, (i*100)/ncol(a))
-  }
-
-  neg.log10.p = -log10(p.value.table)
-
-  list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
-  names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2')
-  return(list.to.return)
-
-}
-
-#######################General Correlation (choose method), Permutation Test################
-
-#input:
-#ExpressionTableSpecies1 - must be a data.frame
-#species1 = c('turtle','lizard','mouse')
-#DEgenesSpecies1
-#ExpressionTableSpecies2 - must be a data.frame
-#species1 = c('turtle','lizard','mouse','human','chicken')
-#DEgenesSpecies2
-#nPermutations = 1000
-#genes.use = c('intersect','union','species1','species2')
-#corr.method = c('spearman', 'pearson') etc.
+################# General Correlation (choose method), Permutation Test, all genes ################
+
+# requires:
+# tables with the gene functional annotations generated by eggnog
+
+# input:
+# ExpressionTableSpecies1 - must be a data.frame
+# species1 = c('turtle','lizard','mouse')
+# DEgenesSpecies1
+# ExpressionTableSpecies2 - must be a data.frame
+# species1 = c('turtle','lizard','mouse','human','chicken')
+# DEgenesSpecies2
+# nPermutations = 1000
+# genes.use = c('intersect','union','species1','species2')
+# corr.method = c('spearman', 'pearson') etc.
 
-#returns:
-#list with
-#corr.coeff
-#shuffled_correlation_score_means
-#p.value
-#negative_log10_p.value
-#DEgenes_intersect
-#DEgenes_in_analysis
-#nDEgenes_intersect
-#nDEgenes_in_analysis
-#nDEgenes_Sp1
-#nDEgenes_Sp2
+# returns:
+# list with
+# corr.coeff
+# shuffled_correlation_score_means
+# p.value
+# negative_log10_p.value
+# DEgenes_intersect
+# DEgenes_in_analysis
+# nDEgenes_intersect
+# nDEgenes_in_analysis
+# nDEgenes_Sp1
+# nDEgenes_Sp2
 
 SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){
 
@@ -442,7 +163,15 @@ SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mo
 }
 
 
-# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563	(transcriptional activator activity) and GO:0016564	(transcriptional repressor activity). See http://www.transcriptionfactor.org/index.cgi?About
+############ General Correlation (choose method), Permutation Test, transcription factors #############
+
+# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563	(transcriptional activator activity) and GO:0016564	(transcriptional repressor activity). 
+
+# requires:
+# tables with the gene functional annotations generated by eggnog
+# list of transcription factors
+
+# input and output as above
 
 SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){
 
@@ -585,8 +314,9 @@ SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','
 }
 
 
-# NoisyGenes function
-# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy and can be removed before running comparative analysis
+################# NoisyGenes function ################
+
+# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy, and can be removed before running comparative analysis
 # clusters.use = character vectors of the clusters ids to include in the analysis
 # Example for comparative analysis:
 # object_noisy_genes <- NoisyGenes(object, min.pct=0.2, clusters.use=as.character(levels(object@ident)))

diff --git a/Comparative_analysis/CorrComparePlot.R b/Comparative_analysis/CorrComparePlot.R
@@ -1,5 +1,25 @@
+# this function is a wrapper that computes and plots correlation matrices of cell types 
+# it takes as input average cell type matrices and differentially expressed genes
+# it calculates correlation significance with a permutation test (1000 permutations)
+# requires the functions in the file Comparative_functions.R: 
+# requires the package corrplot
 
+# input:
+# ExpressionTableSpecies1 = data.frame with the average expression data by cell type in Species 1 (e.g. output of the AverageExpression function in Seurat)
+# DEgenesSpecies1 = vector with the names of the genes differentially expressed among the cell types of the Species 1 set
+# ExpressionTableSpecies2 - data.frame with the average expression data by cell type in Species 2
+# DEgenesSpecies2 = vector with the names of the genes differentially expressed among the cell types of the Species 2 set
+# species1 = c('turtle','lizard','mouse')
+# species2 = c('turtle','lizard','mouse','human','chicken')
+# filename = to save plots (in pdf)
+# Height and Width are plot parameters
 
+# returns a plot and a list:
+# - plot with correlation matrices calculated by taking (1) the intersection of DE genes in the two species, (2) the union of DE genes, (3) only the DE genes in Species 1, (4) only the DE genes in Species 2. 
+# - plot with correlation matrices calculated by taking intersect, union, Species1 or Species2 differentially expressed transcription factors
+#
+# list with all the correlation matrices (intersect, intersect.TF, union, union.TF, species1, species1.TF, species2, species2.TF)
+
 
 CorrComparePlot <- function(ExpressionTableSpecies1, DEgenesSpecies1, ExpressionTableSpecies2, DEgenesSpecies2, Species1, Species2, filename, Height=12, Width=10) {