From f03cddfadff09c06fee06d67851bb0f46ed0aa97 Mon Sep 17 00:00:00 2001
From: MPIBR-toschesm <maria.tosches@brain.mpg.de>
Date: Wed, 2 May 2018 22:14:52 -0400
Subject: [PATCH] final updates

---
 .../ComparativeAnalysis_CleanFunctions.R      |  729 ------------
 ..._Permutation.R => Comparative_functions.R} |  348 +-----
 Comparative_analysis/CorrComparePlot.R        |   20 +
 Comparative_analysis/DEgenes_limma_clean.R    |   30 -
 Comparative_analysis/TY_comparative.R         | 1035 -----------------
 .../eggnog_mastertable_generation.R           |  125 --
 Comparative_analysis/someplots.R              |   41 -
 Data_plots/TY_Plotting.R                      |  150 ---
 Gene_network_analysis/MeanVarData.R           |  101 ++
 Gene_network_analysis/MeanVarData_function.R  |   38 -
 Gene_network_analysis/comparative_wgcna.R     |  598 ++++++++++
 Gene_network_analysis/wgcna_turtle.R          |  205 ++++
 README.md                                     |    8 +-
 13 files changed, 965 insertions(+), 2463 deletions(-)
 delete mode 100644 Comparative_analysis/ComparativeAnalysis_CleanFunctions.R
 rename Comparative_analysis/{Comparative_Permutation.R => Comparative_functions.R} (53%)
 delete mode 100644 Comparative_analysis/DEgenes_limma_clean.R
 delete mode 100644 Comparative_analysis/TY_comparative.R
 delete mode 100644 Comparative_analysis/eggnog_mastertable_generation.R
 delete mode 100644 Comparative_analysis/someplots.R
 delete mode 100644 Data_plots/TY_Plotting.R
 create mode 100644 Gene_network_analysis/MeanVarData.R
 delete mode 100644 Gene_network_analysis/MeanVarData_function.R
 create mode 100644 Gene_network_analysis/comparative_wgcna.R
 create mode 100644 Gene_network_analysis/wgcna_turtle.R

diff --git a/Comparative_analysis/ComparativeAnalysis_CleanFunctions.R b/Comparative_analysis/ComparativeAnalysis_CleanFunctions.R
deleted file mode 100644
index 03d1ded..0000000
--- a/Comparative_analysis/ComparativeAnalysis_CleanFunctions.R
+++ /dev/null
@@ -1,729 +0,0 @@
-#make sure these files are in your working directory:
-#chrysemys_eggnog_pruned.txt
-#pogona_eggnog_pruned.txt
-#gallus_eggnog_pruned.txt
-#homo_eggnog_pruned.txt
-#mus_eggnog_pruned.txt
-
-
-####################################METHOD1#####################
-####################################Specificity Correlation#####
-#scale to gene expression average
-
-SpecificityCorrelation = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
-     
-     #Step1:  Load eggnog tables     
-     if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species1=="mouse") {
-     eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
-     DEgenesSpecies1 = toupper(DEgenesSpecies1)
-     rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
-     }
-     if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="mouse") {
-          eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-          eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-          DEgenesSpecies2 = toupper(DEgenesSpecies2)
-          rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-     }
-     if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     
-     colnames(eggnog1) = eggnog1[1,]
-     eggnog1 = eggnog1[2:nrow(eggnog1),]
-     colnames(eggnog2) = eggnog2[1,]
-     eggnog2 = eggnog2[2:nrow(eggnog2),]
-     
-     #Step2: Take union of DEgenes for analysis
-     DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-     DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-     DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
-     DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-     DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-     
-     #Step3: Prune Expression Tables & Remove rows with no expression
-     Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-     Sp1 = Sp1[rowSums (Sp1)!=0,]
-     Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-     Sp2 = Sp2[rowSums (Sp2)!=0,]
-     
-     #Step4: Replace Gene names with Eggnog name
-     Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-     colnames(Sp1)[ncol(Sp1)] = species1
-     Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-     rownames(Sp1) = Sp1$eggnog
-     Sp1 = Sp1[,3:ncol(Sp1)]
-     
-     Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-     colnames(Sp2)[ncol(Sp2)] = species2
-     Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-     rownames(Sp2) = Sp2$eggnog
-     Sp2 = Sp2[,3:ncol(Sp2)]
-     
-     #Step5: Scale Expression Tables by gene average
-     avg = rowMeans(Sp1)
-     Sp1 = sweep(Sp1,1,avg,"/")
-     rm(avg)
-     avg = rowMeans(Sp2)
-     Sp2 = sweep(Sp2,1,avg,"/")
-     rm(avg)
-     
-     #Step6: Merge Expression Tables
-     geTable = merge(Sp1,Sp2, by='row.names', all=F)
-     rownames(geTable) = geTable$Row.names
-     geTable = geTable[,2:ncol(geTable)]
-     
-     #Step7:  Correlation
-     Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
-     rownames(Corr.Coeff.Table) = colnames(geTable)
-     colnames(Corr.Coeff.Table) = colnames(geTable)
-     
-     p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-     rownames(p.value.table) = colnames(geTable)
-     colnames(p.value.table) = colnames(geTable)
-     
-     a = combn(1:ncol(geTable),2)
-     
-     n4BHcorrect = ncol(geTable)*ncol(geTable)
-     
-     for (i in 1:ncol(a)){
-          cell_type1 = colnames(geTable)[a[1,i]]
-          cell_type2 = colnames(geTable)[a[2,i]]
-          b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
-          Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
-          Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
-          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-          rm(list =c('cell_type1', 'cell_type2','b'))
-     }
-     neg.log10.p = -log10(p.value.table)
-     list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
-     names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value','DEgenes_union','DEgenes_in_analysis','nDEgenes_union','nDEgenes_in_analysis',paste('nDEgenes',species1,sep='_'),paste('nDEgenes',species2,sep='_'))
-     return(list.to.return)
-     
-     }
-
-#Specificity Using Intersect of Gene Lists
-SpIntersect = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
-  
-  #Step1:  Load eggnog tables     
-  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="mouse") {
-    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
-    DEgenesSpecies1 = toupper(DEgenesSpecies1)
-    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
-  }
-  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="mouse") {
-    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-    DEgenesSpecies2 = toupper(DEgenesSpecies2)
-    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-  }
-  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  
-  colnames(eggnog1) = eggnog1[1,]
-  eggnog1 = eggnog1[2:nrow(eggnog1),]
-  colnames(eggnog2) = eggnog2[1,]
-  eggnog2 = eggnog2[2:nrow(eggnog2),]
-  
-  #Step2: Take union of DEgenes for analysis
-  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-  nDESp1 = length(DEgenesSpecies1)
-  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-  nDESp2 = length(DEgenesSpecies2)
-  DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
-  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-  
-  #Step3: Prune Expression Tables & Remove rows with no expression
-  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-  Sp1 = Sp1[rowSums (Sp1)!=0,]
-  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-  Sp2 = Sp2[rowSums (Sp2)!=0,]
-  
-  #Step4: Replace Gene names with Eggnog name
-  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-  colnames(Sp1)[ncol(Sp1)] = species1
-  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-  rownames(Sp1) = Sp1$eggnog
-  Sp1 = Sp1[,3:ncol(Sp1)]
-  
-  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-  colnames(Sp2)[ncol(Sp2)] = species2
-  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-  rownames(Sp2) = Sp2$eggnog
-  Sp2 = Sp2[,3:ncol(Sp2)]
-  
-  #Step5: Scale Expression Tables by gene average
-  avg = rowMeans(Sp1)
-  Sp1 = sweep(Sp1,1,avg,"/")
-  rm(avg)
-  avg = rowMeans(Sp2)
-  Sp2 = sweep(Sp2,1,avg,"/")
-  rm(avg)
-  
-  #Step6: Merge Expression Tables
-  geTable = merge(Sp1,Sp2, by='row.names', all=F)
-  rownames(geTable) = geTable$Row.names
-  geTable = geTable[,2:ncol(geTable)]
-  
-  #Step7:  Correlation
-  Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
-  rownames(Corr.Coeff.Table) = colnames(geTable)
-  colnames(Corr.Coeff.Table) = colnames(geTable)
-  
-  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(p.value.table) = colnames(geTable)
-  colnames(p.value.table) = colnames(geTable)
-  
-  a = combn(1:ncol(geTable),2)
-  
-  n4BHcorrect = ncol(geTable)*ncol(geTable)
-  
-  for (i in 1:ncol(a)){
-    cell_type1 = colnames(geTable)[a[1,i]]
-    cell_type2 = colnames(geTable)[a[2,i]]
-    b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
-    Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
-    Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
-    BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-    p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-    p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-    rm(list =c('cell_type1', 'cell_type2','b'))
-  }
-  neg.log10.p = -log10(p.value.table)
-  list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
-  names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis',paste('nDEgenes',species1,sep='_'),paste('nDEgenes',species2,sep='_'))
-  return(list.to.return)
-  
-}
-
-
-
-
-####################################METHOD2#####################
-####################################Rank Correlation#####
-#scale to gene expression average
-
-RankCorrelation = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
-     
-     #Step1:  Load eggnog tables     
-     if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="mouse") {
-          eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-          eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-          DEgenesSpecies2 = toupper(DEgenesSpecies2)
-          rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-     }
-     if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     
-     colnames(eggnog1) = eggnog1[1,]
-     eggnog1 = eggnog1[2:nrow(eggnog1),]
-     colnames(eggnog2) = eggnog2[1,]
-     eggnog2 = eggnog2[2:nrow(eggnog2),]
-     
-     #Step2: Take union of DEgenes for analysis
-     DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-     DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-     DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
-     DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-     DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-     
-     #Step3: Prune Expression Tables 
-     Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-     Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-
-     #Step4: Replace Gene names with Eggnog name
-     Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-     colnames(Sp1)[ncol(Sp1)] = species1
-     Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-     rownames(Sp1) = Sp1$eggnog
-     Sp1 = Sp1[,3:ncol(Sp1)]
-     
-     Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-     colnames(Sp2)[ncol(Sp2)] = species2
-     Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-     rownames(Sp2) = Sp2$eggnog
-     Sp2 = Sp2[,3:ncol(Sp2)]
-     
-     #Step5: Merge Expression Tables
-     geTable = merge(Sp1,Sp2, by='row.names', all=F)
-     rownames(geTable) = geTable$Row.names
-     geTable = geTable[,2:ncol(geTable)]
-     
-     #Step6:  Correlation
-     rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
-     rownames(rho.table) = colnames(geTable)
-     colnames(rho.table) = colnames(geTable)
-     
-     p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-     rownames(p.value.table) = colnames(geTable)
-     colnames(p.value.table) = colnames(geTable)
-     
-     a = combn(1:ncol(geTable),2)
-     
-     n4BHcorrect = ncol(geTable)*ncol(geTable)
-     
-     for (i in 1:ncol(a)){
-          cell_type1 = colnames(geTable)[a[1,i]]
-          cell_type2 = colnames(geTable)[a[2,i]]
-          b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='spearman'))
-          rho.table[a[1,i],a[2,i]] = b$estimate
-          rho.table[a[2,i],a[1,i]] = b$estimate
-          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-          rm(list =c('cell_type1', 'cell_type2','b'))
-     }
-     neg.log10.p = -log10(p.value.table)
-     list.to.return = list(rho.table,p.value.table,neg.log10.p)
-     names(list.to.return) = c('rho','p.value','neg.log10.p.value')
-     return(list.to.return)
-     
-}
-
-####################Rank Correlation All Genes######
-
-
-RankCorrelationAll = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken')){
-     
-     #Step1:  Load eggnog tables     
-     if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="mouse") {
-          eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-          eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-          rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-     }
-     if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     
-     colnames(eggnog1) = eggnog1[1,]
-     eggnog1 = eggnog1[2:nrow(eggnog1),]
-     colnames(eggnog2) = eggnog2[1,]
-     eggnog2 = eggnog2[2:nrow(eggnog2),]
-     
-     #Step2: Replace Gene names with Eggnog name
-     Sp1 = ExpressionTableSpecies1
-     Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-     colnames(Sp1)[ncol(Sp1)] = species1
-     Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-     rownames(Sp1) = Sp1$eggnog
-     Sp1 = Sp1[,3:ncol(Sp1)]
-     
-     Sp2 = ExpressionTableSpecies2
-     Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-     colnames(Sp2)[ncol(Sp2)] = species2
-     Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-     rownames(Sp2) = Sp2$eggnog
-     Sp2 = Sp2[,3:ncol(Sp2)]
-     
-     #Step3: Merge Expression Tables
-     geTable = merge(Sp1,Sp2, by='row.names', all=F)
-     rownames(geTable) = geTable$Row.names
-     geTable = geTable[,2:ncol(geTable)]
-     
-     #Step4:  Correlation
-     rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
-     rownames(rho.table) = colnames(geTable)
-     colnames(rho.table) = colnames(geTable)
-     
-     p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-     rownames(p.value.table) = colnames(geTable)
-     colnames(p.value.table) = colnames(geTable)
-     
-     a = combn(1:ncol(geTable),2)
-     
-     n4BHcorrect = ncol(geTable)*ncol(geTable)
-     
-     for (i in 1:ncol(a)){
-          cell_type1 = colnames(geTable)[a[1,i]]
-          cell_type2 = colnames(geTable)[a[2,i]]
-          b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='spearman'))
-          rho.table[a[1,i],a[2,i]] = b$estimate
-          rho.table[a[2,i],a[1,i]] = b$estimate
-          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-          rm(list =c('cell_type1', 'cell_type2','b'))
-     }
-     neg.log10.p = -log10(p.value.table)
-     list.to.return = list(rho.table,p.value.table,neg.log10.p)
-     names(list.to.return) = c('rho','p.value','neg.log10.p.value')
-     return(list.to.return)
-     
-}
-
-########################METHOD 3#######
-########################hypergeometric distribution#####
-
-
-#Step 1:  generate list of markers for cell types
-#requires FindMarkers.MAST function
-
-Marker_List = function(SeuratObject,clusters,min.pct=0.4,thresh.use=log(1.5),fdr=0.05){
-     #step1a generate pairwise DEgenes by MAST
-     DEgenes = list()
-     combinations = t(combn(as.character(clusters),2)) 
-     colnames(combinations) <- c("ident.1","ident.2")
-     pb   <- txtProgressBar(1, 100, style=3)
-     for (i in 1:nrow(combinations)){
-          a = combinations[i,1]
-          b = combinations[i,2]
-          suppressMessages(diff.genes <- FindMarkers.MAST(SeuratObject,a,b,min.pct,thresh.use))
-          #matrix from FindMarkers.MAST only contains genes that have a fdr<0.05
-          DEgenes[[paste("id1",a,"id2",b, sep="_")]] = rownames(diff.genes[diff.genes$avg_diff>0 & diff.genes$fdr<fdr,])
-          DEgenes[[paste("id1",b,"id2",a, sep="_")]] = rownames(diff.genes[diff.genes$avg_diff<0 & diff.genes$fdr<fdr,])
-          rm(list = c('diff.genes','a','b'))
-          setTxtProgressBar(pb, (i*100)/nrow(combinations))
-     }
-     #step1b generate markers based on union of markers for a celltype
-     markers = list()
-     for (i in 1:length(clusters)) {
-          sub.list = DEgenes[grep(paste('id1_',clusters[i], "_",sep=""),names(DEgenes))]
-          gene.union = vector()
-          for (j in 1:length(sub.list)){
-               gene.union = union(gene.union,sub.list[[j]])
-          }
-          markers[[as.character(clusters)[i]]] = gene.union
-          rm(list=c("sub.list","gene.union"))
-     }
-     return(list(DEgenes,markers))
-}
-
-
-#Step 2:  replace markers with eggnog names
-#         and generate table with number of markers and number of eggnog markers per cell type
-
-Eggnog_list_replace = function(genelist_list, species = c('turtle','lizard','mouse','human','chicken')){
-     #Step1:  Load eggnog tables     
-     if (species=="turtle") eggnog = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species=="lizard") eggnog = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species=="mouse") {
-          eggnog = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-          eggnog[2:nrow(eggnog),1] = toupper(eggnog[2:nrow(eggnog),1])
-          for (i in 1:length(genelist_list)){
-               genelist_list[[i]] = toupper(genelist_list[[i]])
-          }
-     }
-     if (species=="human") eggnog = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     if (species=="chicken") eggnog = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-     
-     colnames(eggnog) = eggnog[1,]
-     eggnog = eggnog[2:nrow(eggnog),]
-
-     #Step2 replace names
-     eggnog_list = list()
-     for (i in 1:length(genelist_list)) {
-          eggnog_list[[i]] = eggnog[eggnog[,1] %in% genelist_list[[i]],2]
-          names(eggnog_list)[i] = names(genelist_list)[i]
-     }
-     
-     #Step3 generate summary table to append to eggnog_list
-     summary.matrix = cbind(sapply(genelist_list,length),sapply(eggnog_list,length))
-     colnames(summary.matrix) = c('nMarkers','nMarkers_eggnog')
-     rownames(summary.matrix) = names(genelist_list)
-     eggnog_list[['summary']] = summary.matrix
-     return(eggnog_list)
-}
-
-#Step 3:  p values for overlap
-
-hygecdf_matrix = function(eggnog.list1, sp1='tur', eggnog.list2, sp2='liz'){
-     names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
-     names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
-     summaries = list(eggnog.list1[[paste(sp1,'summary', sep="_")]], eggnog.list2[[paste(sp2,'summary', sep="_")]])
-     eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
-     eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
-     gene.list = c(eggnog.list1, eggnog.list2)
-     
-     hyg = matrix(nrow=length(gene.list),ncol=length(gene.list))
-     colnames(hyg) = c(names(gene.list))
-     rownames(hyg) = c(names(gene.list))
-     
-     JSC = matrix(nrow=length(gene.list),ncol=length(gene.list))
-     colnames(JSC) = c(names(gene.list))
-     rownames(JSC) = c(names(gene.list))
-     
-     nGenes = vector()
-     for(i in 1:length(gene.list)){
-          nGenes = union(nGenes,gene.list[[i]])
-     }
-     nGenes = length(nGenes)
-     #nGenes = 10089 #number of turtle v lizard eggnog genes
-     for (i in 1:length(gene.list)){
-          m = length(gene.list[[i]])
-          n = nGenes - m
-          n4BHcorrect = nrow(hyg)*ncol(hyg)
-          for (j in 1:length(gene.list)){
-               k = length(gene.list[[i]])
-               q = length(intersect(gene.list[[i]],gene.list[[j]]))
-               p.value = phyper(q,m,n,k, lower.tail=F)
-               #BH correct p values in hyg
-               p.value = p.adjust(p.value, method='BH',n=n4BHcorrect)
-               hyg[i,j] = p.value
-               u = length(union(gene.list[[i]],gene.list[[j]]))
-               JSC[i,j] = q/u
-               rm(list=c('k','q','u'))
-          }
-          rm(m)
-     }
-     negLog10 = -log10(hyg)
-     return.list = list(hyg,negLog10,JSC,summaries)
-     names(return.list) = c('hypergeometric_distribution','negative_log10','JSC','summaries')
-     return(return.list)
-}
-
-#Just Calculate Overlap
-overlap = function(eggnog.list1, sp1='sp1', eggnog.list2, sp2='sp2'){
-     names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
-     names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
-     eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
-     eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
-     gene.list = c(eggnog.list1, eggnog.list2)
-     
-     overlap.matrix = matrix(nrow=length(gene.list),ncol=length(gene.list))
-     colnames(overlap.matrix) = c(names(gene.list))
-     rownames(overlap.matrix) = c(names(gene.list))
-     
-     a = combn(1:ncol(overlap.matrix),2)
-     for (i in 1:ncol(a)){
-          cell1 = a[1,i]
-          cell2 = a[2,i]
-          nGenes = length(intersect(gene.list[[cell1]],gene.list[[cell2]]))
-          overlap.matrix[cell1,cell2] = nGenes
-          overlap.matrix[cell2,cell1] = nGenes
-     }
-     
-     for(i in 1:nrow(overlap.matrix)){
-          overlap.matrix[i,i] = length(gene.list[[i]])
-     }
-     return(overlap.matrix)
-}
-
-#Fake Binarization
-
-fake.binary = function(eggnog.list1, sp1='sp1', eggnog.list2, sp2='sp2'){
-     names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
-     names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
-     eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
-     eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
-     gene.list = c(eggnog.list1, eggnog.list2)
-     
-     #step1: create list of markers for analysis
-     markers.sp1 = vector()
-     for(i in 1:length(eggnog.list1)){
-          markers.sp1 = union(markers.sp1,eggnog.list1[[i]])
-     }
-     markers.sp2 = vector()
-     for(i in 1:length(eggnog.list2)){
-          markers.sp2 = union(markers.sp2,eggnog.list2[[i]])
-     }
-     
-     #markers = union(markers.sp1,markers.sp2)
-     markers = intersect(markers.sp1,markers.sp2)
-     
-     #step2:  create matrix of presence/absence of markers
-     
-     expression.matrix = data.frame(row.names=markers)
-     for (i in 1:length(gene.list)){
-          expression.matrix[rownames(expression.matrix) %in% gene.list[[i]],names(gene.list)[i]] = 1
-          expression.matrix[!(rownames(expression.matrix) %in% gene.list[[i]]),names(gene.list)[i]] = 0
-     }
-     
-     #step3:  correlation (pearson)
-     Corr.Coeff.Table = diag(x = 1, length(gene.list), length(gene.list))
-     rownames(Corr.Coeff.Table) = names(gene.list)
-     colnames(Corr.Coeff.Table) = names(gene.list)
-     
-     p.value.table = matrix(ncol=length(gene.list), nrow = length(gene.list))
-     rownames(p.value.table) = names(gene.list)
-     colnames(p.value.table) = names(gene.list)
-     
-     a = combn(1:length(gene.list),2)
-     
-     n4BHcorrect = length(gene.list)*length(gene.list)
-     
-     for (i in 1:ncol(a)){
-          cell_type1 = names(gene.list)[a[1,i]]
-          cell_type2 = names(gene.list)[a[2,i]]
-          b = suppressWarnings(cor.test(expression.matrix[,cell_type1], expression.matrix[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
-          Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
-          Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
-          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-          rm(list =c('cell_type1', 'cell_type2','b'))
-     }
-     neg.log10.p = -log10(p.value.table)
-     
-     #return
-     list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,expression.matrix)
-     names(list.to.return) = c('correlation_coeff','p_values','neg_log10_p','expression_table')
-     return(list.to.return)
-}
-
-##########################Old Scaling Methods###########
-#scaling.method options
-#'minmax' sets min at 0 and max at 1
-#'max' expression as fraction of max
-#'yao' expression according to Yao et al Cell Stem Cell 2017
-
-ScaledCorrelation = function(scaling.method = c('minmax','max','yao'),ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){
-  
-  #Step1:  Load eggnog tables     
-  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="mouse") {
-    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
-    DEgenesSpecies1 = toupper(DEgenesSpecies1)
-    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
-  }
-  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="mouse") {
-    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-    DEgenesSpecies2 = toupper(DEgenesSpecies2)
-    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-  }
-  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  
-  colnames(eggnog1) = eggnog1[1,]
-  eggnog1 = eggnog1[2:nrow(eggnog1),]
-  colnames(eggnog2) = eggnog2[1,]
-  eggnog2 = eggnog2[2:nrow(eggnog2),]
-  
-  #Step2: Take union of DEgenes for analysis
-  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-  DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
-  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-  
-  #Step3: Prune Expression Tables & Remove rows with no expression
-  ExpressionTableSpecies1 = ExpressionTableSpecies1[rowSums(ExpressionTableSpecies1)!=0,]
-  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-  ExpressionTableSpecies2 = ExpressionTableSpecies2[rowSums(ExpressionTableSpecies2)!=0,]
-  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-  
-  #Step4: Replace Gene names with Eggnog name
-  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-  colnames(Sp1)[ncol(Sp1)] = species1
-  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-  rownames(Sp1) = Sp1$eggnog
-  Sp1 = Sp1[,3:ncol(Sp1)]
-  
-  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-  colnames(Sp2)[ncol(Sp2)] = species2
-  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-  rownames(Sp2) = Sp2$eggnog
-  Sp2 = Sp2[,3:ncol(Sp2)]
-  
-  #Step5: Scale Expression Tables by scaling.method
-  if(scaling.method=='minmax'){
-    gene_min = apply(Sp1, 1, min)
-    Sp1 = sweep(Sp1,1,gene_min,"-")
-    gene_max = apply(Sp1, 1, max)
-    Sp1 = sweep(Sp1,1,gene_max,"/")
-    rm(list=c('gene_min','gene_max'))
-    gene_min = apply(Sp2, 1, min)
-    Sp2 = sweep(Sp2,1,gene_min,"-")
-    gene_max = apply(Sp2, 1, max)
-    Sp2 = sweep(Sp2,1,gene_max,"/")
-    rm(list=c('gene_min','gene_max'))
-  }
-  
-  if(scaling.method=='max'){
-    gene_max = apply(Sp1, 1, max)
-    Sp1 = sweep(Sp1,1,gene_max,"/")
-    rm(gene_max)
-    gene_max = apply(Sp2, 1, max)
-    Sp2 = sweep(Sp2,1,gene_max,"/")
-    rm(gene_max)
-    
-  }
-  
-  if(scaling.method=='yao'){
-      gene_max = apply(Sp1, 1, max)
-      gene_min = apply(Sp1, 1, min)
-      #Determine median for entire dataset, not just pruned data!!!
-      med = apply(ExpressionTableSpecies1,1,max)
-      med = median(med)
-      for (i in 1:nrow(Sp1)) {
-        if (gene_max[i]>med){
-          Sp1[i,] = sapply(Sp1[i,],function(x)(x-gene_min[i])/gene_max[i])
-        } 
-        else {
-          Sp1[i,] = sapply(Sp1[i,],function(x)(x-gene_min[i])/med)
-        }
-      }
-      rm(list=c('gene_max','gene_min','med'))
-      gene_max = apply(Sp2, 1, max)
-      gene_min = apply(Sp2, 1, min)
-      med = apply(ExpressionTableSpecies2,1,max)
-      med = median(med)
-      for (i in 1:nrow(Sp2)) {
-        if (gene_max[i]>med){
-          Sp2[i,] = sapply(Sp2[i,],function(x)(x-gene_min[i])/gene_max[i])
-        } 
-        else {
-          Sp2[i,] = sapply(Sp2[i,],function(x)(x-gene_min[i])/med)
-        }
-      }
-      rm(list=c('gene_max','gene_min','med'))
-      
-  }
-  
-  #Step6: Merge Expression Tables
-  geTable = merge(Sp1,Sp2, by='row.names', all=F)
-  rownames(geTable) = geTable$Row.names
-  geTable = geTable[,2:ncol(geTable)]
-  
-  #Step7:  Correlation
-  Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
-  rownames(Corr.Coeff.Table) = colnames(geTable)
-  colnames(Corr.Coeff.Table) = colnames(geTable)
-  
-  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(p.value.table) = colnames(geTable)
-  colnames(p.value.table) = colnames(geTable)
-  
-  a = combn(1:ncol(geTable),2)
-  
-  n4BHcorrect = ncol(geTable)*ncol(geTable)
-  
-  for (i in 1:ncol(a)){
-    cell_type1 = colnames(geTable)[a[1,i]]
-    cell_type2 = colnames(geTable)[a[2,i]]
-    b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
-    Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
-    Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
-    BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-    p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-    p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-    rm(list =c('cell_type1', 'cell_type2','b'))
-  }
-  neg.log10.p = -log10(p.value.table)
-  list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p)
-  names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value')
-  return(list.to.return)
-  
-}
diff --git a/Comparative_analysis/Comparative_Permutation.R b/Comparative_analysis/Comparative_functions.R
similarity index 53%
rename from Comparative_analysis/Comparative_Permutation.R
rename to Comparative_analysis/Comparative_functions.R
index 7627452..c1caf5e 100644
--- a/Comparative_analysis/Comparative_Permutation.R
+++ b/Comparative_analysis/Comparative_functions.R
@@ -1,311 +1,32 @@
-#######################Specificity Correlation, Permutation Test################
-##########################Intersect of DEgene lists#############################
 
-#input:
-#ExpressionTableSpecies1 - must be a data.frame
-#species1 = c('turtle','lizard','mouse')
-#DEgenesSpecies1
-#ExpressionTableSpecies2 - must be a data.frame
-#species1 = c('turtle','lizard','mouse','human','chicken')
-#DEgenesSpecies2
-#nPermutations = 1000
-
-#returns:
-#list with
-#corr.coeff
-#shuffled_correlation_score_means
-#p.value
-#negative_log10_p.value
-#DEgenes_intersect
-#DEgenes_in_analysis
-#nDEgenes_intersect
-#nDEgenes_in_analysis
-#nDEgenes_Sp1
-#nDEgenes_Sp2
-
-SpIntPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000){
-  
-  #Step1:  Load eggnog tables     
-  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="mouse") {
-    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
-    DEgenesSpecies1 = toupper(DEgenesSpecies1)
-    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
-  }
-  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="mouse") {
-    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-    DEgenesSpecies2 = toupper(DEgenesSpecies2)
-    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-  }
-  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  
-  colnames(eggnog1) = eggnog1[1,]
-  eggnog1 = eggnog1[2:nrow(eggnog1),]
-  colnames(eggnog2) = eggnog2[1,]
-  eggnog2 = eggnog2[2:nrow(eggnog2),]
-  
-  #Step2: Take intersect of DEgenes for analysis
-  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-  nDESp1 = length(DEgenesSpecies1)
-  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-  nDESp2 = length(DEgenesSpecies2)
-  DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
-  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-  
-  #Step3: Prune Expression Tables & Remove rows with no expression
-  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-  Sp1 = Sp1[rowSums (Sp1)!=0,]
-  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-  Sp2 = Sp2[rowSums (Sp2)!=0,]
-  
-  #Step4: Replace Gene names with Eggnog name
-  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-  colnames(Sp1)[ncol(Sp1)] = species1
-  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-  rownames(Sp1) = Sp1$eggnog
-  Sp1 = Sp1[,3:ncol(Sp1)]
-  
-  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-  colnames(Sp2)[ncol(Sp2)] = species2
-  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-  rownames(Sp2) = Sp2$eggnog
-  Sp2 = Sp2[,3:ncol(Sp2)]
-  
-  #Step5: Scale Expression Tables by gene average
-  avg = rowMeans(Sp1)
-  Sp1 = sweep(Sp1,1,avg,"/")
-  rm(avg)
-  avg = rowMeans(Sp2)
-  Sp2 = sweep(Sp2,1,avg,"/")
-  rm(avg)
-  
-  #Step6: Merge Expression Tables
-  geTable = merge(Sp1,Sp2, by='row.names', all=F)
-  rownames(geTable) = geTable$Row.names
-  geTable = geTable[,2:ncol(geTable)]
-  
-  #Step7:  Correlation
-  #7a:  Pearson correlation
-  Corr.Coeff.Table = cor(geTable,method='pearson')
-  
-  #7b:  Shuffle data
-  shuffled.cor.list = list()
-  pb   <- txtProgressBar(1, 100, style=3)
-
-  for (i in 1:nPermutations){
-    shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
-    shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
-    shuffled = cbind(t(shuffled),t(shuffled2))
-    shuffled.cor = cor(shuffled,method='pearson')
-    shuffled.cor.list[[i]] = shuffled.cor
-    rm(list=c('shuffled','shuffled2','shuffled.cor'))
-    if ((i %% 100) ==0){
-      setTxtProgressBar(pb, (i*100)/nPermutations)
-    }
-  }
-  
-  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(p.value.table) = colnames(geTable)
-  colnames(p.value.table) = colnames(geTable)
-  
-  shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(shuffled.mean.table) = colnames(geTable)
-  colnames(shuffled.mean.table) = colnames(geTable)
-  
-  a = combn(1:ncol(geTable),2)
-  pb2   <- txtProgressBar(1, 100, style=3)
-  for (i in 1:ncol(a)){
-    cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
-    shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
-    shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
-    p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
-    p.value.table[a[1,i],a[2,i]] = p.value
-    p.value.table[a[2,i],a[1,i]] = p.value
-    rm(list=c('cor.scores','p.value'))
-    setTxtProgressBar(pb2, (i*100)/ncol(a))
-  }
-
-  neg.log10.p = -log10(p.value.table)
-
-  list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
-  names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2')
-  return(list.to.return)
-
-}
-
-##########################Union of DEgene lists#############################
-
-#input:
-#ExpressionTableSpecies1 - must be a data.frame
-#species1 = c('turtle','lizard','mouse')
-#DEgenesSpecies1
-#ExpressionTableSpecies2 - must be a data.frame
-#species1 = c('turtle','lizard','mouse','human','chicken')
-#DEgenesSpecies2
-#nPermutations = 1000
-
-#returns:
-#list with
-#corr.coeff
-#shuffled_correlation_score_means
-#p.value
-#negative_log10_p.value
-#DEgenes_intersect
-#DEgenes_in_analysis
-#nDEgenes_intersect
-#nDEgenes_in_analysis
-#nDEgenes_Sp1
-#nDEgenes_Sp2
-
-SpUniPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000){
-  
-  #Step1:  Load eggnog tables     
-  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species1=="mouse") {
-    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
-    DEgenesSpecies1 = toupper(DEgenesSpecies1)
-    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
-  }
-  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="mouse") {
-    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
-    DEgenesSpecies2 = toupper(DEgenesSpecies2)
-    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
-  }
-  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
-  
-  colnames(eggnog1) = eggnog1[1,]
-  eggnog1 = eggnog1[2:nrow(eggnog1),]
-  colnames(eggnog2) = eggnog2[1,]
-  eggnog2 = eggnog2[2:nrow(eggnog2),]
-  
-  #Step2: Take union of DEgenes for analysis
-  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
-  nDESp1 = length(DEgenesSpecies1)
-  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
-  nDESp2 = length(DEgenesSpecies2)
-  DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
-  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
-  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]
-  
-  #Step3: Prune Expression Tables & Remove rows with no expression
-  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
-  Sp1 = Sp1[rowSums (Sp1)!=0,]
-  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
-  Sp2 = Sp2[rowSums (Sp2)!=0,]
-  
-  #Step4: Replace Gene names with Eggnog name
-  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
-  colnames(Sp1)[ncol(Sp1)] = species1
-  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
-  rownames(Sp1) = Sp1$eggnog
-  Sp1 = Sp1[,3:ncol(Sp1)]
-  
-  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
-  colnames(Sp2)[ncol(Sp2)] = species2
-  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
-  rownames(Sp2) = Sp2$eggnog
-  Sp2 = Sp2[,3:ncol(Sp2)]
-  
-  #Step5: Scale Expression Tables by gene average
-  avg = rowMeans(Sp1)
-  Sp1 = sweep(Sp1,1,avg,"/")
-  rm(avg)
-  avg = rowMeans(Sp2)
-  Sp2 = sweep(Sp2,1,avg,"/")
-  rm(avg)
-  
-  #Step6: Merge Expression Tables
-  geTable = merge(Sp1,Sp2, by='row.names', all=F)
-  rownames(geTable) = geTable$Row.names
-  geTable = geTable[,2:ncol(geTable)]
-  
-  #Step7:  Correlation
-  #7a:  Pearson correlation
-  Corr.Coeff.Table = cor(geTable,method='pearson')
-  
-  #7b:  Shuffle data
-  shuffled.cor.list = list()
-  pb   <- txtProgressBar(1, 100, style=3)
-
-  for (i in 1:nPermutations){
-    shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
-    shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
-    shuffled = cbind(t(shuffled),t(shuffled2))
-    shuffled.cor = cor(shuffled,method='pearson')
-    shuffled.cor.list[[i]] = shuffled.cor
-    rm(list=c('shuffled','shuffled2','shuffled.cor'))
-    if ((i %% 100) ==0){
-      setTxtProgressBar(pb, (i*100)/nPermutations)
-    }
-  }
-  
-  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(p.value.table) = colnames(geTable)
-  colnames(p.value.table) = colnames(geTable)
-  
-  shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(shuffled.mean.table) = colnames(geTable)
-  colnames(shuffled.mean.table) = colnames(geTable)
-  
-  a = combn(1:ncol(geTable),2)
-  pb2   <- txtProgressBar(1, 100, style=3)
-  for (i in 1:ncol(a)){
-    cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
-    shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
-    shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
-    p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
-    p.value.table[a[1,i],a[2,i]] = p.value
-    p.value.table[a[2,i],a[1,i]] = p.value
-    rm(list=c('cor.scores','p.value'))
-    setTxtProgressBar(pb2, (i*100)/ncol(a))
-  }
-
-  neg.log10.p = -log10(p.value.table)
-
-  list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
-  names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2')
-  return(list.to.return)
-
-}
-
-#######################General Correlation (choose method), Permutation Test################
-
-#input:
-#ExpressionTableSpecies1 - must be a data.frame
-#species1 = c('turtle','lizard','mouse')
-#DEgenesSpecies1
-#ExpressionTableSpecies2 - must be a data.frame
-#species1 = c('turtle','lizard','mouse','human','chicken')
-#DEgenesSpecies2
-#nPermutations = 1000
-#genes.use = c('intersect','union','species1','species2')
-#corr.method = c('spearman', 'pearson') etc.
+################# General Correlation (choose method), Permutation Test, all genes ################
+
+# requires:
+# tables with the gene functional annotations generated by eggnog
+
+# input:
+# ExpressionTableSpecies1 - must be a data.frame
+# species1 = c('turtle','lizard','mouse')
+# DEgenesSpecies1
+# ExpressionTableSpecies2 - must be a data.frame
+# species1 = c('turtle','lizard','mouse','human','chicken')
+# DEgenesSpecies2
+# nPermutations = 1000
+# genes.use = c('intersect','union','species1','species2')
+# corr.method = c('spearman', 'pearson') etc.
     
-#returns:
-#list with
-#corr.coeff
-#shuffled_correlation_score_means
-#p.value
-#negative_log10_p.value
-#DEgenes_intersect
-#DEgenes_in_analysis
-#nDEgenes_intersect
-#nDEgenes_in_analysis
-#nDEgenes_Sp1
-#nDEgenes_Sp2
+# returns:
+# list with
+# corr.coeff
+# shuffled_correlation_score_means
+# p.value
+# negative_log10_p.value
+# DEgenes_intersect
+# DEgenes_in_analysis
+# nDEgenes_intersect
+# nDEgenes_in_analysis
+# nDEgenes_Sp1
+# nDEgenes_Sp2
     
 SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){
   
@@ -442,7 +163,15 @@ SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mo
 }
 
 
-# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563	(transcriptional activator activity) and GO:0016564	(transcriptional repressor activity). See http://www.transcriptionfactor.org/index.cgi?About
+############ General Correlation (choose method), Permutation Test, transcription factors #############
+
+# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563	(transcriptional activator activity) and GO:0016564	(transcriptional repressor activity). 
+
+# requires:
+# tables with the gene functional annotations generated by eggnog
+# list of transcription factors
+
+# input and output as above
 
 SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){
   
@@ -585,8 +314,9 @@ SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','
 }
 
 
-# NoisyGenes function
-# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy and can be removed before running comparative analysis
+################# NoisyGenes function ################
+
+# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy, and can be removed before running comparative analysis
 # clusters.use = character vectors of the clusters ids to include in the analysis
 # Example for comparative analysis:
 # object_noisy_genes <- NoisyGenes(object, min.pct=0.2, clusters.use=as.character(levels(object@ident)))
diff --git a/Comparative_analysis/CorrComparePlot.R b/Comparative_analysis/CorrComparePlot.R
index 4d41e44..7280730 100644
--- a/Comparative_analysis/CorrComparePlot.R
+++ b/Comparative_analysis/CorrComparePlot.R
@@ -1,5 +1,25 @@
+# this function is a wrapper that computes and plots correlation matrices of cell types 
+# it takes as input average cell type matrices and differentially expressed genes
+# it calculates correlation significance with a permutation test (1000 permutations)
+# requires the functions in the file Comparative_functions.R: 
+# requires the package corrplot
 
+# input:
+# ExpressionTableSpecies1 = data.frame with the average expression data by cell type in Species 1 (e.g. output of the AverageExpression function in Seurat)
+# DEgenesSpecies1 = vector with the names of the genes differentially expressed among the cell types of the Species 1 set
+# ExpressionTableSpecies2 - data.frame with the average expression data by cell type in Species 2
+# DEgenesSpecies2 = vector with the names of the genes differentially expressed among the cell types of the Species 2 set
+# species1 = c('turtle','lizard','mouse')
+# species2 = c('turtle','lizard','mouse','human','chicken')
+# filename = to save plots (in pdf)
+# Height and Width are plot parameters
 
+# returns a plot and a list:
+# - plot with correlation matrices calculated by taking (1) the intersection of DE genes in the two species, (2) the union of DE genes, (3) only the DE genes in Species 1, (4) only the DE genes in Species 2. 
+# - plot with correlation matrices calculated by taking intersect, union, Species1 or Species2 differentially expressed transcription factors
+#
+# list with all the correlation matrices (intersect, intersect.TF, union, union.TF, species1, species1.TF, species2, species2.TF)
+ 
 
 CorrComparePlot <- function(ExpressionTableSpecies1, DEgenesSpecies1, ExpressionTableSpecies2, DEgenesSpecies2, Species1, Species2, filename, Height=12, Width=10) {
 	
diff --git a/Comparative_analysis/DEgenes_limma_clean.R b/Comparative_analysis/DEgenes_limma_clean.R
deleted file mode 100644
index 31ab0be..0000000
--- a/Comparative_analysis/DEgenes_limma_clean.R
+++ /dev/null
@@ -1,30 +0,0 @@
-# required packages
-library(useful)
-library(limma)
-library(Biobase)
-
-# create an ExpressionSet object. Inputs: (1) a normalized expression table, in log space, cleaned from low expressed and non-informative genes (2) a metadata table, with the names of the areas assayed.
-palliumSet <- ExpressionSet(assayData = as.matrix(pallium.final2), phenoData = metaPallium)
-
-# build the Design matrix. ~0 removes the intercept. Use an intercept (i.e. do not type ~0) only if the first sample should be the baseline. in our case, we do not have any baseline
-design <- model.matrix(~0 + metadata$area) 
-colnames(design) <- levels(metadata$area)
-
-
-fit <- lmFit(palliumSet, design)
-
-areas <- levels(metadata$area)
-contr <- combn(areas,2)
-contrasts <- apply(contr, 2, function(x) paste(x[1],x[2],sep="-"))
-contrast.matrix <- makeContrasts(contrasts=contrasts, levels=design)
-fit2 <- contrasts.fit(fit, contrast.matrix)
-fit2 <- eBayes(fit2)
-topTable(fit2, coef=1, adjust="BH")
-results <- decideTests(fit2)
-summary(results)
-
-# Extract the list of significantly DEgenes, with a certain p.value and a min lfc. lfc = minimum log2-fold-change required
-topHits <- topTable(fit2, number=Inf, p.value=1e-05, coef=c(1:ncol(contrast.matrix)), adjust.method="holm", lfc=2)
-topGenes <- rownames(topHits) 
-
-
diff --git a/Comparative_analysis/TY_comparative.R b/Comparative_analysis/TY_comparative.R
deleted file mode 100644
index b5abb5e..0000000
--- a/Comparative_analysis/TY_comparative.R
+++ /dev/null
@@ -1,1035 +0,0 @@
-#######################################################
-# correlation plot scaled to average expression of a gene
-lizard_avgGene_avgScaled = lizard_avgGene[,2:9]
-rownames(lizard_avgGene_avgScaled) = lizard_avgGene$eggnog
-lizard_avgGene_avgScaled = lizard_avgGene_avgScaled[rowSums (lizard_avgGene_avgScaled)!=0,]
-gene_avg = rowSums(lizard_avgGene_avgScaled)/ncol(lizard_avgGene_avgScaled)
-lizard_avgGene_avgScaled = sweep(lizard_avgGene_avgScaled,1,gene_avg,"/")
-rm(gene_avg)
-
-turtle_avgGene_avgScaled = turtle_avgGene[,2:9]
-rownames(turtle_avgGene_avgScaled) = turtle_avgGene$eggnog
-turtle_avgGene_avgScaled = turtle_avgGene_avgScaled[rowSums (turtle_avgGene_avgScaled)!=0,]
-gene_avg = rowSums(turtle_avgGene_avgScaled)/ncol(turtle_avgGene_avgScaled)
-turtle_avgGene_avgScaled = sweep(turtle_avgGene_avgScaled,1,gene_avg,"/")
-rm(gene_avg)
-
-tl_avgScaled = merge(turtle_avgGene_avgScaled, lizard_avgGene_avgScaled, by = 'row.names', all=F)
-rownames(tl_avgScaled) = tl_avgScaled[,1]
-tl_avgScaled = tl_avgScaled[,2:ncol(tl_avgScaled)]
-
-#subset & correlation matrix
-
-tl_avgScaled_union = tl_avgScaled[rownames(tl_avgScaled) %in% lt_gene_union,]
-cor.avgScaled.union.pearson = cor(tl_avgScaled_union, method = "pearson")
-tl_avgScaled_intersect = tl_avgScaled[rownames(tl_avgScaled) %in% lt_gene_intersect,]
-cor.avgScaled.intersect.pearson = cor(tl_avgScaled_intersect, method = "pearson")
-
-
-
-setwd('/storage/laur/Data_2/YamawakiTracy/dropseq/R/lizard')
-setwd("/storage/laur/Data_2/YamawakiTracy/dropseq/R/comparative")
-
-#lizard_eggnog table
-eggnog_lizard = read.table("eggnog_lizard_listDec2016.txt",header=F,stringsAsFactors=F)
-duplicates = unique(eggnog_lizard[duplicated(eggnog_lizard[,2]),2])
-eggnog_lizard = eggnog_lizard[!eggnog_lizard[,2]%in% duplicates,]
-duplicates2 = unique(eggnog_lizard[duplicated(eggnog_lizard[,1]),1])
-eggnog_lizard = eggnog_lizard[!eggnog_lizard[,1]%in% duplicates2,]
-colnames(eggnog_lizard) = c("lizard","eggnog")
-
-#turtle_eggnog table
-eggnog_turtle = read.table("eggnog_turtle_listDec2016.txt",header=F,stringsAsFactors=F)
-duplicates = unique(eggnog_turtle[duplicated(eggnog_turtle[,2]),2])
-eggnog_turtle = eggnog_turtle[!eggnog_turtle[,2]%in% duplicates,]
-duplicates2 = unique(eggnog_turtle[duplicated(eggnog_turtle[,1]),1])
-eggnog_turtle = eggnog_turtle[!eggnog_turtle[,1]%in% duplicates2,]
-colnames(eggnog_turtle) = c("turtle","eggnog")
-
-#merge eggnog_lizard and eggnog_turtle only keeping the overlap
-
-eggnog_merge = merge(eggnog_lizard,eggnog_turtle,by = 'eggnog', all=F)
-
-#10089 genes
-
-#lizard data
-load('170208_lizard_libraryNorm_ComBatLiz_500genes_SVM_filtered_filtered_.Robj')
-a = as.factor(liz@data.info$cell_type)
-names(a) = rownames(liz@data.info)
-liz@ident = a
-rm(a)
-lizard_avgGene = AverageExpression(liz)
-colnames(lizard_avgGene) = paste("lizard", colnames(lizard_avgGene), sep="_")
-lizard_avgGene$lizard = rownames(lizard_avgGene)
-lizard_avgGene = merge(eggnog_merge, lizard_avgGene, by ="lizard", all = F)
-lizard_avgGene$lizard = NULL
-lizard_avgGene$turtle = NULL
-
-#turtle data
-load('170210_turtle_filt_combat_.Robj')
-a = as.factor(turtle@data.info$cell_type)
-names(a) = rownames(turtle@data.info)
-turtle@ident = a
-rm(a)
-turtle_avgGene = AverageExpression(turtle)
-colnames(turtle_avgGene) = paste("turtle", colnames(turtle_avgGene), sep="_")
-turtle_avgGene$turtle = rownames(turtle_avgGene)
-turtle_avgGene = merge(eggnog_merge, turtle_avgGene, by ="turtle", all = F)
-turtle_avgGene$lizard = NULL
-turtle_avgGene$turtle = NULL
-
-#merge lizard and turtle data
-
-tl = merge(turtle_avgGene,lizard_avgGene, by='eggnog', all=F)
-rownames(tl) = tl$eggnog
-tl$eggnog = NULL
-
-#remove genes that have no expression in cell-types being compared
-#returns Benjamini-Hochberg corrected p values
-no_zeros_correlation = function(geTable){
-     rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
-     rownames(rho.table) = colnames(geTable)
-     colnames(rho.table) = colnames(geTable)
-     
-     p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-     rownames(p.value.table) = colnames(geTable)
-     colnames(p.value.table) = colnames(geTable)
-
-     a = combn(1:ncol(geTable),2)
-     
-     n4BHcorrect = ncol(geTable)*ncol(geTable)
-     
-     for (i in 1:ncol(a)){
-          cell_type1 = colnames(geTable)[a[1,i]]
-          cell_type2 = colnames(geTable)[a[2,i]]
-          data.to.use = geTable[,c(cell_type1,cell_type2)]
-          #remove rows with only 0's
-          data.to.use = data.to.use[rowSums(data.to.use)!=0,] 
-          b = suppressWarnings(cor.test(data.to.use[,1], data.to.use[,2], adjust='none',alternative = "two.sided", method='spearman'))
-          rho.table[a[1,i],a[2,i]] = b$estimate
-          rho.table[a[2,i],a[1,i]] = b$estimate
-          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-          rm(list =c('data.to.use','cell_type1', 'cell_type2','b'))
-     }
-     neg.log10.p = -log10(p.value.table)
-     list.to.return = list(rho.table,p.value.table,neg.log10.p)
-     names(list.to.return) = c('rho','p.value','neg.log10.p.value')
-     return(list.to.return)
-}
-
-PlotCorrelation = function(cor.matrix,plot.title){
-col1 <- colorRampPalette(c("#7F0000","red", "#FF7F00","#007FFF", "blue","#00007F"))
-pdf(paste(plot.title,'.pdf',sep=''))
-#plot correlation matrix
-corrplot.mixed(cor.matrix,title=plot.title, lower='number', upper='square',tl.pos='lt',number.cex=0.4,col=col1(100))  
-dev.off()
-}
-#plot -log10 pvalues
-test = cell_type_correlation(tl,0.99)
-melted = melt(test[[3]])
-pdf('Correlation_Spearman_99Percentile_pvalues.pdf')
-ggplot(melted, aes(Var1, Var2)) + geom_tile(aes(fill = value)) + geom_text(aes(label = round(value, 1)),size=2,angle=90) + scale_fill_gradient(low = "white", high = "red")
-dev.off()
-
-#correlation only with genes that have expression in both cell-types being compared
-#returns Benjamini-Hochberg corrected p values
-
-cell_type_correlation = function(geTable,percentile){
-  rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
-  rownames(rho.table) = colnames(geTable)
-  colnames(rho.table) = colnames(geTable)
-  
-  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
-  rownames(p.value.table) = colnames(geTable)
-  colnames(p.value.table) = colnames(geTable)
-  
-  a = combn(1:ncol(geTable),2)
-  
-  n4BHcorrect = ncol(geTable)*ncol(geTable)
-  
-  for (i in 1:ncol(a)){
-    cell_type1 = colnames(geTable)[a[1,i]]
-    cell_type2 = colnames(geTable)[a[2,i]]
-    data.to.use = geTable[,c(cell_type1,cell_type2)]
-    #remove rows where both values below percentile
-    cutoff1 = quantile(data.to.use[,1],percentile)
-    cutoff2 = quantile(data.to.use[,2],percentile) 
-    data.to.use = data.to.use[data.to.use[,1]>cutoff1 | data.to.use[,2]>cutoff2,] 
-    b = suppressWarnings(cor.test(data.to.use[,1], data.to.use[,2], adjust='none',alternative = "two.sided", method='spearman'))
-    rho.table[a[1,i],a[2,i]] = b$estimate
-    rho.table[a[2,i],a[1,i]] = b$estimate
-    BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
-    p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
-    p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
-    rm(list =c('data.to.use','cell_type1', 'cell_type2','b','cutoff1','cutoff2'))
-  }
-  neg.log10.p = -log10(p.value.table)
-  list.to.return = list(rho.table,p.value.table,neg.log10.p)
-  names(list.to.return) = c('rho','p.value','neg.log10.p.value')
-  return(list.to.return)
-}
-
-#randomized data
-random.data = as.matrix(apply(tl,2,sample))
-
-#Correlation using Kendall rank correlation
-
-cor.kendall = cor(tl, method = "kendall")
-cor.spearman = cor(tl, method = "spearman")
-
-cor.kendall = cor(mtl, method = "kendall")
-cor.spearman = cor(mtl, method = "spearman")
-
-#Linnarson Data
-load('linnarson_counts_librarynormalized.Robj')
-load('linnarson_metadata.Robj')
-
-cell_types = as.vector(unique(linnarson_metadata$level1class))
-mouse_avgGene = data.frame(matrix(NA, nrow = nrow(linnarson_counts), ncol = length(cell_types)))
-rownames(mouse_avgGene) = rownames(linnarson_counts)
-for (i in 1:length(cell_types)) {
-     avgGene = linnarson_counts[,linnarson_metadata$level1class == cell_types[i]]
-     avgGene = apply(avgGene,1,mean)
-     avgGene = log(avgGene+1)
-     mouse_avgGene[,i] = avgGene
-     colnames(mouse_avgGene)[i] = paste("mouse", cell_types[i], sep="_")
-}
-
-#saved in mouse_celltypes_expression.txt
-
-#merge with lizard and turtle data
-
-mtl = merge(tl,mouse_avgGene, by='row.names', all=F)
-rownames(mtl) = mtl[,1]
-mtl = mtl[,2:ncol(mtl)]
-
-
-#plots
-library(corrplot)
-pdf('a.pdf')
-corrplot(cor.matrix,method='square')
-dev.off()
-
-# find all pairwise differentially expressed (DE) genes 
-
-good.clusters <- levels(turtle@ident)  # or simply the list of cluster IDs to compare
-combinations <- t(combn(good.clusters,2)) # find unique pairwise combinations of cluster IDs
-DEgenes <- vector()  
-# Unfortunately it gets stuck at multiple points when test.use="negbinom", that's why test.use="bimod"
-for (i in 1:nrow(combinations)){
-     diff.genes <-  FindMarkers(turtle, ident.1= combinations[i,1], ident.2=combinations[i,2], test.use="bimod", thresh.use=0.4, min.pct=0.1)
-     DEgenes = c(DEgenes,rownames(diff.genes))
-     DEgenes = unique(DEgenes)
-     rm(diff.genes)
-}
-
-liz.clusters <- levels(liz@ident)  
-liz.clusters = liz.clusters[!liz.clusters %in% c("i_pericytes")]
-liz.combinations <- t(combn(liz.clusters,2)) # find unique pairwise combinations of cluster IDs
-Lizard_DEgenes <- vector()  
-# Unfortunately it gets stuck at multiple points when test.use="negbinom", that's why test.use="bimod"
-for (i in 1:nrow(liz.combinations)){
-     diff.genes <-  FindMarkers(liz, ident.1= liz.combinations[i,1], ident.2=liz.combinations[i,2], test.use="bimod", thresh.use=0.4, min.pct=0.1)
-     Lizard_DEgenes = c(Lizard_DEgenes,rownames(diff.genes))
-     Lizard_DEgenes = unique(Lizard_DEgenes)
-     rm(diff.genes)
-}
-
-#rename DEgene lists with eggnog gene name
-
-liz_genes = eggnog_merge[eggnog_merge[,2] %in% Lizard_DEgenes,1] 
-tur_genes = eggnog_merge[eggnog_merge[,3] %in% DEgenes,1] 
-lt_gene_union = union(liz_genes,tur_genes)
-lt_gene_intersect = intersect(liz_genes,tur_genes)
-
-#correlation with DEgenes
-
-tl_union = tl[rownames(tl) %in% lt_gene_union,] 
-cor.union.kendall = cor(tl_union, method = "kendall")
-cor.union.pearson = cor(tl_union, method = "pearson")
-cor.union.spearman = cor(tl_union, method = "spearman")
-tl_intersect = tl[rownames(tl) %in% lt_gene_intersect,] 
-cor.intersect.kendall = cor(tl_intersect, method = "kendall")
-cor.intersect.pearson = cor(tl_intersect, method = "pearson")
-cor.intersect.spearman = cor(tl_intersect, method = "spearman")
-
-#only transcription factors
-
-tfs = read.table("Transcription_Factors_Human.txt", header=F, as.is =T)
-tfs = t(tfs)
-tl_tfs = tl[rownames(tl) %in% tfs,] 
-cor.tfs.kendall = cor(tl_tfs, method = "kendall")
-cor.tfs.pearson = cor(tl_tfs, method = "pearson")
-cor.tfs.spearman = cor(tl_tfs, method = "spearman")
-
-#binned rank
-
-tl_binned_rank = apply(tl,2,function(x) cut(x,breaks=500, label=F))
-cor.binned.kendall = cor(tl_binned_rank, method = "kendall")
-cor.binned.pearson = cor(tl_binned_rank, method = "pearson")
-cor.binned.spearman = cor(tl_binned_rank, method = "spearman")
-
-tl_binned_rank2 = apply(tl,2,function(x) cut(x,breaks=quantile(x,probs=seq(0,1, by=0.5), na.rm=TRUE), label=F, include.lowest=TRUE))
-cor.binned.pearson2 = cor(tl_binned_rank5, method = "pearson")
-cor.binned.spearman2 = cor(tl_binned_rank5, method = "spearman")
-
-
-#min, median, max difference in gene expression in genes that switched bins cut--> 2 breaks
-
-a_v_b_diff = abs(tl_binned_rank5[,1]-tl_binned_rank5[,17])
-a_v_b_expdiff = abs(tl[a_v_b_diff ==1,1] - tl[a_v_b_diff ==1,2])
-
-#rank in a bin (mean expression) in DE genes 
-
-a = apply(tl,1,median)
-a = cut(a, breaks=quantile(a, probs=seq(0,1, by=0.5), na.rm=TRUE), label=F, include.lowest=TRUE)
-tl_bin2 = tl[a==2,]
-cor.bin2.spearman = cor(tl_bin2,method='spearman')
-
-#PCA with turtle, project lizard (all genes, ranked)
-
-tl_ranked = apply(tl,2,function(x) rank(x,ties.method='max'))
-#ranked data, do not scale
-turtle_pca = prcomp(t(tl_ranked[,1:9]), scale. = F)
-lizard_pca_x = t(tl_ranked[,10:19]) %*% turtle_pca$rotation
-
-pdf('pca_ranked_lizardprojection.pdf')
-plot(turtle_pca$x[,1], turtle_pca$x[,2], xlab="PC1", ylab ="PC2", col ='blue')
-points(lizard_pca_x[,1], lizard_pca_x[,2], col='red')
-dev.off()
-
-pca_all = prcomp(t(tl_ranked),scale. = F)
-pdf('pca_ranked_plots.pdf')
-plot(pca_all$x[,1], pca_all$x[,2], xlab="PC1", ylab ="PC2", col ='blue')
-points(pca_all$x[10:19,1], pca_all$x[10:19,2], xlab="PC1", ylab ="PC2", col ='red')
-plot(pca_all$x[,3], pca_all$x[,4], xlab="PC3", ylab ="PC4", col ='blue')
-points(pca_all$x[10:19,3], pca_all$x[10:19,4], xlab="PC3", ylab ="PC4", col ='red')
-dev.off()
-
-library(Rtsne)
-#PCs4tsne = cbind(pca_all$x[,1], pca_all$x[,3:18])
-pca_all = prcomp(t(tl_ranked),scale. = F)
-PCs4tsne = pca_all$x[,2:18]
-tsne_all = Rtsne(PCs4tsne, pca=F, verbose=T, perplexity=3)
-
-library(ggplot2)
-to.plot = as.data.frame(tsne_all$Y)
-rownames(to.plot) = colnames(tl_ranked)
-to.plot$species = substr(rownames(to.plot),1,6)
-to.plot$cell_type = substr(rownames(to.plot),8,12)
-
-pdf('tsne_all_wopc1.pdf')
-ggplot(to.plot,aes(V1,V2)) + geom_point(aes(color=cell_type, size=2))
-ggplot(to.plot,aes(V1,V2)) + geom_point(aes(color=species, size=2))
-dev.off()
-
-tsne_all_wpc1 = Rtsne(pca_all$x[,1:18], pca=F, verbose=T, perplexity=3)
-to.plot = as.data.frame(tsne_all_wpc1$Y)
-
-#PCA/TSNE with mouse, lizard, turtle (all genes, ranked)
-
-mtl_ranked = apply(mtl,2,function(x) rank(x,ties.method='max'))
-mtl_pca = prcomp(t(mtl_ranked), scale. = F)
-
-pdf('mtl_pca_ranked_plots.pdf')
-plot(mtl_pca$x[,1], mtl_pca$x[,2], xlab="PC1", ylab ="PC2", col ='blue')
-points(mtl_pca$x[10:19,1], mtl_pca$x[10:19,2], xlab="PC1", ylab ="PC2", col ='red')
-points(mtl_pca$x[20:26,1], mtl_pca$x[20:26,2], xlab="PC1", ylab ="PC2", col ='black')
-plot(mtl_pca$x[,3], mtl_pca$x[,4], xlab="PC3", ylab ="PC4", col ='blue')
-points(mtl_pca$x[10:19,3], mtl_pca$x[10:19,4], xlab="PC1", ylab ="PC2", col ='red')
-points(mtl_pca$x[20:26,3], mtl_pca$x[20:26,4], xlab="PC1", ylab ="PC2", col ='black')
-dev.off()
-
-library(Rtsne)
-mtl_tsne = Rtsne(mtl_pca$x[,4:20], pca=F, verbose=T, perplexity=3)
-
-library(ggplot2)
-to.plot = as.data.frame(mtl_tsne$Y)
-rownames(to.plot) = colnames(mtl_ranked)
-to.plot$species = substr(rownames(to.plot),1,6)
-to.plot$cell_type = substr(rownames(to.plot),8,12)
-
-pdf('mtl_tsne.pdf')
-ggplot(to.plot,aes(V1,V2)) + geom_point(aes(color=cell_type, size=2))
-ggplot(to.plot,aes(V1,V2)) + geom_point(aes(color=species, size=2))
-dev.off()
-
-#try ComBat to remove species signal
-
-library(sva)
-batch.ids = as.factor(substr(colnames(mtl),1,6))
-mtlComBat = ComBat(dat=mtl, batch=batch.ids, par.prior=T, prior.plots=F) 
-
-mtlComBat_ranked = apply(mtlComBat,2,function(x) rank(x,ties.method='max'))
-mtlComBat_pca = prcomp(t(mtlComBat_ranked), scale. = F)
-
-pdf('mtl_pca_ranked_combat_plots.pdf')
-plot(mtlComBat_pca$x[,1], mtlComBat_pca$x[,2], xlab="PC1", ylab ="PC2", col ='blue')
-points(mtlComBat_pca$x[10:19,1], mtlComBat_pca$x[10:19,2], xlab="PC1", ylab ="PC2", col ='red')
-points(mtlComBat_pca$x[20:26,1], mtlComBat_pca$x[20:26,2], xlab="PC1", ylab ="PC2", col ='black')
-plot(mtlComBat_pca$x[,3], mtlComBat_pca$x[,4], xlab="PC3", ylab ="PC4", col ='blue')
-points(mtlComBat_pca$x[10:19,3], mtlComBat_pca$x[10:19,4], xlab="PC1", ylab ="PC2", col ='red')
-points(mtlComBat_pca$x[20:26,3], mtlComBat_pca$x[20:26,4], xlab="PC1", ylab ="PC2", col ='black')
-dev.off()
-
-mtlComBat_tsne = Rtsne(mtlComBat_pca$x[,1:18], pca=F, verbose=T, perplexity=3)
-
-to.plot = as.data.frame(mtlComBat_tsne$Y)
-rownames(to.plot) = colnames(mtlComBat_ranked)
-to.plot$species = substr(rownames(to.plot),1,6)
-to.plot$cell_type = substr(rownames(to.plot),8,12)
-
-pdf('mtl_tsne_ComBat.pdf')
-ggplot(to.plot,aes(V1,V2)) + geom_point(aes(color=cell_type, size=2))
-ggplot(to.plot,aes(V1,V2)) + geom_point(aes(color=species, size=2))
-dev.off()
-
-
-#ICA
-
-library('fastICA')
-tl_ica = fastICA(t(tl_ranked), n.comp=19, row.norm = FALSE)
-
-#shuffle from a gene across cells,
-#this might tell if overall rank across cells is similar (that is in general, tfs are lower than actin...)
-
-random.lizard.rows = t(apply(lizard_avgGene[,2:ncol(lizard_avgGene)],1,sample))
-random.lizard.rows = as.data.frame(random.lizard.rows)
-colnames(random.lizard.rows) = colnames(lizard_avgGene[2:ncol(lizard_avgGene)])
-random.lizard.rows$eggnog = lizard_avgGene$eggnog
-random.turtle.rows = t(apply(turtle_avgGene[,2:ncol(turtle_avgGene)],1,sample))
-random.turtle.rows = as.data.frame(random.turtle.rows)
-colnames(random.turtle.rows) = colnames(turtle_avgGene[2:ncol(turtle_avgGene)])
-random.turtle.rows$eggnog = turtle_avgGene$eggnog
-random.tl.rows = merge(random.turtle.rows,random.lizard.rows, by='eggnog', all=F)
-rownames(random.tl.rows) = random.tl.rows$eggnog
-random.tl.rows$eggnog = NULL
-
-cor.random.kendall = cor(random.tl.rows, method='kendall')
-cor.random.pearson = cor(random.tl.rows, method='pearson')
-cor.random.spearman = cor(random.tl.rows, method='spearman')
-
-
-#Old Scaling method
-
-#scale to max 
-
-#remove pericytes and WBCs
-gene_minimum = apply(lizard_avgGene[,2:9],1,min)
-lizard_avgGene_scaled = sweep(lizard_avgGene[,2:9],1,gene_minimum,"-")
-gene_maximum = apply(lizard_avgGene_scaled,1,max)
-lizard_avgGene_scaled = sweep(lizard_avgGene_scaled,1,gene_maximum,"/")
-rownames(lizard_avgGene_scaled) = lizard_avgGene$eggnog
-#remove rows with only NA's
-lizard_avgGene_scaled[is.na(lizard_avgGene_scaled)] = 0
-lizard_avgGene_scaled = lizard_avgGene_scaled[rowSums (lizard_avgGene_scaled)!=0, ] 
-rm(list=c("gene_minimum","gene_maximum"))
-
-gene_minimum = apply(turtle_avgGene[,2:9],1,min)
-turtle_avgGene_scaled = sweep(turtle_avgGene[,2:9],1,gene_minimum,"-")
-gene_maximum = apply(turtle_avgGene_scaled,1,max)
-turtle_avgGene_scaled = sweep(turtle_avgGene_scaled,1,gene_maximum,"/")
-rownames(turtle_avgGene_scaled) = turtle_avgGene$eggnog
-#remove rows with only NA's
-turtle_avgGene_scaled[is.na(turtle_avgGene_scaled)] = 0
-turtle_avgGene_scaled = turtle_avgGene_scaled[rowSums (turtle_avgGene_scaled)!=0, ] 
-rm(list=c("gene_minimum","gene_maximum"))
-
-#merge
-
-tl_scaled_types = merge(turtle_avgGene_scaled, lizard_avgGene_scaled, by = 'row.names', all=F)
-rownames(tl_scaled_types) = tl_scaled_types[,1]
-tl_scaled_types = tl_scaled_types[,2:ncol(tl_scaled_types)]
-
-#subset
-
-tl_scaled_union = tl_scaled_types[rownames(tl_scaled_types) %in% lt_gene_union,] 
-cor.scaled.union.pearson = cor(tl_scaled_union, method = "pearson")
-tl_scaled_intersect = tl_scaled_types[rownames(tl_scaled_types) %in% lt_gene_intersect,] 
-cor.scaled.intersect.pearson = cor(tl_scaled_intersect, method = "pearson")
-
-
-###################################
-# correlation plot scaled as fraction of sum  specificity of a gene (similar to Molnar Specificity Comparison)
-lizard_avgGene_fraction = lizard_avgGene[,2:9]
-rownames(lizard_avgGene_fraction) = lizard_avgGene$eggnog
-lizard_avgGene_fraction = lizard_avgGene_fraction[rowSums (lizard_avgGene_fraction)!=0,] 
-gene_sums = rowSums(lizard_avgGene_fraction)
-lizard_avgGene_fraction = sweep(lizard_avgGene_fraction,1,gene_sums,"/")
-rm(gene_sums)
-
-turtle_avgGene_fraction = turtle_avgGene[,2:9]
-rownames(turtle_avgGene_fraction) = turtle_avgGene$eggnog
-turtle_avgGene_fraction = turtle_avgGene_fraction[rowSums (turtle_avgGene_fraction)!=0,] 
-gene_sums = rowSums(turtle_avgGene_fraction)
-turtle_avgGene_fraction = sweep(turtle_avgGene_fraction,1,gene_sums,"/")
-rm(gene_sums)
-
-tl_fraction = merge(turtle_avgGene_fraction, lizard_avgGene_fraction, by = 'row.names', all=F)
-rownames(tl_fraction) = tl_fraction[,1]
-tl_fraction = tl_fraction[,2:ncol(tl_fraction)]
-
-#subset & correlation matrix
-
-tl_fraction_union = tl_fraction[rownames(tl_fraction) %in% lt_gene_union,]
-cor.fraction.union.pearson = cor(tl_fraction_union, method = "pearson")
-tl_fraction_intersect = tl_fraction[rownames(tl_fraction) %in% lt_gene_intersect,]
-cor.fraction.intersect.pearson = cor(tl_fraction_intersect, method = "pearson")
-
-#######################################################
-# correlation plot scaled to average expression of a gene
-lizard_avgGene_avgScaled = lizard_avgGene[,2:9]
-rownames(lizard_avgGene_avgScaled) = lizard_avgGene$eggnog
-lizard_avgGene_avgScaled = lizard_avgGene_avgScaled[rowSums (lizard_avgGene_avgScaled)!=0,]
-gene_avg = rowSums(lizard_avgGene_avgScaled)/ncol(lizard_avgGene_avgScaled)
-lizard_avgGene_avgScaled = sweep(lizard_avgGene_avgScaled,1,gene_avg,"/")
-rm(gene_avg)
-
-turtle_avgGene_avgScaled = turtle_avgGene[,2:9]
-rownames(turtle_avgGene_avgScaled) = turtle_avgGene$eggnog
-turtle_avgGene_avgScaled = turtle_avgGene_avgScaled[rowSums (turtle_avgGene_avgScaled)!=0,]
-gene_avg = rowSums(turtle_avgGene_avgScaled)/ncol(turtle_avgGene_avgScaled)
-turtle_avgGene_avgScaled = sweep(turtle_avgGene_avgScaled,1,gene_avg,"/")
-rm(gene_avg)
-
-tl_avgScaled = merge(turtle_avgGene_avgScaled, lizard_avgGene_avgScaled, by = 'row.names', all=F)
-rownames(tl_avgScaled) = tl_avgScaled[,1]
-tl_avgScaled = tl_avgScaled[,2:ncol(tl_avgScaled)]
-
-#subset & correlation matrix
-
-tl_avgScaled_union = tl_avgScaled[rownames(tl_avgScaled) %in% lt_gene_union,]
-cor.avgScaled.union.pearson = cor(tl_avgScaled_union, method = "pearson")
-tl_avgScaled_intersect = tl_avgScaled[rownames(tl_avgScaled) %in% lt_gene_intersect,]
-cor.avgScaled.intersect.pearson = cor(tl_avgScaled_intersect, method = "pearson")
-
-
-####################################################
-# correlation plot scaled as fraction of gene max
-lizard_avgGene_sc_max = lizard_avgGene[,2:9]
-rownames(lizard_avgGene_sc_max) = lizard_avgGene$eggnog
-lizard_avgGene_sc_max = lizard_avgGene_sc_max[rowSums (lizard_avgGene_sc_max)!=0,] 
-gene_max = apply(lizard_avgGene_sc_max,1,max)
-lizard_avgGene_sc_max = sweep(lizard_avgGene_fraction,1,gene_max,"/")
-rm(gene_max)
-
-turtle_avgGene_sc_max = turtle_avgGene[,2:9]
-rownames(turtle_avgGene_sc_max) = turtle_avgGene$eggnog
-turtle_avgGene_sc_max = turtle_avgGene_sc_max[rowSums (turtle_avgGene_sc_max)!=0,] 
-gene_max = apply(turtle_avgGene_sc_max,1,max)
-turtle_avgGene_sc_max = sweep(turtle_avgGene_sc_max,1,gene_sums,"/")
-rm(gene_max)
-
-tl_sc_max = merge(turtle_avgGene_sc_max, lizard_avgGene_sc_max, by = 'row.names', all=F)
-rownames(tl_sc_max) = tl_sc_max[,1]
-tl_sc_max = tl_sc_max[,2:ncol(tl_sc_max)]
-
-#subset & correlation matrix
-
-tl_sc_max_union = tl_sc_max[rownames(tl_sc_max) %in% lt_gene_union,] 
-cor.sc.max.union.pearson = cor(tl_sc_max_union, method = "pearson")
-tl_sc_max_intersect = tl_sc_max[rownames(tl_sc_max) %in% lt_gene_intersect,] 
-cor.sc.max.intersect.pearson = cor(tl_sc_max_intersect, method = "pearson")
-
-
-
-#correlation plot fraction
-
-cor.matrix = cor(tl_cell_types)
-cor.matrix.subset = cor.matrix[1:19,20:45]
-cor_min = apply(cor.matrix.subset,1,min)
-
-cor.matrix.subset.scaled = sweep(cor.matrix.subset,1,cor_min,"-")
-cor_max = apply(cor.matrix.subset.scaled,1,max)
-cor.matrix.subset.scaled = sweep(cor.matrix.subset.scaled,1,cor_max,"/")
-
-cor.matrix.spearman = cor(tl_cell_types, method='spearman')
-cor.matrix.spearman = cor.matrix.spearman[1:19,20:62]
-
-pdf('a.pdf')
-#TSNEPlot(excit,pt.size=0.7,do.label=T)
-TSNEPlot(lizExcit,pt.size=0.7,do.label=T)
-corrplot(cor.matrix.subset,method='square')
-corrplot(cor.matrix.subset.scaled,method='square')
-dev.off()
-
-#significance test
-
-cor.mtest <- function(mat, conf.level = 0.95){
-     mat <- as.matrix(mat)
-     n <- ncol(mat)
-     p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
-     diag(p.mat) <- 0
-     diag(lowCI.mat) <- diag(uppCI.mat) <- 1
-     for(i in 1:(n-1)){
-          for(j in (i+1):n){
-               tmp <- cor.test(mat[,i], mat[,j], conf.level = conf.level)
-               p.mat[i,j] <- p.mat[j,i] <- tmp$p.value
-               lowCI.mat[i,j] <- lowCI.mat[j,i] <- tmp$conf.int[1]
-               uppCI.mat[i,j] <- uppCI.mat[j,i] <- tmp$conf.int[2]
-          }
-     }
-     return(list(p.mat, lowCI.mat, uppCI.mat))
-}
-
-res1 <- cor.mtest(tl_cell_types,0.95)
-
-
-#test with only intersect in var gene lists
-var_gene_overlap = intersect(liz_var_gene,tur_var_gene)
-eggnog_overlap = merge(eggnog_lizard,eggnog_turtle,by = 'eggnog', all=F)
-eggnog_overlap = eggnog_overlap[eggnog_overlap[,1]%in% var_gene_overlap,]
-
-#subset avgGene tables
-
-l_avgGene_o = AverageExpression(lizExcit)
-colnames(l_avgGene_o) = paste("L",colnames(l_avgGene_o),sep='')
-l_avgGene_o$lizard = rownames(l_avgGene_o)
-l_avgGene_o = merge(eggnog_overlap, l_avgGene_o, by ="lizard", all = F)
-rownames(l_avgGene_o) = l_avgGene_o$eggnog
-l_avgGene_o = l_avgGene_o[,4:dim(l_avgGene_o)[2]]
-
-t_avgGene_o = AverageExpression(excit)
-colnames(t_avgGene_o) = paste("T",colnames(t_avgGene_o),sep='')
-t_avgGene_o$turtle = rownames(t_avgGene_o)
-t_avgGene_o = merge(eggnog_overlap, t_avgGene_o, by ="turtle", all = F)
-rownames(t_avgGene_o) = t_avgGene_o$eggnog
-t_avgGene_o = t_avgGene_o[,4:dim(t_avgGene_o)[2]]
-
-#scale
-
-gene_minimum = apply(l_avgGene_o,1,min)
-l_avgGene_o = sweep(l_avgGene_o,1,gene_minimum,"-")
-gene_maximum = apply(l_avgGene_o,1,max)
-l_avgGene_o = sweep(l_avgGene_o,1,gene_maximum,"/")
-
-gene_minimum = apply(t_avgGene_o,1,min)
-t_avgGene_o = sweep(t_avgGene_o,1,gene_minimum,"-")
-gene_maximum = apply(t_avgGene_o,1,max)
-t_avgGene_o = sweep(t_avgGene_o,1,gene_maximum,"/")
-
-#merge
-
-tl_o = merge(l_avgGene_o,t_avgGene_o, by = 'row.names', all=F)
-rownames(tl_o) = tl_o[,1]
-tl_o = tl_o[,2:dim(tl_o)[2]]
-
-#correlation plot scaled
-
-cor.matrix.o = cor(tl_o)
-cor.matrix.o.subset = cor.matrix.o[1:19,20:62]
-
-
-cor_min = apply(cor.matrix.o.subset,1,min)
-cor.matrix.o.subset.scaled = sweep(cor.matrix.o.subset,1,cor_min,"-")
-cor_max = apply(cor.matrix.o.subset.scaled,1,max)
-cor.matrix.o.subset.scaled = sweep(cor.matrix.o.subset.scaled,1,cor_max,"/")
-
-pdf('a.pdf')
-TSNEPlot(excit,pt.size=0.7,do.label=T)
-TSNEPlot(lizExcit,pt.size=0.7,do.label=T)
-corrplot(cor.matrix.o.subset,method='square')
-corrplot(cor.matrix.o.subset.scaled, method='square')
-dev.off()
-
-
-#maria's functions
-yao <- function(data) {
-     
-     gene_max = apply(data, 1, max)
-     gene_min = apply(data, 1, min)
-     med = median(gene_max)
-     s.data=data
-     
-     for (i in 1:nrow(data)) {
-          if (gene_max[i]>med){
-               s.data[i,] = sapply(data[i,],function(x)(x-gene_min[i])/gene_max[i])
-          } 
-          else {
-          s.data[i,] = sapply(data[i,],function(x)(x-gene_min[i])/med)
-          }
-     }
-     return(s.data)
-}
-
-
-#yao normalization
-
-load('170119_lizard_neurons_noRegression_svmFiltered_slowTSNE_1250gene_.Robj')
-avgGene = AverageExpression(lizExcit)
-colnames(avgGene) = paste("L",colnames(avgGene),sep='')
-avgGene = yao(avgGene)
-avgGene$lizard = rownames(avgGene)
-avgGene = merge(eggnog_merge, avgGene, by ="lizard", all = F)
-rownames(avgGene) = avgGene$eggnog
-avgGene = avgGene[,4:dim(avgGene)[2]]
-
-
-#t_avgGene = AverageExpression(excit)
-t_avgGene = read.table("170118_excit.merged_avg_expression.txt", header=T, stringsAsFactors=F)
-colnames(t_avgGene) = 1:29
-colnames(t_avgGene) = paste("T",colnames(t_avgGene),sep='')
-#remove clusters 1(small), 2(small), 22(doublets)
-t_avgGene = t_avgGene[,!colnames(t_avgGene) %in% c("T1", "T2", "T22")]
-t_avgGene = yao(t_avgGene)
-t_avgGene$turtle = rownames(t_avgGene)
-t_avgGene = merge(eggnog_merge, t_avgGene, by ="turtle", all = F)
-rownames(t_avgGene) = t_avgGene$eggnog
-t_avgGene = t_avgGene[,4:dim(t_avgGene)[2]]
-
-#merge
-
-tl_o = merge(avgGene,t_avgGene, by = 'row.names', all=F)
-rownames(tl_o) = tl_o[,1]
-tl_o = tl_o[,2:dim(tl_o)[2]]
-
-#correlation plot scaled
-
-cor.matrix.o = cor(tl_o)
-cor.matrix.o.subset = cor.matrix.o[1:19,20:45]
-
-
-cor_min = apply(cor.matrix.o.subset,1,min)
-cor.matrix.o.subset.scaled = sweep(cor.matrix.o.subset,1,cor_min,"-")
-cor_max = apply(cor.matrix.o.subset.scaled,1,max)
-cor.matrix.o.subset.scaled = sweep(cor.matrix.o.subset.scaled,1,cor_max,"/")
-
-pdf('a.pdf')
-TSNEPlot(excit,pt.size=0.7,do.label=T)
-TSNEPlot(lizExcit,pt.size=0.7,do.label=T)
-corrplot(cor.matrix.o.subset,method='square')
-corrplot(cor.matrix.o.subset.scaled, method='square')
-dev.off()
-
-corr.compare <-function(species1, species2, genes1, genes2, gene.set = "union", method.scale= "max", method.cor="spearman" ,normalize.cor = TRUE) {
-     GENE.SETS = c("union","intersection","genes1","genes2")
-     i.genes = pmatch(gene.set, GENE.SETS)
-     if (i.genes == 1){
-          gene.use = unique(c(genes1,genes2))
-     } 
-     if (i.genes == 2){
-          gene.use = intersect(genes1,genes2)
-     }
-     if (i.genes == 3){
-          gene.use = genes1
-          
-     }
-     
-     if (i.genes == 4){
-          
-          gene.use = genes2
-          
-     } 
-     
-     gene.use <- gene.use[gene.use %in% rownames(species1) & gene.use %in% rownames(species2)]
-     
-     species1.set <- species1[gene.use,]
-     
-     species2.set <- species2[gene.use,]
-     
-     
-     METHOD.SCALE = c("minmax","max","scale","yao","none","JSD")
-     
-     i.method.scale = pmatch(method.scale, METHOD.SCALE)
-     
-     if (i.method.scale == 1) {
-          
-          species1.norm<-as.matrix(t(apply(species1.set, 1, function(x)(x-min(x))/(max(x)-min(x)))))
-          
-          species2.norm<-as.matrix(t(apply(species2.set, 1, function(x)(x-min(x))/(max(x)-min(x)))))
-          
-     }
-     
-     if (i.method.scale == 2) {
-          
-          species1.norm<-as.matrix(t(apply(species1.set, 1, function(x)(x/max(x)))))
-          
-          species2.norm<-as.matrix(t(apply(species2.set, 1, function(x)(x/max(x)))))
-          
-     }
-     
-     if (i.method.scale == 3) {
-          
-          species1.norm<-as.matrix(t(apply(species1.set, 1, function(x)(scale(x)))))
-          
-          species2.norm<-as.matrix(t(apply(species2.set, 1, function(x)(scale(x)))))
-          
-          colnames(species1.norm)<-colnames(species1.set)
-          
-          colnames(species2.norm)<-colnames(species2.set)
-          
-     }
-     
-     if (i.method.scale == 4) {
-          
-          species1.norm<-yao(species1.set)
-          
-          species2.norm<-yao(species2.set)
-          
-          colnames(species1.norm)<-colnames(species1.set)
-          
-          colnames(species2.norm)<-colnames(species2.set)
-          
-     }
-     
-     if (i.method.scale == 5) {
-          
-          species1.norm<-species1.set
-          
-          species2.norm<-species2.set
-          
-     }
-     
-     
-     corrs<-cor(species1.norm,species2.norm,method=method.cor)
-     
-     if (! normalize.cor) {
-          
-          corrplot(corrs, order="original",tl.pos="lt", method="color", tl.col="black",cl.lim=c(min(corrs),max(corrs)), is.corr=F,tl.cex=0.7)
-          
-     } 
-     
-     if (normalize.cor) {
-          
-          corrs.norm<-as.matrix(t(apply(corrs, 1, function(x)(x-min(x))/(max(x)-min(x)))))
-          
-          corrs.norm<-as.matrix(t(apply(corrs.norm, 1, function(x)(x-0.5)/0.5)))
-          
-          corrplot(corrs.norm, order="original",tl.pos="lt", method="color", tl.col="black", is.corr=F,tl.cex=0.7)
-          
-     }
-     
-}
-
-#interneurons
-
-load(170201_lizard_neurons_RegressOutLiz_svmFiltered_slowTSNE_.Robj)
-Liz_int_avgGene = AverageExpression(lizNeuron)
-#select interneurons
-Liz_int_avgGene = Liz_int_avgGene[,colnames(Liz_int_avgGene) %in% c("11","15","19","18")]
-colnames(Liz_int_avgGene) = paste("L", colnames(Liz_int_avgGene), sep = "")
-#scale (yao)
-Liz_int_avgGene = yao(Liz_int_avgGene)
-Liz_int_avgGene$lizard = rownames(Liz_int_avgGene)
-Liz_int_avgGene = merge(eggnog_merge, Liz_int_avgGene, by ="lizard", all = F)
-rownames(Liz_int_avgGene) = Liz_int_avgGene$eggnog
-Liz_int_avgGene = Liz_int_avgGene[,4:dim(Liz_int_avgGene)[2]]
-
-DEgenes <- vector()  # list where each element will be the list of DE genes
-# Unfortunately it gets stuck at multiple points when test.use="negbinom", that's why test.use="bimod"
-for (i in 1:nrow(combinations)){
-     a = combinations[i,1]
-     b = combinations[i,2]
-     diff.genes <-  FindMarkers(lizInterneuron, ident.1= a, ident.2=b, test.use="bimod", thresh.use=0.4, min.pct=0.1)
-     DEgenes = c(DEgenes, rownames(diff.genes[diff.genes$p_val<0.05,]))
-     DEgenes = unique(DEgenes)
-     rm(diff.genes)
-}
-
-##########################################
-#hypergeometric distribution
-
-#hypergeometric distribution 1. identify marker genes
-#only use these loops when clusters.to.include is >3!!!
-
-#lizard...maker sure ident is set to cell type
-clusters.to.include = as.vector(levels(liz@ident)[1:8])  # or simply the list of cluster IDs to compare, it's maybe good to exclude bad clusters like doublets etc
-
-combinations = t(combn(clusters.to.include,2)) # find unique pairwise combinations of cluster IDs
-colnames(combinations) <- c("ident.1","ident.2")
-
-liz_DEgene_list <- list()  # list where each element will be the list of DE genes
-# Unfortunately it gets stuck at multiple points when test.use="negbinom", that's why test.use="bimod"
-for (i in 1:nrow(combinations)){
-     a = combinations[i,1]
-     b = combinations[i,2]
-     diff.genes <-  FindMarkers(liz, ident.1= combinations[i,1], ident.2=combinations[i,2], test.use="bimod", thresh.use=0.4, min.pct=0.1)
-     liz_DEgene_list[[paste("id1",a,"id2",b, sep="_")]] = rownames(diff.genes[diff.genes$p_val<0.05 & diff.genes$avg_diff>0,])
-     liz_DEgene_list[[paste("id1",b,"id2",a, sep="_")]] = rownames(diff.genes[diff.genes$p_val<0.05 & diff.genes$avg_diff<0,])
-     rm(diff.genes)
-}
-
-
-#marker_union = Enriched Gene/Marker from a cluster against at least one other cluster, includes markers that define multiple cell-types
-#marker_intersect = enriched gene/marker from a cluster against all other clusters, markers that uniquely define a cell-type
-
-liz_marker_union = list()
-liz_marker_intersect = list()
-for (i in 1:length(clusters.to.include)) {
-     sub.list = liz_DEgene_list[grep(paste('id1',clusters.to.include[i], sep="_"),names(liz_DEgene_list))]
-     gene.union = vector()
-     for (j in 1:length(sub.list)){
-          gene.union = union(gene.union,sub.list[[j]])
-     }
-     liz_marker_union[[clusters.to.include[i]]] = gene.union
-     gene.intersect = intersect(sub.list[[1]],sub.list[[2]])
-     for (j in 3:length(sub.list)){
-          gene.intersect = intersect(gene.intersect,sub.list[[j]])
-     }
-     liz_marker_intersect[[clusters.to.include[i]]] = gene.intersect
-     rm(list=c("sub.list","gene.union","gene.intersect"))
-}
- 
-#turtle...maker sure ident is set to cell type
-turtle.clusters.to.include = as.vector(levels(turtle@ident)[1:8])  
-
-combinations = t(combn(turtle.clusters.to.include,2)) # find unique pairwise combinations of cluster IDs
-colnames(combinations) <- c("ident.1","ident.2")
-
-turtle_DEgene_list <- list()  # list where each element will be the list of DE genes
-# Unfortunately it gets stuck at multiple points when test.use="negbinom", that's why test.use="bimod"
-for (i in 1:nrow(combinations)){
-     a = combinations[i,1]
-     b = combinations[i,2]
-     diff.genes <-  FindMarkers(turtle, ident.1= combinations[i,1], ident.2=combinations[i,2], test.use="bimod", thresh.use=0.4, min.pct=0.1)
-     turtle_DEgene_list[[paste("id1",a,"id2",b, sep="_")]] = rownames(diff.genes[diff.genes$p_val<0.05 & diff.genes$avg_diff>0,])
-     turtle_DEgene_list[[paste("id1",b,"id2",a, sep="_")]] = rownames(diff.genes[diff.genes$p_val<0.05 & diff.genes$avg_diff<0,])
-     rm(diff.genes)
-}
-
-turtle_marker_union = list()
-turtle_marker_intersect = list()
-for (i in 1:length(turtle.clusters.to.include)) {
-     sub.list = turtle_DEgene_list[grep(paste('id1',turtle.clusters.to.include[i], sep="_"),names(turtle_DEgene_list))]
-     gene.union = vector()
-     for (j in 1:length(sub.list)){
-          gene.union = union(gene.union,sub.list[[j]])
-     }
-     turtle_marker_union[[turtle.clusters.to.include[i]]] = gene.union
-     gene.intersect = intersect(sub.list[[1]],sub.list[[2]])
-     for (j in 3:length(sub.list)){
-          gene.intersect = intersect(gene.intersect,sub.list[[j]])
-     }
-     turtle_marker_intersect[[turtle.clusters.to.include[i]]] = gene.intersect
-     rm(list=c("sub.list","gene.union","gene.intersect"))
-}
-
-#replace with eggnog names
-
-eggnog_list_replace = function(genelist_list, species = c('turtle', 'lizard')){
-     for (i in 1:length(genelist_list)) {
-          genelist_list[[i]] = eggnog_merge[eggnog_merge[,grep(species,colnames(eggnog_merge))] %in% genelist_list[[i]],1] 
-     }  
-     return(genelist_list)
-}
-
-liz_marker_union_eggnog = eggnog_list_replace(liz_marker_union, species = 'lizard')
-liz_marker_intersect_eggnog = eggnog_list_replace(liz_marker_intersect, species = 'lizard')
-turtle_marker_union_eggnog = eggnog_list_replace(turtle_marker_union, species = 'turtle')
-turtle_marker_intersect_eggnog = eggnog_list_replace(turtle_marker_intersect, species = 'turtle')
-
-#total number of genes
-#loops only work for nClusters >3
-
-nGenes_union = vector()
-for(i in 1:length(liz_marker_union_eggnog)){
-     nGenes_union = union(nGenes_union,liz_marker_union_eggnog[[i]])
-}
-for(i in 1:length(turtle_marker_union_eggnog)){
-     nGenes_union = union(nGenes_union,turtle_marker_union_eggnog[[i]])
-}
-nGenes_union = length(nGenes_union)
-
-#union of the list of intersects
-
-nGenes_intersect = vector()
-for(i in 1:length(liz_marker_intersect_eggnog)){
-     nGenes_intersect = union(nGenes_intersect,liz_marker_intersect_eggnog[[i]])
-}
-for(i in 1:length(turtle_marker_intersect_eggnog)){
-     nGenes_intersect = union(nGenes_intersect,turtle_marker_intersect_eggnog[[i]])
-}
-nGenes_intersect = length(nGenes_intersect)
-
-#phyper(q, m, n, k, lower.tail = TRUE, log.p = FALSE) 
-#q = # of genes in intersect of sp1 and sp2 markers
-#m = # of markers in sp1 (x)
-#n = total pool of genes (in this case all potential markers) - # of markers in sp1 (m) (x)
-#k = # of markers in sp2 (x)
-hygecdf_matrix = function(eggnog.list1, eggnog.list2, sp1='t', sp2='l'){
-     names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
-     names(eggnog.list2) = paste(sp2,names(eggnog.list1), sep="_")
-     gene.list = c(eggnog.list1, eggnog.list2)
-     
-     hyg = matrix(nrow=length(gene.list),ncol=length(gene.list))
-     colnames(hyg) = c(names(gene.list))
-     rownames(hyg) = c(names(gene.list))
-     
-     JSC = matrix(nrow=length(gene.list),ncol=length(gene.list))
-     colnames(JSC) = c(names(gene.list))
-     rownames(JSC) = c(names(gene.list))
-     
-     nGenes = vector()
-     for(i in 1:length(gene.list)){
-          nGenes = union(nGenes,gene.list[[i]])
-     }
-     nGenes = length(nGenes)
-     #nGenes = 10089 #number of turtle v lizard eggnog genes
-     for (i in 1:length(gene.list)){
-          m = length(gene.list[[i]])
-          n = nGenes - m
-          n4BHcorrect = nrow(hyg)*ncol(hyg)
-          for (j in 1:length(gene.list)){
-               k = length(gene.list[[i]])
-               q = length(intersect(gene.list[[i]],gene.list[[j]]))
-               p.value = phyper(q,m,n,k, lower.tail=F)
-               #BH correct p values in hyg
-               p.value = p.adjust(p.value, method='BH',n=n4BHcorrect)
-               hyg[i,j] = p.value
-               u = length(union(gene.list[[i]],gene.list[[j]]))
-               JSC[i,j] = q/u
-               rm(list=c('k','q','u'))
-          }
-          rm(list=c('m','n'))
-     }
-     negLog10 = -log10(hyg)
-     return.list = list(hyg,negLog10,JSC)
-     names(return.list) = c('hypergeometric_distribution','negative_log10','JSC')
-     return(return.list)
-}
-
-tl_intersect_hygecdf2 = hygecdf_matrix(turtle_marker_intersect_eggnog,liz_marker_intersect_eggnog)
-tl_union_hygecdf2 = hygecdf_matrix(turtle_marker_union_eggnog,liz_marker_union_eggnog)
-
-library(ggplot2)
-library(reshape2)
-
-melted <- melt(tl_intersect_hygecdf[[1]]>0.05)
-pdf('hygecdf_tl_intersectMarkers.pdf')
-ggplot(melted, aes(x = Var2, y = Var1, fill = value)) + geom_tile() + scale_fill_manual(values = c("blue", "white")) + theme_bw() + theme(legend.position = "none")
-dev.off()
-
-melted <- melt(tl_union_hygecdf[[1]]>0.05)
-pdf('hygecdf_tl_unionMarkers.pdf')
-ggplot(melted, aes(x = Var2, y = Var1, fill = value)) + geom_tile() + scale_fill_manual(values = c("blue", "white")) + theme_bw() + theme(legend.position = "none")
-dev.off()
-
-
-melted = melt(tl_union_hygecdf[[2]])
-ggplot(melted, aes(x = Var1, y = Var2, fill = value)) + geom_tile()  
-ggplot(nba.m, aes(variable, Name)) + geom_tile(aes(fill = rescale), colour = "white") + scale_fill_gradient(low = "white", high = "blue"))
-
-#FIT KNN classifier
-#use ranked data
-#tl_ranked[,1:9] are turtle cell-types
-#tl_ranked[,10:19] are lizard cell-types (from above)
-
-#knn(train, test, cl, k = 1, l = 0, prob = FALSE, use.all = TRUE)
-library(class)
-tl_knn_pred = knn(train=t(tl_ranked[,1:9]), test = t(tl_ranked[,10:19]), cl=colnames(tl_ranked)[1:9])
-library(rpart)
-t_tl_ranked = t(tl_ranked)
-t_tl_ranked = as.data.frame(t_tl_ranked)
-rpart(t_tl_ranked[1:9,] ~ .,method='class',data=t_tl_ranked[1:9,])
-
-
-#Specificity.Index = function(AvgGeneTable){
-   gene.max = apply(AvgGeneTable,1,max)
-   sp = sweep(AvgGeneTable,1,gene.max,"/")
-   sp = (1-sp)
-   sp = rowSums(sp)
-   sp = sp/(ncol(AvgGeneTable)-1)
-   return(sp)
-}
diff --git a/Comparative_analysis/eggnog_mastertable_generation.R b/Comparative_analysis/eggnog_mastertable_generation.R
deleted file mode 100644
index a9acc30..0000000
--- a/Comparative_analysis/eggnog_mastertable_generation.R
+++ /dev/null
@@ -1,125 +0,0 @@
-#####commands to process eggnog annotation files using awk in UNIX/LINUX#######
-
-#awk -F"\t" 'OFS="\t" {if($5) print substr($1, 18, length($1)-1),$5}' cpi.fasta.emapper.annotations > listEggNOG_cpi_OneToOne_20Mar2017.txt
-#awk -F"\t" 'OFS="\t" {if($5) print substr($1, 1, length($1)-2),$5}' Gallus_gallus-5.0.pep.all.fa.emapper.annotations > listEggNOG_Gallus_gallus_5_OneToOne_20Mar2017.txt
-#awk -F"\t" 'OFS="\t" {if($5) print substr($1, 1, length($1)-2),$5}' Homo_sapiens.GRCh38.pep.all-1.fa.emapper.annotations > listEggNOG_Homo_sapiens_OneToOne_20Mar2017_1.txt
-#awk -F"\t" 'OFS="\t" {if($5) print substr($1, 1, length($1)-2),$5}' Homo_sapiens.GRCh38.pep.all-2.fa.emapper.annotations > listEggNOG_Homo_sapiens_OneToOne_20Mar2017_2.txt
-#cat listEggNOG_Homo_sapiens_OneToOne_20Mar2017_1.txt listEggNOG_Homo_sapiens_OneToOne_20Mar2017_2.txt > listEggNOG_Homo_sapiens_OneToOne_20Mar2017.txt
-#awk -F"\t" 'OFS="\t" {if($5) print substr($1, 1, length($1)-2),$5}' Mus_musculus.GRCm38.pep.all.fa.emapper.annotations > listEggNOG_Mus_musculus_OneToOne_20Mar2017.txt
-#awk -F"\t" 'OFS="\t" {if($5) print substr($1, 16, length($1)-1),$5}' pvi.fasta.emapper-2.annotations > listEggNOG_pvit_OneToOne_20Mar2017_2.txt
-
-setwd("~/Documents/R/eggnog_results/")
-#####chicken#######
-gallus_geneIDs = read.table('ensmbl_gallus_geneIDs.txt', stringsAsFactors = F, header=F, sep = "\t")
-gallus_geneIDs = gallus_geneIDs[2:nrow(gallus_geneIDs),]
-colnames(gallus_geneIDs) = c("chicken",'protein')
-gallus_eggnog = read.table('listEggNOG_Gallus_gallus_5_OneToOne_20Mar2017.txt', header=F, stringsAsFactors = F, sep="\t")
-colnames(gallus_eggnog) = c("protein",'eggnog')
-gallus = merge(gallus_geneIDs,gallus_eggnog,by='protein', all=F)
-gallus = gallus[,2:3]
-gallus$cat = paste(gallus$chicken,gallus$eggnog,sep="_")
-gallus = gallus[!duplicated(gallus$cat),]
-gallus = gallus[,1:2]
-chicken_duplicates = unique(gallus[duplicated(gallus$chicken),1])
-eggnog_duplicates = unique(gallus[duplicated(gallus$eggnog),2])
-gallus = gallus[!gallus$chicken %in% chicken_duplicates,]
-gallus = gallus[!gallus$eggnog %in% eggnog_duplicates,]
-rm(list=c('gallus_eggnog','gallus_geneIDs','chicken_duplicates','eggnog_duplicates'))
-
-
-#######human########
-homo_geneIDs = read.table('ensmbl_homo_geneIDs.txt', stringsAsFactors = F, header=F, sep = "\t")
-homo_geneIDs = homo_geneIDs[2:nrow(homo_geneIDs),]
-colnames(homo_geneIDs) = c("human",'protein')
-homo_eggnog = read.table('listEggNOG_Homo_sapiens_OneToOne_20Mar2017.txt', header=F, stringsAsFactors = F, sep="\t")
-colnames(homo_eggnog) = c("protein",'eggnog')
-homo = merge(homo_geneIDs,homo_eggnog,by='protein', all=F)
-homo = homo[,2:3]
-homo$cat = paste(homo$human,homo$eggnog,sep="_")
-homo = homo[!duplicated(homo$cat),]
-homo = homo[,1:2]
-human_duplicates = unique(homo[duplicated(homo$human),1])
-eggnog_duplicates = unique(homo[duplicated(homo$eggnog),2])
-homo = homo[!homo$human %in% human_duplicates,]
-homo = homo[!homo$eggnog %in% eggnog_duplicates,]
-rm(list=c('homo_eggnog','homo_geneIDs','human_duplicates','eggnog_duplicates'))
-
-
-########mouse#######
-
-mus_geneIDs = read.table('ensmbl_mus_geneIDs.txt', stringsAsFactors = F, header=F, sep = "\t")
-mus_geneIDs = mus_geneIDs[2:nrow(mus_geneIDs),]
-colnames(mus_geneIDs) = c("mouse",'protein')
-mus_eggnog = read.table('listEggNOG_Mus_musculus_OneToOne_20Mar2017.txt', header=F, stringsAsFactors = F, sep="\t")
-colnames(mus_eggnog) = c("protein",'eggnog')
-mus = merge(mus_geneIDs,mus_eggnog,by='protein', all=F)
-mus = mus[,2:3]
-mus$cat = paste(mus$mouse,mus$eggnog,sep="_")
-mus = mus[!duplicated(mus$cat),]
-mus = mus[,1:2]
-mouse_duplicates = unique(mus[duplicated(mus$mouse),1])
-eggnog_duplicates = unique(mus[duplicated(mus$eggnog),2])
-mus = mus[!mus$mouse %in% mouse_duplicates,]
-mus = mus[!mus$eggnog %in% eggnog_duplicates,]
-rm(list=c('mus_eggnog','mus_geneIDs','mouse_duplicates','eggnog_duplicates'))
-
-############turtle###########
-chrysemys = read.table('listEggNOG_cpi_OneToOne_20Mar2017.txt', header=F, stringsAsFactors = F, sep="\t")
-colnames(chrysemys) = c("turtle",'eggnog')
-chrysemys$cat = paste(chrysemys$turtle,chrysemys$eggnog,sep="_")
-chrysemys = chrysemys[!duplicated(chrysemys$cat),]
-chrysemys = chrysemys[,1:2]
-turtle_duplicates = unique(chrysemys[duplicated(chrysemys$turtle),1])
-chrysemys = chrysemys[!chrysemys$turtle %in% turtle_duplicates,]
-
-#only want to remove LOC duplicates!!!!
-eggnog_duplicates = unique(chrysemys[duplicated(chrysemys$eggnog),2])
-chrysemys_LOC = chrysemys[grep("^LOC",chrysemys$turtle),]
-chrysemys_nonLOC = chrysemys[!chrysemys$turtle %in% chrysemys_LOC$turtle,]
-#remove eggnog_duplicates from LOC list
-chrysemys_LOC = chrysemys_LOC[!chrysemys_LOC$eggnog %in% eggnog_duplicates,]
-#remove nonLOC eggnog duplicates from nonLOC list
-eggnog_duplicates_nonLOC = unique(chrysemys_nonLOC[duplicated(chrysemys_nonLOC$eggnog),2])
-chrysemys_nonLOC = chrysemys_nonLOC[!chrysemys_nonLOC$eggnog %in% eggnog_duplicates_nonLOC,]
-chrysemys = rbind(chrysemys_nonLOC,chrysemys_LOC)
-rm(list=c('chrysemys_LOC','chrysemys_nonLOC','eggnog_duplicates','turtle_duplicates', 'eggnog_duplicates_nonLOC'))
-
-#############lizard###############
-pogona = read.table('listEggNOG_pvit_OneToOne_20Mar2017.txt', header=F, stringsAsFactors = F, sep="\t")
-colnames(pogona) = c("lizard",'eggnog')
-pogona$cat = paste(pogona$lizard,pogona$eggnog,sep="_")
-pogona = pogona[!duplicated(pogona$cat),]
-pogona = pogona[,1:2]
-lizard_duplicates = unique(pogona[duplicated(pogona$lizard),1])
-pogona = pogona[!pogona$lizard %in%lizard_duplicates,]
-
-#only want to remove LOC duplicates!!!!
-eggnog_duplicates = unique(pogona[duplicated(pogona$eggnog),2])
-pogona_LOC = pogona[grep("^LOC",pogona$lizard),]
-pogona_nonLOC = pogona[!pogona$lizard %in% pogona_LOC$lizard,]
-#remove eggnog_duplicates from LOC list
-pogona_LOC = pogona_LOC[!pogona_LOC$eggnog %in% eggnog_duplicates,]
-#remove nonLOC eggnog duplicates from nonLOC list
-eggnog_duplicates_nonLOC = unique(pogona_nonLOC[duplicated(pogona_nonLOC$eggnog),2])
-pogona_nonLOC = pogona_nonLOC[!pogona_nonLOC$eggnog %in% eggnog_duplicates_nonLOC,]
-pogona = rbind(pogona_nonLOC,pogona_LOC)
-rm(list=c('pogona_nonLOC','pogona_LOC','eggnog_duplicates','lizard_duplicates', 'eggnog_duplicates_nonLOC'))
-#add in SST
-pogona = rbind(c('SST','SST'),pogona)
-
-
-####Save tables######
-
-write.table(chrysemys,file="chrysemys_eggnog_pruned.txt",quote=F,sep='\t',row.names=F,col.names=T)
-write.table(gallus,file="gallus_eggnog_pruned.txt",quote=F,sep='\t',row.names=F,col.names=T)
-write.table(homo,file="homo_eggnog_pruned.txt",quote=F,sep='\t',row.names=F,col.names=T)
-write.table(mus,file="mus_eggnog_pruned.txt",quote=F,sep='\t',row.names=F,col.names=T)
-write.table(pogona,file="pogona_eggnog_pruned.txt",quote=F,sep='\t',row.names=F,col.names=T)
-
-#######merge#########
-
-eggnog_mastertable = merge(chrysemys,pogona,by='eggnog', all = F)
-eggnog_mastertable = merge(eggnog_mastertable,gallus, by='eggnog',all=F)
-eggnog_mastertable = merge(eggnog_mastertable,homo, by='eggnog',all=F)
-eggnog_mastertable = merge(eggnog_mastertable,mus, by='eggnog',all=F)
-write.table(eggnog_mastertable,file="eggnog_all_species.txt",quote=F,sep='\t',row.names=F,col.names=T)
diff --git a/Comparative_analysis/someplots.R b/Comparative_analysis/someplots.R
deleted file mode 100644
index 4697228..0000000
--- a/Comparative_analysis/someplots.R
+++ /dev/null
@@ -1,41 +0,0 @@
-#reorder matrix based on correlations within a species
-#corr.list is output from SpIntPermute function!!!
-
-ReorderCorrelation = function(corr.list,nCells_Sp1){
-     Sp1 = corr.list[[1]][1:nCells_Sp1,1:nCells_Sp1]
-     Sp2 = corr.list[[1]][(nCells_Sp1+1):nrow(corr.list[[1]]),(nCells_Sp1+1):nrow(corr.list[[1]])]
-     Sp1.hclust = hclust(dist(Sp1))
-     Sp1.order = Sp1.hclust$order
-     Sp2.hclust = hclust(dist(Sp2))
-     Sp2.order = Sp2.hclust$order
-     Sp2.order = nCells_Sp1 + Sp2.order
-     matrix.order = c(Sp1.order,Sp2.order)
-     
-     for (i in 1:3){
-          reorder.table = cbind(corr.list[[i]],matrix.order)
-          reorder.table = reorder.table[order(reorder.table[,ncol(reorder.table)]),]
-          reorder.table = reorder.table[,1:(ncol(reorder.table)-1)]
-          reorder.table = rbind(reorder.table,matrix.order)
-          reorder.table = reorder.table[,order(reorder.table[nrow(reorder.table),])]
-          reorder.table = reorder.table[1:(nrow(reorder.table)-1),]
-          corr.list[[i]] = reorder.table
-          rm(reorder.table)
-     }
-     return(corr.list)
-}
-
-#plot using Corrplot
-
-library(corrplot)
-
-corr.coeff.table = b[[1]][1:9,10:nrow(b[[1]])]
-p.matrix = b[[3]][1:9,10:nrow(b[[3]])]
-
-
-pdf('c.pdf')
-corr.coeff.table = corr.list[[1]]
-p.matrix = corr.list[[3]]
-
-bwr = colorRampPalette(c('darkblue','white','darkred'))
-corrplot(corr.coeff.table, order="original",tl.pos="lt", method="color", tl.col="black",cl.lim=c(min(corr.coeff.table),max(corr.coeff.table)), is.corr=F,tl.cex=0.7, sig.level=-0.05,insig="pch", pch=19, pch.cex=0.3,pch.col="black",p.mat=-p.matrix, col=bwr(100))
-dev.off()
diff --git a/Data_plots/TY_Plotting.R b/Data_plots/TY_Plotting.R
deleted file mode 100644
index 41e4d61..0000000
--- a/Data_plots/TY_Plotting.R
+++ /dev/null
@@ -1,150 +0,0 @@
-#######Violin Plots#######
-
-TYvPlot = function(seurat.object, gene,ident.use){
-     data.use=FetchData(seurat.object,gene)
-     data.use = data.frame(data.use)
-     data.use$identity = as.factor(seurat.object@data.info[,ident.use])
-     colnames(data.use) = c('expression','identity')
-     ggplot(data.use,aes(identity,expression)) + geom_violin(scale="width",adjust=1,trim=TRUE,aes(fill=identity)) + ylab(gene) +xlab(NULL) + theme(legend.position="none",axis.text.y = element_text(size=5),axis.title.y = element_text(size=10), axis.text.x = element_text(size=7))
-}
-
-TYvPlot_exp = function(seurat.object, gene,ident.use){
-     data.use= FetchData(seurat.object,gene)
-     data.use= expm1(data.use)
-     max.title = substr(as.character(max(data.use)),1,5)
-     data.use = data.use/max(data.use)
-     data.use = data.frame(data.use)
-     data.use$identity = as.factor(seurat.object@data.info[,ident.use])
-     colnames(data.use) = c('expression','identity')
-     ggplot(data.use,aes(identity,expression)) + geom_violin(scale="width",adjust=1,trim=TRUE,aes(fill=identity)) + labs(title = max.title, y = gene, x = NULL) + theme(legend.position="none",axis.text.y = element_blank(),axis.title.y = element_text(size=6), axis.ticks.x=element_blank(), axis.text.x = element_blank(),plot.title = element_text(size=4))
-}
-
-TYvPlot_exp_colors = function(seurat.object, gene,ident.use, colors.to.use){
-     data.use= FetchData(seurat.object,gene)
-     data.use= expm1(data.use)
-     max.title = substr(as.character(max(data.use)),1,5)
-     data.use = data.use/max(data.use)
-     data.use = data.frame(data.use)
-     data.use$identity = as.factor(seurat.object@data.info[,ident.use])
-     colnames(data.use) = c('expression','identity')
-     ggplot(data.use,aes(identity,expression)) + scale_fill_manual(values=colors.to.use) + geom_violin(scale="width",adjust=1,trim=TRUE,aes(fill=identity)) + labs(title = max.title, y = gene, x = NULL) + theme(legend.position="none",axis.text.y = element_blank(),axis.title.y = element_text(size=6), axis.ticks.x=element_blank(), axis.text.x = element_blank(),plot.title = element_text(size=4))
-}
-
-postscript("turtle_violin1.eps", horizontal = FALSE, onefile = FALSE, paper = "special",colormode="rgb", height = 4, width = 7)
-genes.to.plot = c('SYT1','SNAP25','CPLX1','SLC17A7','CAMK2A','GAD1','GAD2','SOX4','SOX11','GFAP','SOX9','AIF1','C1QC','OLIG1','OLIG2','PLP1','MBP','PDGFRA','DCN','LUM','IGJ',"MZB1")
-ident.use = 'cell_type'
-PlotList = lapply(genes.to.plot, function(x) TYvPlot_exp(turtle,x,ident.use)) 
-MultiPlotList(PlotList,cols = 3)
-dev.off()
-
-
-
-#plot subset of cells
-TYvPlot_exp = function(seurat.object, gene,ident.use, cell.classes.to.use, colors.to.use){
-     data.use= FetchData(seurat.object,gene)
-     data.use= expm1(data.use)
-     data.use = data.frame(data.use)
-     data.use$identity = as.factor(seurat.object@data.info[,ident.use])
-     data.use = data.use[data.use$identity %in% cell.classes.to.use,]
-     max.title = substr(as.character(max(data.use[,1])),1,5)
-     data.use[,1] = data.use[,1]/max(data.use[,1])
-     colnames(data.use) = c('expression','identity')
-     ggplot(data.use,aes(identity,expression)) + scale_fill_manual(values=colors.to.use) + geom_violin(scale="width",adjust=1,trim=TRUE,aes(fill=identity)) + labs(title = max.title, y = gene, x = NULL) + theme(legend.position="none",axis.text.y = element_blank(),axis.title.y = element_text(size=6), axis.ticks.x=element_blank(), axis.text.x = element_blank(),plot.title = element_text(size=4))
-}
-
-#example of code to plot
-postscript("turtle_MCDMC_violin.eps", horizontal = FALSE, onefile = FALSE, paper = "special",colormode="cmyk", height = 4, width = 7)
-genes.to.plot = c('ZBTB20','PROX1','PENK','LOC101935936','COCH','NTS','ID2','AVP','BDNF','NPB','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20','ZBTB20')
-ident.use = 'tree.ident'
-colors.to.use = gg_color_hue(8)
-cell.classes.to.use = as.numeric(levels(neur.merged.f@data.info$tree.ident)[2:9])
-PlotList = lapply(genes.to.plot, function(x) TYvPlot_exp(neur.merged.f,x,ident.use,cell.classes.to.use,colors.to.use)) 
-MultiPlotList(PlotList,cols = 3)
-dev.off()
-
-
-#Plotting correlations with significant correlations indicated with a dot
-library(corrplot)
-
-postscript("tm_all_compare1A.eps", horizontal = FALSE, onefile = FALSE, paper = "special",colormodel="rgb", height = 6, width = 6)
-corr.coeff.table = tm_all_compare1A[[1]][1:14,15:21]
-p.matrix = tm_all_compare1A[[3]][1:14,15:21]
-corrplot(corr.coeff.table, order="original",tl.pos="lt", method="color", tl.col="black",cl.lim=c(min(corr.coeff.table),max(corr.coeff.table)), is.corr=F,tl.cex=0.7, sig.level=-0.05,insig="pch", pch=19, pch.cex=0.3,pch.col="black",p.mat=-p.matrix, col=bwr(100))
-dev.off()
-
-
-
-#heatmap
-#plot as png to reduce size
-#plot with histogram, otherwise colors look odd...
-     
-png('lizard_heatmap.png', width=2000, height=2000)
-DoHeatmap(lizNeuron,genes.use=rownames(liz.markers),slim.col.label=T,order.by.ident=T,draw.line=F, col.use=colorRampPalette(c('black','white'))(50),remove.key=T)
-dev.off()
-     
-     
-     
-
-
-
-
-#######to replicate ggplot2 pallet#######
-
-gg_color_hue <- function(n) {
-     hues = seq(15, 375, length = n + 1)
-     hcl(h = hues, l = 65, c = 100)[1:n]
-}
-
-#use option colors.use = in Seurat TSNEPlot to change colors in plot
-     
-#alternate color scheme, no gray
-colorRampSet1 = colorRampPalette(brewer.pal(8,'Set1'))
-#alternate color scheme, no gray, no yellow
-set2 = brewer.pal(8,'Set1')
-set2 = set2[!(set2 %in% c("#FFFF33"))]
-colorRampSet2 = colorRampPalette(set2)
-#alternate color scheme, no gray, replace with darker yellow
-set3 = brewer.pal(8,'Set1')
-set3[6] = "#FEC611"
-colorRampSet3 = colorRampPalette(set3)
-     
-#heatmap scaled for subset of cells
-#Hack of Seurat DoHeatmap Function
-     
-DoScaledHeatmap = function(object,cells.use=NULL,genes.use=NULL,disp.min=-2.5,disp.max=2.5,draw.line=TRUE,do.return=FALSE,order.by.ident=TRUE,col.use=pyCols,slim.col.label=FALSE,group.by=NULL,cex.col=NULL,...) {
-            genes.use=ainb(genes.use,rownames(object@scale.data))
-            cells.use=ainb(cells.use,object@cell.names)
-            cells.ident=object@ident[cells.use]
-            if (!is.null(group.by)) cells.ident=factor(FetchData(object,group.by)[,1])
-            cells.ident=factor(cells.ident,labels = ainb(levels(cells.ident),cells.ident))
-            if (order.by.ident) {
-              cells.use=cells.use[order(cells.ident)]
-            }
-            data.use=as.matrix(object@data[genes.use,cells.use])
-            data.use = t(scale(t(data.use), center = T, scale = T))
-            data.use=minmax(data.use,min=disp.min,max=disp.max)
-                 
-            vline.use=NULL;
-            colsep.use=NULL
-            
-            if (draw.line) {
-              colsep.use=cumsum(table(cells.ident))
-            }
-            if(slim.col.label && order.by.ident) {
-              col.lab=rep("",length(cells.use))
-              col.lab[round(cumsum(table(cells.ident))-table(cells.ident)/2)+1]=levels(cells.ident)
-              if(is.null(cex.col)){cex.col = 0.2+1/log10(length(unique(cells.ident)))}
-              heatmap.2(data.use,Rowv=F,Colv=F,trace = "none",col=col.use,colsep = colsep.use,labCol=col.lab,cexCol=cex.col,...)
-            }
-            else if (slim.col.label){
-              col.lab=rep("",length(cells.use))
-              if(is.null(cex.col)){cex.col = 0.2+1/log10(length(unique(cells.ident)))}
-              heatmap.2(data.use,Rowv=F,Colv=F,trace = "none",col=col.use,colsep = colsep.use,labCol=col.lab,cexCol=cex.col,...)
-            }
-            else {
-              heatmap.2(data.use,Rowv=F,Colv=F,trace = "none",col=col.use,colsep = colsep.use,...)
-            }
-            if (do.return) {
-              return(data.use)
-            }
-          }
diff --git a/Gene_network_analysis/MeanVarData.R b/Gene_network_analysis/MeanVarData.R
new file mode 100644
index 0000000..bfa4b77
--- /dev/null
+++ b/Gene_network_analysis/MeanVarData.R
@@ -0,0 +1,101 @@
+
+# function adapted from MeanVarPlot (Seurat)
+# Collects results from the calculation of variable genes with different methods 
+# the DM function is from Kolodziejczyk et al Cell Stem Cell 2015
+
+# requires function quantile.trend (below) and
+
+library(zoo)
+
+
+MeanVarData <- function(data, num.bin=20) {
+	fxn.x = expMean
+	expMean=function(x) {
+  		return(log(mean(exp(x)-1)+1))
+		}
+	Mean = function(x) {
+		return(mean(exp(x)-1)+1)
+	}
+	expVar=function(x) {
+  		return(log(var(exp(x)-1)+1))
+		}
+	fxn.y = logVarDivMean
+	expCV2 <- function(x) {
+		return((sd(exp(x)-1)/mean(exp(x)-1))^2)
+		
+	} 
+	logVarDivMean=function(x) return(log(var(exp(x)-1)/mean(exp(x)-1)))
+	DM <- function (data, win.size=50){
+		mean = apply(data, 1, Mean)
+		cv2 = apply(data, 1, expCV2)
+		keep <- mean > 0 & !is.na(cv2) & cv2 > 0
+		mean.expr <- log10(mean[keep])
+    	cv2.expr <- log10(cv2[keep])
+		# mean.expr <- apply(data, 1, function(x) log10(expMean(x)))
+		# cv2.expr <- apply(data, 1, function(x) log10(expCV2(x)))
+		qloess <- quantile.trend(cv2.expr, mean.expr, quant=.5, size=win.size, overlap=.5)
+    	dm.out <- cv2.expr - approx(x=qloess$x, y=qloess$y, xout=mean.expr, rule=2)$y
+    	DM <- rep(NA_real_, length(keep))
+    	DM[keep]  <- dm.out
+    	names(DM) <- rownames(data)
+    	return(DM)		
+	}
+	data.x=rep(0,length(genes.use)); names(data.x)=genes.use; data.y=data.x; data.norm.y=data.x; data.cv2 = data.x; data.DM = data.x
+	bin.size <- 1000
+	genes.use <- rownames(data)
+               max.bin <- floor(length(genes.use)/bin.size) + 1
+                cat("Calculating gene dispersion", file = stderr())
+                pb <- txtProgressBar(min = 0, max = max.bin, style = 3, file = stderr())
+                for(i in 1:max.bin) {
+                  my.inds <- ((bin.size * (i - 1)):(bin.size * i - 1))+1
+                  my.inds <- my.inds[my.inds <= length(genes.use)]
+                  genes.iter=genes.use[my.inds]; data.iter=data[genes.iter,, drop = F]
+                  data.x[genes.iter]=apply(data.iter,1,fxn.x); data.y[genes.iter]=apply(data.iter,1,fxn.y)
+                  data.cv2[genes.iter] = apply(data.iter,1, expCV2)
+                  # data.DM[genes.iter] = apply(data.iter,1, DM)
+                  setTxtProgressBar(pb, i)  
+                }
+                close(pb)              
+                data.y[is.na(data.y)]=0
+                data.x[is.na(data.x)]=0               
+                data_x_bin=cut(data.x,num.bin)
+                names(data_x_bin)=names(data.x)
+                mean_y=tapply(data.y,data_x_bin,mean)
+                sd_y=tapply(data.y,data_x_bin,sd)
+                data.norm.y=(data.y-mean_y[as.numeric(data_x_bin)])/sd_y[as.numeric(data_x_bin)]               
+                data.norm.y[is.na(data.norm.y)]=0
+                names(data.norm.y)=names(data.x)  
+                data.DM = DM(data)       
+                mv.df=data.frame(data.x,data.y,data.norm.y, data.cv2, data.DM)
+                rownames(mv.df)=rownames(data)
+                return(mv.df)
+         }
+
+
+
+
+# function from Kolodziejczyk et al Cell Stem Cell 2015 and Aaron Lun
+
+quantile.trend <- function(y, x = NULL, nsplits=NULL, size=20, overlap=NULL, distance=NULL, quant=0.95, alignment="center", FUN=function(x) { quantile(x, quant) }, ...)
+{
+    if(!is.null(nsplits)) {
+        size <- ceiling(length(y)/nsplits)
+    }
+    if (is.null(distance)) { 
+        distance <- size
+    }
+    if (!is.null(overlap)) { 
+
+        distance <- size * (1-overlap)
+    }
+    if(is.null(x)) { 
+        x <- index(y)
+    } 
+    zoo.y <- zoo(x = y, order.by = x)
+    new.y <- rollapply(zoo.y, width = size, FUN=FUN, by=distance, align=alignment) 
+    new.x <- attributes(new.y)$index  
+    return(list(y = new.y, x = new.x))
+}
+
+
+
diff --git a/Gene_network_analysis/MeanVarData_function.R b/Gene_network_analysis/MeanVarData_function.R
deleted file mode 100644
index 2530b98..0000000
--- a/Gene_network_analysis/MeanVarData_function.R
+++ /dev/null
@@ -1,38 +0,0 @@
-MeanVarData <- function(data) {
-	num.bin=20
-	fxn.x = expMean
-	expMean=function(x) {
-  		return(log(mean(exp(x)-1)+1))
-		}
-	fxn.y = logVarDivMean
-	logVarDivMean=function(x) return(log(var(exp(x)-1)/mean(exp(x)-1)))
-	data.x=rep(0,length(genes.use)); names(data.x)=genes.use; data.y=data.x; data.norm.y=data.x;
-	bin.size <- 1000
-	genes.use <- rownames(data)
-               max.bin <- floor(length(genes.use)/bin.size) + 1
-                cat("Calculating gene dispersion", file = stderr())
-                pb <- txtProgressBar(min = 0, max = max.bin, style = 3, file = stderr())
-                for(i in 1:max.bin) {
-                  my.inds <- ((bin.size * (i - 1)):(bin.size * i - 1))+1
-                  my.inds <- my.inds[my.inds <= length(genes.use)]
-                  genes.iter=genes.use[my.inds]; 
-                  data.iter=data[genes.iter,, drop = F]
-                  data.x[genes.iter]=apply(data.iter,1,fxn.x); 
-                  data.y[genes.iter]=apply(data.iter,1,fxn.y) 
-                  setTxtProgressBar(pb, i)  
-                }
-                close(pb)
-                data.y[is.na(data.y)]=0
-                data.x[is.na(data.x)]=0               
-                data_x_bin=cut(data.x,num.bin)
-                names(data_x_bin)=names(data.x)
-                mean_y=tapply(data.y,data_x_bin,mean)
-                sd_y=tapply(data.y,data_x_bin,sd)
-                data.norm.y=(data.y-mean_y[as.numeric(data_x_bin)])/sd_y[as.numeric(data_x_bin)]               
-                data.norm.y[is.na(data.norm.y)]=0
-                names(data.norm.y)=names(data.x)               
-                mv.df=data.frame(data.x,data.y,data.norm.y)
-                rownames(mv.df)=rownames(data)
-                return(mv.df)
-         }
-         
diff --git a/Gene_network_analysis/comparative_wgcna.R b/Gene_network_analysis/comparative_wgcna.R
new file mode 100644
index 0000000..20e0094
--- /dev/null
+++ b/Gene_network_analysis/comparative_wgcna.R
@@ -0,0 +1,598 @@
+
+# comparative WGCNA - code adapted from WGCNA tutorials
+# comparison of mouse and turtle interneurons. 
+# the same pipeline was used to compare other cell types (see Supplementary Material for analysis parameters)
+
+library(WGCNA)
+
+load("./turtle.neurons.Robj")
+load("./tasic.Robj") # mouse data from Tasic et al 2016
+source("./MeanVarData.R")
+
+# requires files with gene functional annotations from eggnog
+
+
+##### prepare interneuron expression tables #####
+#################################################
+
+# take turtle interneuron expression data
+turtle_clusters <- c(paste0("i0",seq(7,9,by=1)),paste0("i", seq(10,18,by=1))) # select GABAergic clusters (MGE and CGE-derived)
+turtle_ids <- names(turtle.neurons@ident[turtle.neurons@ident %in% turtle_clusters])
+turtle_all.data <- turtle.neurons@data[,turtle_ids]
+turtle_all.data <- Matrix(turtle_all.data, sparse=F)
+turtle_all.data <- as.matrix(turtle_all.data)
+# 21167   368
+rownames(turtle_all.data) = toupper(rownames(turtle_all.data))
+
+# take mouse interneuron expression data from the Tasic dataset
+tasic_clusters <- c("f01","f02","f03","f04","f05","f06","f07","f08","f09","f10","f11","f12","f13","f14","f15","f16","f17","f18","f19","f20","f21","f22","f23")
+tasic_all.ids <- names(tasic@ident[tasic@ident %in% tasic_clusters])
+
+tasic_ids <- tasic_all.ids
+tasic_all.data <- tasic@data[,tasic_ids]
+tasic_all.data <- Matrix(tasic_all.data, sparse=F)
+tasic_all.data <- as.matrix(tasic_all.data)
+# 19625   761
+rownames(tasic_all.data) = toupper(rownames(tasic_all.data))
+
+
+# create pseudocells
+##################################################
+
+pseudocell.size = 5
+
+turtle_new_ids_list = list()
+for (i in 1:length(levels(turtle_ident))) {
+	cluster_id = levels(turtle_ident)[i]
+	cluster_cells <- names(turtle_ident[turtle_ident == cluster_id])
+	cluster_size <- length(cluster_cells)		
+	pseudo_ids <- floor(seq_along(cluster_cells)/pseudocell.size)
+	pseudo_ids <- paste0(cluster_id, "_", pseudo_ids)
+	names(pseudo_ids) <- sample(cluster_cells)	
+	turtle_new_ids_list[[i]] <- pseudo_ids		
+	}	
+turtle_new_ids <- unlist(turtle_new_ids_list)
+turtle_new_ids <- as.factor(turtle_new_ids)
+turtle_new_ids_length <- table(turtle_new_ids)
+turtle_short_ids <- names(turtle_new_ids_length[turtle_new_ids_length<3])
+turtle_all_ids <- levels(turtle_new_ids)  # 514
+turtle_good_ids <- turtle_all_ids[!turtle_all_ids %in% turtle_short_ids]  
+
+turtle_cells <- names(turtle_new_ids)[turtle_new_ids %in% turtle_good_ids]
+turtle_final_ids <- turtle_new_ids[turtle_cells]
+turtle_final_ids <- droplevels(turtle_final_ids)	 
+# 359 cells, 75 pseudocells, 1 with 3 cells, 14 with 4 cells and 60 with 5 cells 
+
+tasic_new_ids_list = list()
+for (i in 1:length(levels(tasic_ident))) {
+	cluster_id = levels(tasic_ident)[i]
+	cluster_cells <- names(tasic_ident[tasic_ident == cluster_id])
+	cluster_size <- length(cluster_cells)		
+	pseudo_ids <- floor(seq_along(cluster_cells)/pseudocell.size)
+	pseudo_ids <- paste0(cluster_id, "_", pseudo_ids)
+	names(pseudo_ids) <- sample(cluster_cells)	
+	tasic_new_ids_list[[i]] <- pseudo_ids		
+	}	
+tasic_new_ids <- unlist(tasic_new_ids_list)
+tasic_new_ids <- as.factor(tasic_new_ids)
+tasic_new_ids_length <- table(tasic_new_ids)
+tasic_short_ids <- names(tasic_new_ids_length[tasic_new_ids_length<3])
+tasic_all_ids <- levels(tasic_new_ids)  # 514
+tasic_good_ids <- tasic_all_ids[!tasic_all_ids %in% tasic_short_ids]  
+
+tasic_cells <- names(tasic_new_ids)[tasic_new_ids %in% tasic_good_ids]
+tasic_final_ids <- tasic_new_ids[tasic_cells]
+tasic_final_ids <- droplevels(tasic_final_ids)	 
+# 352 cells, 76 pseudocells, 28 with 4 cells and 48 with 5 cells 
+
+
+turtle_all.data <- turtle.neurons@data[,turtle_cells]
+turtle_all.data<-Matrix(turtle_all.data, sparse=F)
+turtle_all.data<-as.matrix(turtle_all.data)
+expMean=function(x) {
+  		return(log(mean(exp(x)-1)+1))
+		} 
+turtle_pdata = NULL
+turtle_data_t <- turtle_all.data
+for(i in levels(turtle_final_ids)) {
+              temp.cells= names(turtle_final_ids[turtle_final_ids == i])
+              if (length(temp.cells)==1) data.temp=(turtle_data_t[,temp.cells])
+              if (length(temp.cells)>1) data.temp=apply(turtle_data_t[,temp.cells],1,expMean)
+              turtle_pdata =cbind(turtle_pdata,data.temp)
+              colnames(turtle_pdata)[ncol(turtle_pdata)]=i
+            }
+turtle_all.data <- turtle_pdata
+# 21167    75
+
+tasic_all.data <- tasic@data[,tasic_cells]
+tasic_all.data<-Matrix(tasic_all.data, sparse=F)
+tasic_all.data<-as.matrix(tasic_all.data)
+expMean=function(x) {
+  		return(log(mean(exp(x)-1)+1))
+		} 
+tasic_pdata = NULL
+tasic_data_t <- tasic_all.data
+for(i in levels(tasic_final_ids)) {
+              temp.cells= names(tasic_final_ids[tasic_final_ids == i])
+              if (length(temp.cells)==1) data.temp=(tasic_data_t[,temp.cells])
+              if (length(temp.cells)>1) data.temp=apply(tasic_data_t[,temp.cells],1,expMean)
+              tasic_pdata =cbind(tasic_pdata,data.temp)
+              colnames(tasic_pdata)[ncol(tasic_pdata)]=i
+            }
+tasic_all.data <- tasic_pdata
+# 19625    76
+
+
+# find one-to-one orthologs, replace gene names with one-to-one orthologs names 
+###############################################################################
+
+rownames(turtle_all.data) <- toupper(rownames(turtle_all.data))
+rownames(tasic_all.data) <- toupper(rownames(tasic_all.data))
+
+eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
+eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
+eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
+eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
+genesSpecies1 = rownames(turtle_all.data)
+genesSpecies2 = rownames(tasic_all.data)
+colnames(eggnog1) = eggnog1[1,]
+eggnog1 = eggnog1[2:nrow(eggnog1),]
+colnames(eggnog2) = eggnog2[1,]
+eggnog2 = eggnog2[2:nrow(eggnog2),]
+genesSpecies1 = eggnog1[eggnog1[,1] %in% genesSpecies1,2]
+genesSpecies2 = eggnog2[eggnog2[,1] %in% genesSpecies2,2]
+genes = intersect(genesSpecies1,genesSpecies2)   #11672 genes
+
+genesSpecies1 = eggnog1[eggnog1[,2] %in% genes,1]
+genesSpecies2 = eggnog2[eggnog2[,2] %in% genes,1]
+turtle_all.data <- turtle_all.data[rownames(turtle_all.data) %in% genesSpecies1,]
+tasic_all.data <- tasic_all.data[rownames(tasic_all.data) %in% genesSpecies2,]
+
+eggnog1.f <- eggnog1[eggnog1[,1] %in% rownames(turtle_all.data),]
+turtle_all.data <- turtle_all.data[order(match(rownames(turtle_all.data),eggnog1.f[,1] )),]
+all.equal(eggnog1.f[,1], rownames(turtle_all.data))
+rownames(turtle_all.data) <- eggnog1.f[,2]
+
+eggnog2.f <- eggnog2[eggnog2[,1] %in% rownames(tasic_all.data),]
+tasic_all.data <- tasic_all.data[order(match(rownames(tasic_all.data),eggnog2.f[,1] )),]
+all.equal(eggnog2.f[,1], rownames(tasic_all.data))
+rownames(tasic_all.data) <- eggnog2.f[,2]
+
+turtle_all.data <- turtle_all.data[order(rownames(turtle_all.data), decreasing=F),]
+# 11672    75
+tasic_all.data <- tasic_all.data[order(rownames(tasic_all.data), decreasing=F),]
+# 11672    76
+
+
+
+# select genes with high variance in both datasets
+##################################################
+
+# remove non expressed genes.
+turtle_all.means <- rowMeans(turtle_all.data)
+tasic_all.means <- rowMeans(tasic_all.data)
+genes.keep <- intersect(names(turtle_all.means[turtle_all.means>0]), names(tasic_all.means[tasic_all.means>0]))
+# 10734 genes
+genes.keep <- genes.keep[order(genes.keep, decreasing=F)]
+turtle_all.data <- turtle_all.data[genes.keep,]
+tasic_all.data <- tasic_all.data[genes.keep,]
+
+genes.use <- genes.keep
+
+# find variable genes         
+turtle_meanvar <- MeanVarData(turtle_all.data)
+tasic_meanvar <- MeanVarData(tasic_all.data)
+ 
+# remove noisy genes
+turtle_noisy_genes <- NoisyGenes(neur, min.pct=0.4, clusters.use= turtle_clusters)
+tasic_noisy_genes <- NoisyGenes(tasic, min.pct=0.4, clusters.use= tasic_clusters)
+
+NoisyGenes1 = eggnog1[eggnog1[,2] %in% turtle_noisy_genes,1]
+NoisyGenes2 = eggnog2[eggnog2[,2] %in% toupper(tasic_noisy_genes),1]
+
+turtle_meanvar.f <- turtle_meanvar[!rownames(turtle_meanvar) %in% NoisyGenes1, ]   
+tasic_meanvar.f <- tasic_meanvar[!rownames(tasic_meanvar) %in% NoisyGenes2, ]    
+
+# order the filtered table of variable genes by the DM metric 
+turtle_meanvar.f <- turtle_meanvar.f[order(turtle_meanvar.f[,5], decreasing=T),] 
+tasic_meanvar.f <- tasic_meanvar.f[order(tasic_meanvar.f[,5], decreasing=T),] 
+
+turtle_top.genes <- rownames(turtle_meanvar.f)
+turtle_top.genes <- turtle_top.genes[!turtle_top.genes %in% NoisyGenes2]
+tasic_top.genes <- rownames(tasic_meanvar.f)
+tasic_top.genes <- tasic_top.genes[!tasic_top.genes %in% NoisyGenes1]
+
+# take the top 750 variable genes in each dataset
+final.genes <- unique(c(turtle_top.genes[1:750],  tasic_top.genes[1:750]))
+length(final.genes)
+# 1208 genes
+
+
+#################
+#### datasets ###
+#################
+ 
+turtle_data <- t(turtle_all.data[rownames(turtle_all.data) %in% final.genes,])
+tasic_data <- t(tasic_all.data[rownames(tasic_all.data) %in% final.genes,])
+
+
+##### WGCNA turtle #####
+########################
+
+Date = 170904
+name = "turtle_test18.1"
+SubGeneNames=colnames(turtle_data)
+
+powers = c(c(1:18), seq(from = 20, to=40, by=2))
+sft = pickSoftThreshold(turtle_data, powerVector = powers, verbose = 5,networkType = "signed")
+
+pdf(paste0(Date, "_",name, "_soft_threshold.pdf"), width=12)
+par(mfrow = c(1,2));
+cex1 = 0.9
+plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
+     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
+    main = paste("Scale independence"));
+text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
+    labels=powers,cex=cex1,col="red");
+# this line corresponds to using an R^2 cut-off of h
+abline(h=0.90,col="red")
+# Mean connectivity as a function of the soft-thresholding power
+plot(sft$fitIndices[,1], sft$fitIndices[,5],
+    xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
+    main = paste("Mean connectivity"))
+text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
+dev.off()
+
+turtle_softPower = 6
+NetworkType = "signed"
+turtle_adj= adjacency(turtle_data,type = NetworkType, power = turtle_softPower)
+turtle_k <- softConnectivity(turtle_data, type="signed", power=turtle_softPower)
+turtle_k <- turtle_k/max(turtle_k)
+names(turtle_k) <- colnames(turtle_data)
+quantile(turtle_k, probs=seq(0,1,by=0.1))
+ 
+
+##### WGCNA tasic ######
+########################
+
+Date = 170904
+name = "tasic_test18.1"
+SubGeneNames=colnames(tasic_data)
+
+powers = c(c(1:18), seq(from = 20, to=40, by=2))
+sft = pickSoftThreshold(tasic_data, powerVector = powers, verbose = 5,networkType = "signed")
+
+pdf(paste0(Date, "_",name, "_soft_threshold.pdf"), width=12)
+par(mfrow = c(1,2));
+cex1 = 0.9
+plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
+     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
+    main = paste("Scale independence"));
+text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
+    labels=powers,cex=cex1,col="red");
+# this line corresponds to using an R^2 cut-off of h
+abline(h=0.90,col="red")
+# Mean connectivity as a function of the soft-thresholding power
+plot(sft$fitIndices[,1], sft$fitIndices[,5],
+    xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
+    main = paste("Mean connectivity"))
+text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
+dev.off()
+
+tasic_softPower = 7
+NetworkType = "signed"
+tasic_adj= adjacency(tasic_data,type = NetworkType, power = tasic_softPower)
+tasic_k <- softConnectivity(tasic_data, type="signed", power = tasic_softPower)
+tasic_k <- tasic_k/max(tasic_k)
+names(tasic_k) <- colnames(tasic_data)
+quantile(tasic_k, probs=seq(0,1,by=0.1))
+
+
+######## find modules turtle ######
+###################################
+
+Date = 170904
+name = "turtle_test18.1"
+
+turtle_TOM=TOMsimilarityFromExpr(turtle_data,networkType = NetworkType, TOMType = "signed", power = turtle_softPower)
+colnames(turtle_TOM) =rownames(turtle_TOM) =SubGeneNames
+# save(TOM, file="turtle_all_highGenes_TOMsigned.Robj")
+turtle_dissTOM = 1-turtle_TOM
+turtle_geneTree = flashClust(as.dist(turtle_dissTOM),method="average")
+
+# pdf(paste0(Date,"_",test,"_dendrogram_",NetworkType,"_power", turtle_softPower,".pdf", sep=""), width=12)
+# plot(turtle_geneTree, xlab="", sub="",cex=0.3, labels=F)
+# dev.off()
+
+turtle_minModuleSize = 30
+turtle_dynamicMods = cutreeDynamic(dendro = turtle_geneTree, distM = turtle_dissTOM, method="hybrid", deepSplit = 4, pamRespectsDendro = FALSE, minClusterSize = turtle_minModuleSize, pamStage=T)
+turtle_dynamicColors = labels2colors(turtle_dynamicMods)
+table(turtle_dynamicColors)
+
+pdf(paste0(Date,"_",name,"_modules_",NetworkType, "_moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=12)
+plotDendroAndColors(turtle_geneTree, turtle_dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors")
+dev.off()
+
+# diag(turtle_dissTOM) = NA
+
+
+# module merging
+
+turtle_MEList = moduleEigengenes(turtle_data, colors = turtle_dynamicColors)
+turtle_MEs = turtle_MEList$eigengenes
+
+turtle_MEDiss = 1-cor(turtle_MEs);
+turtle_METree = hclust(as.dist(turtle_MEDiss), method = "average");
+
+pdf(paste0(Date,"_",name,"_hclust_modules_",NetworkType, "moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=12)
+plot(turtle_METree, main = "Clustering of module eigengenes",
+xlab = "", sub = "")
+dev.off()
+
+turtle_MEDissThres = 0.5
+# Call an automatic merging function
+turtle_merge = mergeCloseModules(turtle_data, turtle_dynamicColors, cutHeight = turtle_MEDissThres, verbose = 3)
+# The merged module colors
+turtle_mergedColors = turtle_merge$colors;
+# Eigengenes of the new merged modules:
+turtle_mergedMEs = turtle_merge$newMEs; 
+
+pdf(paste0(Date,"_",name,"_merged_modules_",NetworkType, "moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=12)
+plotDendroAndColors(turtle_geneTree, cbind(turtle_dynamicColors, turtle_mergedColors),
+c("Dynamic Tree Cut", "Merged dynamic"),
+dendroLabels = FALSE, hang = 0.03,
+addGuide = TRUE, guideHang = 0.05)
+dev.off()
+
+turtle_dynamicColors <- turtle_mergedColors
+
+
+######## find modules tasic ######
+###################################
+
+Date = 170904
+name = "tasic_test18.1"
+
+tasic_TOM=TOMsimilarityFromExpr(tasic_data,networkType = NetworkType, TOMType = "signed", power = tasic_softPower)
+colnames(tasic_TOM) =rownames(tasic_TOM) =SubGeneNames
+# save(TOM, file="turtle_all_highGenes_TOMsigned.Robj")
+tasic_dissTOM=1-tasic_TOM
+tasic_geneTree = flashClust(as.dist(tasic_dissTOM),method="average")
+
+# pdf(paste0(Date,"_",test,"_dendrogram_",NetworkType,"_power", softPower,".pdf", sep=""), width=12)
+# plot(tasic_geneTree, xlab="", sub="",cex=0.3, labels=F)
+# dev.off()
+
+tasic_minModuleSize = 30
+tasic_dynamicMods = cutreeDynamic(dendro = tasic_geneTree, distM = tasic_dissTOM, method="hybrid", deepSplit = 4, pamRespectsDendro = FALSE, minClusterSize = tasic_minModuleSize, pamStage=T)
+tasic_dynamicColors = labels2colors(tasic_dynamicMods)
+table(tasic_dynamicColors)
+
+pdf(paste0(Date,"_",name,"_modules_",NetworkType, "moduleSize",tasic_minModuleSize,"_power", tasic_softPower,".pdf", sep=""), width=12)
+plotDendroAndColors(tasic_geneTree, tasic_dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors")
+dev.off()
+
+# diag(tasic_dissTOM) = NA
+
+
+# module merging
+
+tasic_MEList = moduleEigengenes(tasic_data, colors = tasic_dynamicColors)
+tasic_MEs = tasic_MEList$eigengenes
+
+tasic_MEDiss = 1-cor(tasic_MEs);
+tasic_METree = hclust(as.dist(tasic_MEDiss), method = "average");
+
+pdf(paste0(Date,"_",name,"_hclust_modules_",NetworkType, "moduleSize",tasic_minModuleSize,"_power", tasic_softPower,".pdf", sep=""), width=12)
+plot(tasic_METree, main = "Clustering of module eigengenes",
+xlab = "", sub = "")
+dev.off()
+
+tasic_MEDissThres = 0.5
+# Call an automatic merging function
+tasic_merge = mergeCloseModules(tasic_data, tasic_dynamicColors, cutHeight = tasic_MEDissThres, verbose = 3)
+# The merged module colors
+tasic_mergedColors = tasic_merge$colors;
+# Eigengenes of the new merged modules:
+tasic_mergedMEs = tasic_merge$newMEs; 
+
+pdf(paste0(Date,"_",name,"_merged_modules_",NetworkType, "moduleSize",tasic_minModuleSize,"_power", tasic_softPower,".pdf", sep=""), width=12)
+plotDendroAndColors(tasic_geneTree, cbind(tasic_dynamicColors, tasic_mergedColors),
+c("Dynamic Tree Cut", "Merged dynamic"),
+dendroLabels = FALSE, hang = 0.03,
+addGuide = TRUE, guideHang = 0.05)
+dev.off()
+
+tasic_dynamicColors <- tasic_mergedColors
+
+
+######## module eigengenes turtle ######
+########################################
+
+Date = 170904
+name = "turtle_test18.1"
+
+turtle_MEList = moduleEigengenes(turtle_data, colors = turtle_dynamicColors)
+turtle_MEs = turtle_MEList$eigengenes
+
+turtle_MEeigengenes <- as.matrix(turtle_MEList$eigengenes)
+rownames(turtle_MEeigengenes) <- rownames(turtle_data)
+turtle_MEeigengenes <- turtle_MEeigengenes[order(match(rownames(turtle_MEeigengenes), names(turtle_ident))),]
+turtle_MEeigengenes <- turtle_MEeigengenes[,!colnames(turtle_MEeigengenes) %in% c("MEgrey")]
+
+turtle_tempident <- rownames(turtle_MEeigengenes)
+turtle_tempident <- sapply(turtle_tempident, function(x) substring(x,1,3))
+names(turtle_tempident) <- rownames(turtle_MEeigengenes)
+turtle_tempident <- as.factor(turtle_tempident)
+turtle_tempident <- factor(turtle_tempident, levels(turtle_tempident)[c(8:12,1:4,6,5,7)])
+turtle_tempident <- as.data.frame(turtle_tempident)
+turtle_tempident <- cbind(turtle_tempident, rownames(turtle_tempident))
+colnames(turtle_tempident) <- c("tempident","turtle_id")
+turtle_tempident <- arrange(turtle_tempident, tempident)
+turtle_tempident2 <- as.factor(turtle_tempident[,1])
+names(turtle_tempident2) <- turtle_tempident[,2]
+turtle_tempident <- turtle_tempident2
+
+
+n.clusters = length(table(turtle_tempident))
+plot_colors = colorRampPalette(brewer.pal(9, "Set1"))(n.clusters)
+turtle_plotColors <- mapvalues(turtle_tempident, from = levels(turtle_tempident), to = plot_colors) # plyr package
+turtle_plotColors <- as.character(turtle_plotColors)
+
+turtle_MEeigengenes[turtle_MEeigengenes>0.3] <- NA
+turtle_MEeigengenes <- turtle_MEeigengenes[order(match(rownames(turtle_MEeigengenes), names(turtle_tempident))),]
+turtle_MEeigengenes <- turtle_MEeigengenes[,c("MEbrown","MEturquoise","MEblue")]
+
+
+pdf(paste0(Date,"_",name,"_eigengenes_",NetworkType, "moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=13)
+par(oma=c(4,4,4,10))
+colors <- colorRampPalette(c("#9D8BFF","black", "#FFE000"))(n = 200)
+pal <- colorRampPalette(colors)
+heatmap.2(t(turtle_MEeigengenes), Colv=F, Rowv=F, trace="none",  col=pal, ColSideColors= turtle_plotColors,na.color="green")
+dev.off()
+
+# extract names of genes that belong to the modules
+SubGeneNames=colnames(turtle_data)
+turtle_module_colors= setdiff(unique(turtle_dynamicColors), "grey")
+for (color in turtle_module_colors){
+    module=SubGeneNames[which(turtle_dynamicColors==color)]
+    write.table(module, paste(Date, name, "_module_",color, ".txt",sep=""), sep="\t", row.names=FALSE, col.names=FALSE,quote=FALSE)    
+}
+
+
+
+######## module eigengenes tasic #######
+########################################
+
+Date = 170904
+name = "tasic_test18.1"
+
+tasic_MEList = moduleEigengenes(tasic_data, colors = tasic_dynamicColors)
+tasic_MEs = tasic_MEList$eigengenes
+tasic_MEeigengenes <- as.matrix(tasic_MEList$eigengenes)
+rownames(tasic_MEeigengenes) <- rownames(tasic_data)
+tasic_MEeigengenes <- tasic_MEeigengenes[order(match(rownames(tasic_MEeigengenes), names(tasic_ident))),]
+tasic_MEeigengenes <- tasic_MEeigengenes[,!colnames(tasic_MEeigengenes) %in% c("MEgrey")]
+
+
+tasic_tempident <- rownames(tasic_MEeigengenes)
+tasic_tempident <- sapply(tasic_tempident, function(x) substring(x,1,3))
+names(tasic_tempident) <- rownames(tasic_MEeigengenes)
+tasic_tempident <- as.factor(tasic_tempident)
+tasic_tempident <- factor(tasic_tempident, levels(tasic_tempident)[c(1:5, 10, 6,7,9,8,11,16,12:15,17:23)])
+tasic_tempident <- as.data.frame(tasic_tempident)
+tasic_tempident <- cbind(tasic_tempident, rownames(tasic_tempident))
+colnames(tasic_tempident) <- c("tempident","tasic_id")
+tasic_tempident <- arrange(tasic_tempident, tempident)
+tasic_tempident2 <- as.factor(tasic_tempident[,1])
+names(tasic_tempident2) <- tasic_tempident[,2]
+tasic_tempident <- tasic_tempident2
+
+
+n.clusters = length(table(tasic_tempident))
+plot_colors = colorRampPalette(brewer.pal(9, "Set1"))(n.clusters)
+tasic_plotColors <- mapvalues(tasic_tempident, from = levels(tasic_tempident), to = plot_colors) # plyr package
+tasic_plotColors <- as.character(tasic_plotColors)
+
+tasic_MEeigengenes[tasic_MEeigengenes>0.3] <- NA  
+tasic_MEeigengenes <- tasic_MEeigengenes[order(match(rownames(tasic_MEeigengenes), names(tasic_tempident))),]
+tasic_MEeigengenes <- tasic_MEeigengenes[,c("MEbrown","MEyellow","MEblue")]
+
+pdf(paste0(Date,"_",name,"_eigengenes_",NetworkType, "moduleSize",tasic_minModuleSize,"_power", tasic_softPower,".pdf", sep=""), width=13)
+par(oma=c(4,4,4,10))
+colors <- colorRampPalette(c("#9D8BFF","black", "#FFE000"))(n = 200)
+pal <- colorRampPalette(colors)
+heatmap.2(t(tasic_MEeigengenes), Colv=F, Rowv=F, trace="none",  col=pal, ColSideColors= tasic_plotColors, na.color="green")
+dev.off()
+
+# extract names of genes that belong to the modules
+SubGeneNames=colnames(tasic_data)
+tasic_module_colors= setdiff(unique(tasic_dynamicColors), "grey")
+for (color in tasic_module_colors){
+    module=SubGeneNames[which(tasic_dynamicColors==color)]
+    write.table(module, paste(Date, name, "_module_",color, ".txt",sep=""), sep="\t", row.names=FALSE, col.names=FALSE,quote=FALSE)    
+}
+
+
+
+
+######## compare modules fast ######
+####################################
+
+Date = 170904
+name = "test18"
+
+inNetwork = tasic_k> 0 | turtle_k> 0
+
+colorh = as.character(turtle_dynamicColors)
+colorhALL=rep("grey", length(turtle_k))
+colorhALL[inNetwork]=as.character(colorh)
+colorh2 =as.character(tasic_dynamicColors)
+colorh2ALL=rep("grey", length(tasic_k))
+colorh2ALL[inNetwork]=as.character(colorh2)
+colorTurtle = colorhALL;
+colortasic = colorh2ALL;
+
+nSets = 2
+multiExpr = list()
+multiExpr[[1]] = list(data=turtle_data)
+multiExpr[[2]] = list(data=tasic_data)
+setLabels = c("turtle","tasic")
+names(multiExpr) = setLabels
+
+colorList = list(colorTurtle, colortasic)
+names(colorList) = setLabels
+
+ref=1
+test=2
+
+dendrograms = list();
+for (set in 1:nSets)
+{
+  adj = abs(cor(multiExpr[[set]]$data[, inNetwork], use = "p"))^9;
+  dtom = TOMdist(adj);
+  dendrograms[[set]] = flashClust(as.dist(dtom), method = "a");
+}
+# Get eigengenes
+mes = list()
+for (set in 1:nSets)
+{
+   mes[[set]] = moduleEigengenes(multiExpr[[set]]$data, colorList[[ref]])$eigengenes
+}
+ 
+
+overlap = overlapTable(colorTurtle[inNetwork], colortasic[inNetwork]);
+
+# The numMat will encode color. We use -log of the p value.
+numMat = -log10(overlap$pTable);
+numMat[numMat >50] = 50;
+# Prepare for generating a color-coded plot of the overlap table. The text of the table will consist of
+# counts and corresponding p-values.
+textMat = paste(overlap$countTable, "\n", signif(overlap$pTable, 2));
+dim(textMat) = dim(numMat)
+# Additional information for the plot. These will be used shortly.
+xLabels = paste("M", sort(unique(colorTasic)));
+yLabels = paste("M", sort(unique(colorTurtle)));
+xSymbols = paste(sort(unique(colortasic)), ": ", table(colortasic[inNetwork]), sep = "")
+ySymbols = paste(sort(unique(colorTurtle)), ": ", table(colorTurtle[inNetwork]), sep = "")
+
+pdf(paste0(Date,"interneurons",name,"module_comparison.pdf", sep=""))
+# par(oma=c(10,10,10,10))
+fp=TRUE
+# Plot the overlap table
+fcex = 1.00;
+pcex = 1.0
+fcexl = 1.00;
+pcexl = 1.00;
+par(mar = c(6, 7, 2, 1.0));
+labeledHeatmap(Matrix = numMat,
+xLabels = xLabels, xSymbols = xSymbols,
+yLabels = yLabels, ySymbols = ySymbols,
+colorLabels = TRUE,
+colors = colorRampPalette(c("white","goldenrod1"))(50),
+textMatrix = textMat, cex.text = if (fp) fcex else pcex, setStdMargins = FALSE,
+cex.lab = if (fp) fcexl else pcexl,
+xColorWidth = 0.08,
+main = "Turtle modules (rows) vs. tasic modules (columns)", cex.main = 1.2);
+dev.off()
+
diff --git a/Gene_network_analysis/wgcna_turtle.R b/Gene_network_analysis/wgcna_turtle.R
new file mode 100644
index 0000000..f3d445c
--- /dev/null
+++ b/Gene_network_analysis/wgcna_turtle.R
@@ -0,0 +1,205 @@
+# WGCNA on turtle glutamatergic neurons - code adapted from WGCNA tutorials 
+# DM used as the metric to find variable genes 
+
+library(WGCNA)
+source("./MeanVarData.R")
+
+turtle_clusters <- c(paste0("e0",seq(1,9,by=1)),paste0("e", seq(10,38,by=1)))
+turtle_ids <- names(turtle.neurons@ident[turtle.neurons@ident %in% turtle_clusters])
+
+turtle_ident <- turtle.neurons@ident[turtle_ids]
+turtle_ident <- droplevels(turtle_ident)
+load("/storage/laur/Data_2/ToschesMaria/Single_cell_sequencing/Analysis/wgcna/turtle_only/170621_turtleOnly6_pseudocells_ids.Robj")
+new_ids_length <- table(new_ids)
+short_ids <- names(new_ids_length[new_ids_length<8])
+all_ids <- levels(new_ids)  # 514
+good_ids <- all_ids[!all_ids %in% short_ids]  # 490: 44 with 9 cells and 445 with 10 cells
+
+turtle_cells <- names(new_ids)[new_ids %in% good_ids]
+final_ids <- new_ids[turtle_cells]
+final_ids <- droplevels(final_ids)	#4854
+
+turtle_all.data <- turtle.neurons@data[,turtle_cells]
+turtle_all.data<-Matrix(turtle_all.data, sparse=F)
+turtle_all.data<-as.matrix(turtle_all.data)
+expMean=function(x) {
+  		return(log(mean(exp(x)-1)+1))
+		} 
+
+turtle_pdata = NULL
+turtle_data_t <- turtle_all.data
+for(i in levels(final_ids)) {
+              temp.cells= names(final_ids[final_ids == i])
+              if (length(temp.cells)==1) data.temp=(turtle_data_t[,temp.cells])
+              if (length(temp.cells)>1) data.temp=apply(turtle_data_t[,temp.cells],1,expMean)
+              turtle_pdata =cbind(turtle_pdata,data.temp)
+              colnames(turtle_pdata)[ncol(turtle_pdata)]=i
+            }
+
+
+turtle_all.data <- turtle_pdata
+# 21167   490
+
+
+# select genes with high variance 
+##################################################
+
+Date = 180122
+name = "turtleOnly7.1"
+SubGeneNames=colnames(turtle_data)
+
+# remove non expressed genes.
+genes.use <- rownames(turtle_all.data)
+turtle_meanvar <- MeanVarData(turtle_all.data)
+turtle_noisy_genes <- NoisyGenes(neur, min.pct=0.2, clusters.use= turtle_clusters)
+# find genes in the lower half (50% quantile) of the distribution of expression levels. These are lowly expressed genes and we do not want to carry them on in the analysis (expression range similar to the noisy genes)
+turtle.low <- rownames(turtle_meanvar)[which((turtle_meanvar[,1]<quantile(turtle_meanvar[,1])[[3]]) )]
+turtle_meanvar.f <- turtle_meanvar[!rownames(turtle_meanvar) %in% NoisyGenes1, ]  
+turtle_meanvar.f <- turtle_meanvar[which((turtle_meanvar[,1]>quantile(turtle_meanvar[,1])[[3]]) ), ]   
+turtle_meanvar.f <- turtle_meanvar.f[order(turtle_meanvar.f[,5], decreasing=T),] 
+
+turtle_top.genes <- rownames(turtle_meanvar.f) 
+final.genes <- rownames(turtle_meanvar.f[1:1500,])
+
+turtle_data <- t(turtle_all.data[final.genes,])
+
+
+##### WGCNA turtle #####
+########################
+ 
+Date = 180122
+name = "turtleOnly7.1"
+SubGeneNames=colnames(turtle_data)
+
+powers = c(c(1:18), seq(from = 20, to=40, by=2))
+sft = pickSoftThreshold(turtle_data, powerVector = powers, verbose = 5,networkType = "signed")
+
+pdf(paste0(Date, "_",name, "_soft_threshold.pdf"), width=12)
+par(mfrow = c(1,2));
+cex1 = 0.9
+plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
+     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
+    main = paste("Scale independence"));
+text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
+    labels=powers,cex=cex1,col="red");
+# this line corresponds to using an R^2 cut-off of h
+abline(h=0.90,col="red")
+# Mean connectivity as a function of the soft-thresholding power
+plot(sft$fitIndices[,1], sft$fitIndices[,5],
+    xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
+    main = paste("Mean connectivity"))
+text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
+dev.off()
+
+turtle_softPower = 10
+NetworkType = "signed"
+turtle_adj= adjacency(turtle_data,type = NetworkType, power = turtle_softPower ) #, corFnc="bicor", corOptions =  "maxPOutliers=0.1")
+turtle_k <- softConnectivity(turtle_data, type="signed", power=turtle_softPower ) #,corFnc="bicor", corOptions = list(maxPOutliers=0.1))
+turtle_k <- turtle_k/max(turtle_k)
+names(turtle_k) <- colnames(turtle_data)
+quantile(turtle_k, probs=seq(0,1,by=0.1))
+
+ 
+######## find modules turtle ######
+###################################
+
+turtle_TOM=TOMsimilarityFromExpr(turtle_data,networkType = NetworkType, TOMType = "signed", power = turtle_softPower ) #, corType="bicor", maxPOutliers = 0.1)  # 
+colnames(turtle_TOM) =rownames(turtle_TOM) =SubGeneNames
+# save(turtle_TOM, file=paste0(Date,"_",name,"TOMfile",".Robj", sep=""))
+turtle_dissTOM = 1-turtle_TOM
+turtle_geneTree = flashClust(as.dist(turtle_dissTOM),method="average")
+ 
+turtle_minModuleSize = 50
+turtle_dynamicMods = cutreeDynamic(dendro = turtle_geneTree, distM = turtle_dissTOM, method="hybrid", deepSplit = 4, pamRespectsDendro = FALSE, minClusterSize = turtle_minModuleSize, pamStage=T)
+turtle_dynamicColors = labels2colors(turtle_dynamicMods)
+table(turtle_dynamicColors)
+
+pdf(paste0(Date,"_",name,"_modules_",NetworkType, "moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=18)
+plotDendroAndColors(turtle_geneTree, turtle_dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors")
+dev.off()
+
+# module merging
+turtle_MEList = moduleEigengenes(turtle_data, colors = turtle_dynamicColors)
+turtle_MEs = turtle_MEList$eigengenes
+
+turtle_MEDiss = 1-cor(turtle_MEs);
+turtle_METree = hclust(as.dist(turtle_MEDiss), method = "average");
+
+pdf(paste0(Date,"_",name,"_hclust_modules2_",NetworkType, "moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=12)
+plot(turtle_METree, main = "Clustering of module eigengenes",
+xlab = "", sub = "")
+dev.off()
+
+# check plot to decide which modules to merge
+turtle_MEDissThres = 0.2
+# Call an automatic merging function
+turtle_merge = mergeCloseModules(turtle_data, turtle_dynamicColors, cutHeight = turtle_MEDissThres, verbose = 3)
+# The merged module colors
+turtle_mergedColors = turtle_merge$colors;
+# Eigengenes of the new merged modules:
+turtle_mergedMEs = turtle_merge$newMEs; 
+
+pdf(paste0(Date,"_",name,"_merged_modules_",NetworkType, "moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=40)
+plotDendroAndColors(turtle_geneTree, cbind(turtle_dynamicColors, turtle_mergedColors),
+c("Dynamic Tree Cut", "Merged dynamic"),
+dendroLabels = FALSE, hang = 0.03,
+addGuide = TRUE, guideHang = 0.05)
+dev.off()
+
+######## module eigengenes turtle ######
+########################################
+
+turtle_MEList = moduleEigengenes(turtle_data, colors = turtle_mergedColors)
+turtle_MEs = turtle_MEList$eigengenes
+
+turtle_MEeigengenes <- as.matrix(turtle_MEList$eigengenes)
+rownames(turtle_MEeigengenes) <- rownames(turtle_data)
+turtle_MEeigengenes <- turtle_MEeigengenes[order(match(rownames(turtle_MEeigengenes), names(turtle_ident))),]
+
+# remove small modules with unclear biological significance
+# turtle_MEeigengenes <- turtle_MEeigengenes[,!colnames(turtle_MEeigengenes) %in% c("MEgrey","MEmidnightblue","MEblack")]
+
+turtle_tempident <- rownames(turtle_MEeigengenes)
+turtle_tempident <- sapply(turtle_tempident, function(x) substring(x,1,3))
+names(turtle_tempident) <- rownames(turtle_MEeigengenes)
+turtle_tempident <- as.factor(turtle_tempident)
+
+extended_ids <- c("ADVR_e01","ADVR_e02", "PT_e03","PT_e04","PT_e05","PT_e06","aDC_e07","aDC_e08","aLC_e09","aLC_e10","aLC_e11","aLC_e12","aDC_e13","aDC_e14","aDC_e15","aDC_e16","pLC_e17","aDVR_e18","aDVR_e19","pDVR_e20","pDVR_e21","pDVR_e22","pDVR_e23","pDC_e24","pDVR_e25","pDC_e26","pDC_e27","pDC_e28","MC_e29","MC_e30","MC_e31","MC_e32","DMC_e33","DMC_e34","DMC_e35","DMC_e36","DMC_e37","DMC_e38")
+turtle_tempident <- mapvalues(turtle_tempident, from = levels(turtle_tempident), to = extended_ids)
+
+turtle_tempident <- factor(turtle_tempident, levels(turtle_tempident)[c(29,30,31,32,33,34,35,36,37,38,24,28,27,26,13,14,15,16,7,8,3,4,5,6,9,10,11,12,17,1,2,19,18,20,21,22,23,25)])
+turtle_tempident <- arrange(turtle_tempident, levels(turtle_tempident))
+turtle_tempident <- as.data.frame(turtle_tempident)
+turtle_tempident <- cbind(turtle_tempident, rownames(turtle_tempident))
+colnames(turtle_tempident) <- c("tempident","turtle_id")
+turtle_tempident <- arrange(turtle_tempident, tempident)
+turtle_tempident2 <- as.factor(turtle_tempident[,1])
+names(turtle_tempident2) <- turtle_tempident[,2]
+turtle_tempident <- turtle_tempident2
+
+n.clusters = length(table(turtle_tempident))
+plot_colors = colorRampPalette(brewer.pal(9, "Set1"))(n.clusters)
+plot_colors <- sample(plot_colors)
+turtle_plotColors <- mapvalues(turtle_tempident, from = levels(turtle_tempident), to = plot_colors) # plyr package
+turtle_plotColors <- as.character(turtle_plotColors)
+
+turtle_MEeigengenes2 <- turtle_MEeigengenes[order(match(rownames(turtle_MEeigengenes),names(turtle_tempident))),]
+
+turtle_MEeigengenes2 <- turtle_MEeigengenes2[, c("MEturquoise","MEyellow","MEred","MEbrown","MEgreen","MEblue")]
+
+
+pdf(paste0(Date,"_",name,"_eigengenes_",NetworkType, "moduleSize",turtle_minModuleSize,"_power", turtle_softPower,".pdf", sep=""), width=12)
+par(oma=c(4,4,4,10))
+colors <- colorRampPalette(c("#9D8BFF","black", "#FFE000"))(n = 200)
+pal <- colorRampPalette(colors)
+heatmap.2(t(turtle_MEeigengenes2), Colv=F,  Rowv=F, trace="none",  col=pal, ColSideColors= turtle_plotColors, dendogram="none", labCol=FALSE)
+dev.off()
+
+
+SubGeneNames=colnames(turtle_data)
+turtle_module_colors= setdiff(unique(turtle_mergedColors), "grey")
+for (color in turtle_module_colors){
+    module=SubGeneNames[which(turtle_mergedColors==color)]
+    write.table(module, paste(Date, name, "_module_",color, ".txt",sep=""), sep="\t", row.names=FALSE, col.names=FALSE,quote=FALSE)    
+}
+
diff --git a/README.md b/README.md
index 6b0a625..bf35aa6 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,9 @@ Analysis pipeline for the data reported in "Evolution of pallium, hippocampus an
 
   Contains scripts used to analyse turtle and lizard single-cell data. This code uses the R packages Seurat and MAST.
 
-* **Gene network analysis**
+* **Gene network analysis
 
-  Contains R scripts used to compare gene modules across species.
-
-* **Data plots**
-
-   Contains scripts used to plot turtle and lizard single-cell data. Code adapted from the Seurat package in R. 
+  Scripts used for WGCNA analysis. 
 
 * **Comparison of single-cell transcriptomes across species**