ComparativeAnalysis_CleanFunctions.R

#make sure these files are in your working directory:
#chrysemys_eggnog_pruned.txt
#pogona_eggnog_pruned.txt
#gallus_eggnog_pruned.txt
#homo_eggnog_pruned.txt
#mus_eggnog_pruned.txt


####################################METHOD1#####################
####################################Specificity Correlation#####
#scale to gene expression average

SpecificityCorrelation = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){

     #Step1:  Load eggnog tables
     if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species1=="mouse") {
     eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
     DEgenesSpecies1 = toupper(DEgenesSpecies1)
     rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
     }
     if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="mouse") {
          eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
          eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
          DEgenesSpecies2 = toupper(DEgenesSpecies2)
          rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
     }
     if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

     colnames(eggnog1) = eggnog1[1,]
     eggnog1 = eggnog1[2:nrow(eggnog1),]
     colnames(eggnog2) = eggnog2[1,]
     eggnog2 = eggnog2[2:nrow(eggnog2),]

     #Step2: Take union of DEgenes for analysis
     DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
     DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
     DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
     DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
     DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

     #Step3: Prune Expression Tables & Remove rows with no expression
     Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
     Sp1 = Sp1[rowSums (Sp1)!=0,]
     Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
     Sp2 = Sp2[rowSums (Sp2)!=0,]

     #Step4: Replace Gene names with Eggnog name
     Sp1[,ncol(Sp1)+1] = rownames(Sp1)
     colnames(Sp1)[ncol(Sp1)] = species1
     Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
     rownames(Sp1) = Sp1$eggnog
     Sp1 = Sp1[,3:ncol(Sp1)]

     Sp2[,ncol(Sp2)+1] = rownames(Sp2)
     colnames(Sp2)[ncol(Sp2)] = species2
     Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
     rownames(Sp2) = Sp2$eggnog
     Sp2 = Sp2[,3:ncol(Sp2)]

     #Step5: Scale Expression Tables by gene average
     avg = rowMeans(Sp1)
     Sp1 = sweep(Sp1,1,avg,"/")
     rm(avg)
     avg = rowMeans(Sp2)
     Sp2 = sweep(Sp2,1,avg,"/")
     rm(avg)

     #Step6: Merge Expression Tables
     geTable = merge(Sp1,Sp2, by='row.names', all=F)
     rownames(geTable) = geTable$Row.names
     geTable = geTable[,2:ncol(geTable)]

     #Step7:  Correlation
     Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
     rownames(Corr.Coeff.Table) = colnames(geTable)
     colnames(Corr.Coeff.Table) = colnames(geTable)

     p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
     rownames(p.value.table) = colnames(geTable)
     colnames(p.value.table) = colnames(geTable)

     a = combn(1:ncol(geTable),2)

     n4BHcorrect = ncol(geTable)*ncol(geTable)

     for (i in 1:ncol(a)){
          cell_type1 = colnames(geTable)[a[1,i]]
          cell_type2 = colnames(geTable)[a[2,i]]
          b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
          Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
          Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
          rm(list =c('cell_type1', 'cell_type2','b'))
     }
     neg.log10.p = -log10(p.value.table)
     list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
     names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value','DEgenes_union','DEgenes_in_analysis','nDEgenes_union','nDEgenes_in_analysis',paste('nDEgenes',species1,sep='_'),paste('nDEgenes',species2,sep='_'))
     return(list.to.return)

     }

#Specificity Using Intersect of Gene Lists
SpIntersect = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){

  #Step1:  Load eggnog tables
  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="mouse") {
    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
    DEgenesSpecies1 = toupper(DEgenesSpecies1)
    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
  }
  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="mouse") {
    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
    DEgenesSpecies2 = toupper(DEgenesSpecies2)
    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
  }
  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

  colnames(eggnog1) = eggnog1[1,]
  eggnog1 = eggnog1[2:nrow(eggnog1),]
  colnames(eggnog2) = eggnog2[1,]
  eggnog2 = eggnog2[2:nrow(eggnog2),]

  #Step2: Take union of DEgenes for analysis
  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
  nDESp1 = length(DEgenesSpecies1)
  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
  nDESp2 = length(DEgenesSpecies2)
  DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

  #Step3: Prune Expression Tables & Remove rows with no expression
  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
  Sp1 = Sp1[rowSums (Sp1)!=0,]
  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
  Sp2 = Sp2[rowSums (Sp2)!=0,]

  #Step4: Replace Gene names with Eggnog name
  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
  colnames(Sp1)[ncol(Sp1)] = species1
  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
  rownames(Sp1) = Sp1$eggnog
  Sp1 = Sp1[,3:ncol(Sp1)]

  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
  colnames(Sp2)[ncol(Sp2)] = species2
  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
  rownames(Sp2) = Sp2$eggnog
  Sp2 = Sp2[,3:ncol(Sp2)]

  #Step5: Scale Expression Tables by gene average
  avg = rowMeans(Sp1)
  Sp1 = sweep(Sp1,1,avg,"/")
  rm(avg)
  avg = rowMeans(Sp2)
  Sp2 = sweep(Sp2,1,avg,"/")
  rm(avg)

  #Step6: Merge Expression Tables
  geTable = merge(Sp1,Sp2, by='row.names', all=F)
  rownames(geTable) = geTable$Row.names
  geTable = geTable[,2:ncol(geTable)]

  #Step7:  Correlation
  Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
  rownames(Corr.Coeff.Table) = colnames(geTable)
  colnames(Corr.Coeff.Table) = colnames(geTable)

  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
  rownames(p.value.table) = colnames(geTable)
  colnames(p.value.table) = colnames(geTable)

  a = combn(1:ncol(geTable),2)

  n4BHcorrect = ncol(geTable)*ncol(geTable)

  for (i in 1:ncol(a)){
    cell_type1 = colnames(geTable)[a[1,i]]
    cell_type2 = colnames(geTable)[a[2,i]]
    b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
    Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
    Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
    BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
    p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
    p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
    rm(list =c('cell_type1', 'cell_type2','b'))
  }
  neg.log10.p = -log10(p.value.table)
  list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2)
  names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis',paste('nDEgenes',species1,sep='_'),paste('nDEgenes',species2,sep='_'))
  return(list.to.return)

}


####################################METHOD2#####################
####################################Rank Correlation#####
#scale to gene expression average

RankCorrelation = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){

     #Step1:  Load eggnog tables
     if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="mouse") {
          eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
          eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
          DEgenesSpecies2 = toupper(DEgenesSpecies2)
          rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
     }
     if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

     colnames(eggnog1) = eggnog1[1,]
     eggnog1 = eggnog1[2:nrow(eggnog1),]
     colnames(eggnog2) = eggnog2[1,]
     eggnog2 = eggnog2[2:nrow(eggnog2),]

     #Step2: Take union of DEgenes for analysis
     DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
     DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
     DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
     DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
     DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

     #Step3: Prune Expression Tables
     Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
     Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]

     #Step4: Replace Gene names with Eggnog name
     Sp1[,ncol(Sp1)+1] = rownames(Sp1)
     colnames(Sp1)[ncol(Sp1)] = species1
     Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
     rownames(Sp1) = Sp1$eggnog
     Sp1 = Sp1[,3:ncol(Sp1)]

     Sp2[,ncol(Sp2)+1] = rownames(Sp2)
     colnames(Sp2)[ncol(Sp2)] = species2
     Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
     rownames(Sp2) = Sp2$eggnog
     Sp2 = Sp2[,3:ncol(Sp2)]

     #Step5: Merge Expression Tables
     geTable = merge(Sp1,Sp2, by='row.names', all=F)
     rownames(geTable) = geTable$Row.names
     geTable = geTable[,2:ncol(geTable)]

     #Step6:  Correlation
     rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
     rownames(rho.table) = colnames(geTable)
     colnames(rho.table) = colnames(geTable)

     p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
     rownames(p.value.table) = colnames(geTable)
     colnames(p.value.table) = colnames(geTable)

     a = combn(1:ncol(geTable),2)

     n4BHcorrect = ncol(geTable)*ncol(geTable)

     for (i in 1:ncol(a)){
          cell_type1 = colnames(geTable)[a[1,i]]
          cell_type2 = colnames(geTable)[a[2,i]]
          b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='spearman'))
          rho.table[a[1,i],a[2,i]] = b$estimate
          rho.table[a[2,i],a[1,i]] = b$estimate
          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
          rm(list =c('cell_type1', 'cell_type2','b'))
     }
     neg.log10.p = -log10(p.value.table)
     list.to.return = list(rho.table,p.value.table,neg.log10.p)
     names(list.to.return) = c('rho','p.value','neg.log10.p.value')
     return(list.to.return)

}

####################Rank Correlation All Genes######


RankCorrelationAll = function(ExpressionTableSpecies1, species1 = c('turtle','lizard'), ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken')){

     #Step1:  Load eggnog tables
     if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="mouse") {
          eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
          eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
          rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
     }
     if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

     colnames(eggnog1) = eggnog1[1,]
     eggnog1 = eggnog1[2:nrow(eggnog1),]
     colnames(eggnog2) = eggnog2[1,]
     eggnog2 = eggnog2[2:nrow(eggnog2),]

     #Step2: Replace Gene names with Eggnog name
     Sp1 = ExpressionTableSpecies1
     Sp1[,ncol(Sp1)+1] = rownames(Sp1)
     colnames(Sp1)[ncol(Sp1)] = species1
     Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
     rownames(Sp1) = Sp1$eggnog
     Sp1 = Sp1[,3:ncol(Sp1)]

     Sp2 = ExpressionTableSpecies2
     Sp2[,ncol(Sp2)+1] = rownames(Sp2)
     colnames(Sp2)[ncol(Sp2)] = species2
     Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
     rownames(Sp2) = Sp2$eggnog
     Sp2 = Sp2[,3:ncol(Sp2)]

     #Step3: Merge Expression Tables
     geTable = merge(Sp1,Sp2, by='row.names', all=F)
     rownames(geTable) = geTable$Row.names
     geTable = geTable[,2:ncol(geTable)]

     #Step4:  Correlation
     rho.table = diag(x = 1, ncol(geTable), ncol(geTable))
     rownames(rho.table) = colnames(geTable)
     colnames(rho.table) = colnames(geTable)

     p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
     rownames(p.value.table) = colnames(geTable)
     colnames(p.value.table) = colnames(geTable)

     a = combn(1:ncol(geTable),2)

     n4BHcorrect = ncol(geTable)*ncol(geTable)

     for (i in 1:ncol(a)){
          cell_type1 = colnames(geTable)[a[1,i]]
          cell_type2 = colnames(geTable)[a[2,i]]
          b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='spearman'))
          rho.table[a[1,i],a[2,i]] = b$estimate
          rho.table[a[2,i],a[1,i]] = b$estimate
          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
          rm(list =c('cell_type1', 'cell_type2','b'))
     }
     neg.log10.p = -log10(p.value.table)
     list.to.return = list(rho.table,p.value.table,neg.log10.p)
     names(list.to.return) = c('rho','p.value','neg.log10.p.value')
     return(list.to.return)

}

########################METHOD 3#######
########################hypergeometric distribution#####


#Step 1:  generate list of markers for cell types
#requires FindMarkers.MAST function

Marker_List = function(SeuratObject,clusters,min.pct=0.4,thresh.use=log(1.5),fdr=0.05){
     #step1a generate pairwise DEgenes by MAST
     DEgenes = list()
     combinations = t(combn(as.character(clusters),2))
     colnames(combinations) <- c("ident.1","ident.2")
     pb   <- txtProgressBar(1, 100, style=3)
     for (i in 1:nrow(combinations)){
          a = combinations[i,1]
          b = combinations[i,2]
          suppressMessages(diff.genes <- FindMarkers.MAST(SeuratObject,a,b,min.pct,thresh.use))
          #matrix from FindMarkers.MAST only contains genes that have a fdr<0.05
          DEgenes[[paste("id1",a,"id2",b, sep="_")]] = rownames(diff.genes[diff.genes$avg_diff>0 & diff.genes$fdr<fdr,])
          DEgenes[[paste("id1",b,"id2",a, sep="_")]] = rownames(diff.genes[diff.genes$avg_diff<0 & diff.genes$fdr<fdr,])
          rm(list = c('diff.genes','a','b'))
          setTxtProgressBar(pb, (i*100)/nrow(combinations))
     }
     #step1b generate markers based on union of markers for a celltype
     markers = list()
     for (i in 1:length(clusters)) {
          sub.list = DEgenes[grep(paste('id1_',clusters[i], "_",sep=""),names(DEgenes))]
          gene.union = vector()
          for (j in 1:length(sub.list)){
               gene.union = union(gene.union,sub.list[[j]])
          }
          markers[[as.character(clusters)[i]]] = gene.union
          rm(list=c("sub.list","gene.union"))
     }
     return(list(DEgenes,markers))
}


#Step 2:  replace markers with eggnog names
#         and generate table with number of markers and number of eggnog markers per cell type

Eggnog_list_replace = function(genelist_list, species = c('turtle','lizard','mouse','human','chicken')){
     #Step1:  Load eggnog tables
     if (species=="turtle") eggnog = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species=="lizard") eggnog = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species=="mouse") {
          eggnog = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
          eggnog[2:nrow(eggnog),1] = toupper(eggnog[2:nrow(eggnog),1])
          for (i in 1:length(genelist_list)){
               genelist_list[[i]] = toupper(genelist_list[[i]])
          }
     }
     if (species=="human") eggnog = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
     if (species=="chicken") eggnog = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

     colnames(eggnog) = eggnog[1,]
     eggnog = eggnog[2:nrow(eggnog),]

     #Step2 replace names
     eggnog_list = list()
     for (i in 1:length(genelist_list)) {
          eggnog_list[[i]] = eggnog[eggnog[,1] %in% genelist_list[[i]],2]
          names(eggnog_list)[i] = names(genelist_list)[i]
     }

     #Step3 generate summary table to append to eggnog_list
     summary.matrix = cbind(sapply(genelist_list,length),sapply(eggnog_list,length))
     colnames(summary.matrix) = c('nMarkers','nMarkers_eggnog')
     rownames(summary.matrix) = names(genelist_list)
     eggnog_list[['summary']] = summary.matrix
     return(eggnog_list)
}

#Step 3:  p values for overlap

hygecdf_matrix = function(eggnog.list1, sp1='tur', eggnog.list2, sp2='liz'){
     names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
     names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
     summaries = list(eggnog.list1[[paste(sp1,'summary', sep="_")]], eggnog.list2[[paste(sp2,'summary', sep="_")]])
     eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
     eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
     gene.list = c(eggnog.list1, eggnog.list2)

     hyg = matrix(nrow=length(gene.list),ncol=length(gene.list))
     colnames(hyg) = c(names(gene.list))
     rownames(hyg) = c(names(gene.list))

     JSC = matrix(nrow=length(gene.list),ncol=length(gene.list))
     colnames(JSC) = c(names(gene.list))
     rownames(JSC) = c(names(gene.list))

     nGenes = vector()
     for(i in 1:length(gene.list)){
          nGenes = union(nGenes,gene.list[[i]])
     }
     nGenes = length(nGenes)
     #nGenes = 10089 #number of turtle v lizard eggnog genes
     for (i in 1:length(gene.list)){
          m = length(gene.list[[i]])
          n = nGenes - m
          n4BHcorrect = nrow(hyg)*ncol(hyg)
          for (j in 1:length(gene.list)){
               k = length(gene.list[[i]])
               q = length(intersect(gene.list[[i]],gene.list[[j]]))
               p.value = phyper(q,m,n,k, lower.tail=F)
               #BH correct p values in hyg
               p.value = p.adjust(p.value, method='BH',n=n4BHcorrect)
               hyg[i,j] = p.value
               u = length(union(gene.list[[i]],gene.list[[j]]))
               JSC[i,j] = q/u
               rm(list=c('k','q','u'))
          }
          rm(m)
     }
     negLog10 = -log10(hyg)
     return.list = list(hyg,negLog10,JSC,summaries)
     names(return.list) = c('hypergeometric_distribution','negative_log10','JSC','summaries')
     return(return.list)
}

#Just Calculate Overlap
overlap = function(eggnog.list1, sp1='sp1', eggnog.list2, sp2='sp2'){
     names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
     names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
     eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
     eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
     gene.list = c(eggnog.list1, eggnog.list2)

     overlap.matrix = matrix(nrow=length(gene.list),ncol=length(gene.list))
     colnames(overlap.matrix) = c(names(gene.list))
     rownames(overlap.matrix) = c(names(gene.list))

     a = combn(1:ncol(overlap.matrix),2)
     for (i in 1:ncol(a)){
          cell1 = a[1,i]
          cell2 = a[2,i]
          nGenes = length(intersect(gene.list[[cell1]],gene.list[[cell2]]))
          overlap.matrix[cell1,cell2] = nGenes
          overlap.matrix[cell2,cell1] = nGenes
     }

     for(i in 1:nrow(overlap.matrix)){
          overlap.matrix[i,i] = length(gene.list[[i]])
     }
     return(overlap.matrix)
}

#Fake Binarization

fake.binary = function(eggnog.list1, sp1='sp1', eggnog.list2, sp2='sp2'){
     names(eggnog.list1) = paste(sp1,names(eggnog.list1), sep="_")
     names(eggnog.list2) = paste(sp2,names(eggnog.list2), sep="_")
     eggnog.list1[paste(sp1,'summary', sep="_")] = NULL
     eggnog.list2[paste(sp2,'summary', sep="_")] = NULL
     gene.list = c(eggnog.list1, eggnog.list2)

     #step1: create list of markers for analysis
     markers.sp1 = vector()
     for(i in 1:length(eggnog.list1)){
          markers.sp1 = union(markers.sp1,eggnog.list1[[i]])
     }
     markers.sp2 = vector()
     for(i in 1:length(eggnog.list2)){
          markers.sp2 = union(markers.sp2,eggnog.list2[[i]])
     }

     #markers = union(markers.sp1,markers.sp2)
     markers = intersect(markers.sp1,markers.sp2)

     #step2:  create matrix of presence/absence of markers

     expression.matrix = data.frame(row.names=markers)
     for (i in 1:length(gene.list)){
          expression.matrix[rownames(expression.matrix) %in% gene.list[[i]],names(gene.list)[i]] = 1
          expression.matrix[!(rownames(expression.matrix) %in% gene.list[[i]]),names(gene.list)[i]] = 0
     }

     #step3:  correlation (pearson)
     Corr.Coeff.Table = diag(x = 1, length(gene.list), length(gene.list))
     rownames(Corr.Coeff.Table) = names(gene.list)
     colnames(Corr.Coeff.Table) = names(gene.list)

     p.value.table = matrix(ncol=length(gene.list), nrow = length(gene.list))
     rownames(p.value.table) = names(gene.list)
     colnames(p.value.table) = names(gene.list)

     a = combn(1:length(gene.list),2)

     n4BHcorrect = length(gene.list)*length(gene.list)

     for (i in 1:ncol(a)){
          cell_type1 = names(gene.list)[a[1,i]]
          cell_type2 = names(gene.list)[a[2,i]]
          b = suppressWarnings(cor.test(expression.matrix[,cell_type1], expression.matrix[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
          Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
          Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
          BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
          p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
          p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
          rm(list =c('cell_type1', 'cell_type2','b'))
     }
     neg.log10.p = -log10(p.value.table)

     #return
     list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p,expression.matrix)
     names(list.to.return) = c('correlation_coeff','p_values','neg_log10_p','expression_table')
     return(list.to.return)
}

##########################Old Scaling Methods###########
#scaling.method options
#'minmax' sets min at 0 and max at 1
#'max' expression as fraction of max
#'yao' expression according to Yao et al Cell Stem Cell 2017

ScaledCorrelation = function(scaling.method = c('minmax','max','yao'),ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2){

  #Step1:  Load eggnog tables
  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="mouse") {
    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
    DEgenesSpecies1 = toupper(DEgenesSpecies1)
    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
  }
  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="mouse") {
    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
    DEgenesSpecies2 = toupper(DEgenesSpecies2)
    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
  }
  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

  colnames(eggnog1) = eggnog1[1,]
  eggnog1 = eggnog1[2:nrow(eggnog1),]
  colnames(eggnog2) = eggnog2[1,]
  eggnog2 = eggnog2[2:nrow(eggnog2),]

  #Step2: Take union of DEgenes for analysis
  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
  DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

  #Step3: Prune Expression Tables & Remove rows with no expression
  ExpressionTableSpecies1 = ExpressionTableSpecies1[rowSums(ExpressionTableSpecies1)!=0,]
  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
  ExpressionTableSpecies2 = ExpressionTableSpecies2[rowSums(ExpressionTableSpecies2)!=0,]
  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]

  #Step4: Replace Gene names with Eggnog name
  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
  colnames(Sp1)[ncol(Sp1)] = species1
  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
  rownames(Sp1) = Sp1$eggnog
  Sp1 = Sp1[,3:ncol(Sp1)]

  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
  colnames(Sp2)[ncol(Sp2)] = species2
  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
  rownames(Sp2) = Sp2$eggnog
  Sp2 = Sp2[,3:ncol(Sp2)]

  #Step5: Scale Expression Tables by scaling.method
  if(scaling.method=='minmax'){
    gene_min = apply(Sp1, 1, min)
    Sp1 = sweep(Sp1,1,gene_min,"-")
    gene_max = apply(Sp1, 1, max)
    Sp1 = sweep(Sp1,1,gene_max,"/")
    rm(list=c('gene_min','gene_max'))
    gene_min = apply(Sp2, 1, min)
    Sp2 = sweep(Sp2,1,gene_min,"-")
    gene_max = apply(Sp2, 1, max)
    Sp2 = sweep(Sp2,1,gene_max,"/")
    rm(list=c('gene_min','gene_max'))
  }

  if(scaling.method=='max'){
    gene_max = apply(Sp1, 1, max)
    Sp1 = sweep(Sp1,1,gene_max,"/")
    rm(gene_max)
    gene_max = apply(Sp2, 1, max)
    Sp2 = sweep(Sp2,1,gene_max,"/")
    rm(gene_max)

  }

  if(scaling.method=='yao'){
      gene_max = apply(Sp1, 1, max)
      gene_min = apply(Sp1, 1, min)
      #Determine median for entire dataset, not just pruned data!!!
      med = apply(ExpressionTableSpecies1,1,max)
      med = median(med)
      for (i in 1:nrow(Sp1)) {
        if (gene_max[i]>med){
          Sp1[i,] = sapply(Sp1[i,],function(x)(x-gene_min[i])/gene_max[i])
        }
        else {
          Sp1[i,] = sapply(Sp1[i,],function(x)(x-gene_min[i])/med)
        }
      }
      rm(list=c('gene_max','gene_min','med'))
      gene_max = apply(Sp2, 1, max)
      gene_min = apply(Sp2, 1, min)
      med = apply(ExpressionTableSpecies2,1,max)
      med = median(med)
      for (i in 1:nrow(Sp2)) {
        if (gene_max[i]>med){
          Sp2[i,] = sapply(Sp2[i,],function(x)(x-gene_min[i])/gene_max[i])
        }
        else {
          Sp2[i,] = sapply(Sp2[i,],function(x)(x-gene_min[i])/med)
        }
      }
      rm(list=c('gene_max','gene_min','med'))

  }

  #Step6: Merge Expression Tables
  geTable = merge(Sp1,Sp2, by='row.names', all=F)
  rownames(geTable) = geTable$Row.names
  geTable = geTable[,2:ncol(geTable)]

  #Step7:  Correlation
  Corr.Coeff.Table = diag(x = 1, ncol(geTable), ncol(geTable))
  rownames(Corr.Coeff.Table) = colnames(geTable)
  colnames(Corr.Coeff.Table) = colnames(geTable)

  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
  rownames(p.value.table) = colnames(geTable)
  colnames(p.value.table) = colnames(geTable)

  a = combn(1:ncol(geTable),2)

  n4BHcorrect = ncol(geTable)*ncol(geTable)

  for (i in 1:ncol(a)){
    cell_type1 = colnames(geTable)[a[1,i]]
    cell_type2 = colnames(geTable)[a[2,i]]
    b = suppressWarnings(cor.test(geTable[,cell_type1], geTable[,cell_type2], adjust='none',alternative = "two.sided", method='pearson'))
    Corr.Coeff.Table[a[1,i],a[2,i]] = b$estimate
    Corr.Coeff.Table[a[2,i],a[1,i]] = b$estimate
    BHcorrected.p.value = p.adjust(b$p.value, method='BH',n=n4BHcorrect)
    p.value.table[a[1,i],a[2,i]] = BHcorrected.p.value
    p.value.table[a[2,i],a[1,i]] = BHcorrected.p.value
    rm(list =c('cell_type1', 'cell_type2','b'))
  }
  neg.log10.p = -log10(p.value.table)
  list.to.return = list(Corr.Coeff.Table,p.value.table,neg.log10.p)
  names(list.to.return) = c('corr.coeff','p.value','neg.log10.p.value')
  return(list.to.return)

}