Comparative_functions.R


################# General Correlation (choose method), Permutation Test, all genes ################

# requires:
# tables with the gene functional annotations generated by eggnog

# input:
# ExpressionTableSpecies1 - must be a data.frame
# species1 = c('turtle','lizard','mouse')
# DEgenesSpecies1
# ExpressionTableSpecies2 - must be a data.frame
# species1 = c('turtle','lizard','mouse','human','chicken')
# DEgenesSpecies2
# nPermutations = 1000
# genes.use = c('intersect','union','species1','species2')
# corr.method = c('spearman', 'pearson') etc.

# returns:
# list with
# corr.coeff
# shuffled_correlation_score_means
# p.value
# negative_log10_p.value
# DEgenes_intersect
# DEgenes_in_analysis
# nDEgenes_intersect
# nDEgenes_in_analysis
# nDEgenes_Sp1
# nDEgenes_Sp2

SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){

  #Step1:  Load eggnog tables
  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="mouse") {
    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
    DEgenesSpecies1 = toupper(DEgenesSpecies1)
    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
  }
  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="mouse") {
    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
    DEgenesSpecies2 = toupper(DEgenesSpecies2)
    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
  }
  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

  colnames(eggnog1) = eggnog1[1,]
  eggnog1 = eggnog1[2:nrow(eggnog1),]
  colnames(eggnog2) = eggnog2[1,]
  eggnog2 = eggnog2[2:nrow(eggnog2),]

  #Step2: Take intersect, union, species1 or species2 of DEgenes for analysis

  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
  nDESp1 = length(DEgenesSpecies1)
  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
  nDESp2 = length(DEgenesSpecies2)
  #genes.use = c('intersect','union','species1','species2')
  if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
  if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
  if (genes.use=='species1') DEgenes = DEgenesSpecies1
  if (genes.use=='species2') DEgenes = DEgenesSpecies2
  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

  #Step3: Prune Expression Tables & Remove rows with no expression
  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
  Sp1 = Sp1[rowSums (Sp1)!=0,]
  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
  Sp2 = Sp2[rowSums (Sp2)!=0,]

  #Step4: Replace Gene names with Eggnog name
  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
  colnames(Sp1)[ncol(Sp1)] = species1
  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
  rownames(Sp1) = Sp1$eggnog
  Sp1 = Sp1[,3:ncol(Sp1)]

  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
  colnames(Sp2)[ncol(Sp2)] = species2
  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
  rownames(Sp2) = Sp2$eggnog
  Sp2 = Sp2[,3:ncol(Sp2)]

  #Step5: Scale Expression Tables by gene average
  avg = rowMeans(Sp1)
  Sp1 = sweep(Sp1,1,avg,"/")
  rm(avg)
  avg = rowMeans(Sp2)
  Sp2 = sweep(Sp2,1,avg,"/")
  rm(avg)

  #Step6: Merge Expression Tables
  geTable = merge(Sp1,Sp2, by='row.names', all=F)
  rownames(geTable) = geTable$Row.names
  geTable = geTable[,2:ncol(geTable)]

  #Step7:  Correlation
  #7a:  Correlation
  Corr.Coeff.Table = cor(geTable,method=corr.method)

  #7b:  Shuffle data
  shuffled.cor.list = list()
  pb   <- txtProgressBar(1, 100, style=3)

  for (i in 1:nPermutations){
    shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
    shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
    shuffled = cbind(t(shuffled),t(shuffled2))
    shuffled.cor = cor(shuffled,method=corr.method)
    shuffled.cor.list[[i]] = shuffled.cor
    rm(list=c('shuffled','shuffled2','shuffled.cor'))
    if ((i %% 100) ==0){
      setTxtProgressBar(pb, (i*100)/nPermutations)
    }
  }

  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
  rownames(p.value.table) = colnames(geTable)
  colnames(p.value.table) = colnames(geTable)

  shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
  rownames(shuffled.mean.table) = colnames(geTable)
  colnames(shuffled.mean.table) = colnames(geTable)

  a = combn(1:ncol(geTable),2)
  for (i in 1:ncol(a)){
    cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
    shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
    shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
    p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
    p.value.table[a[1,i],a[2,i]] = p.value
    p.value.table[a[2,i],a[1,i]] = p.value
    rm(list=c('cor.scores','p.value'))
    setTxtProgressBar(pb, (i*100)/ncol(a))
  }

  neg.log10.p = -log10(p.value.table)

  #step8 "Overlap in Markers"
  #for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells

  #from above a = combn(1:ncol(geTable),2)
  marker.overlap.list = list()
  for (i in 1:ncol(a)){
    datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]])
    markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,])
    marker.overlap.list[[i]] = markers
    names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_')
    rm(list=c('datasubset','markers'))
  }

  list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list)
  names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers')
  return(list.to.return)

}


############ General Correlation (choose method), Permutation Test, transcription factors #############

# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563	(transcriptional activator activity) and GO:0016564	(transcriptional repressor activity).

# requires:
# tables with the gene functional annotations generated by eggnog
# list of transcription factors

# input and output as above

SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){

  #Step1:  Load eggnog tables
  if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species1=="mouse") {
    eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
    DEgenesSpecies1 = toupper(DEgenesSpecies1)
    rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
  }
  if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="mouse") {
    eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
    eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
    DEgenesSpecies2 = toupper(DEgenesSpecies2)
    rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
  }
  if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
  if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

  colnames(eggnog1) = eggnog1[1,]
  eggnog1 = eggnog1[2:nrow(eggnog1),]
  colnames(eggnog2) = eggnog2[1,]
  eggnog2 = eggnog2[2:nrow(eggnog2),]

  #Step2: Take intersect, union, species1 or species2 of DEgenes for analysis

  DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
  nDESp1 = length(DEgenesSpecies1)
  DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
  nDESp2 = length(DEgenesSpecies2)
  #genes.use = c('intersect','union','species1','species2')
  if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
  if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
  if (genes.use=='species1') DEgenes = DEgenesSpecies1
  if (genes.use=='species2') DEgenes = DEgenesSpecies2
  DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
  DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

  #Step3: Prune Expression Tables & Remove rows with no expression
  Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
  Sp1 = Sp1[rowSums (Sp1)!=0,]
  Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
  Sp2 = Sp2[rowSums (Sp2)!=0,]

  #Step4: Replace Gene names with Eggnog name
  Sp1[,ncol(Sp1)+1] = rownames(Sp1)
  colnames(Sp1)[ncol(Sp1)] = species1
  Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
  rownames(Sp1) = Sp1$eggnog
  Sp1 = Sp1[,3:ncol(Sp1)]

  Sp2[,ncol(Sp2)+1] = rownames(Sp2)
  colnames(Sp2)[ncol(Sp2)] = species2
  Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
  rownames(Sp2) = Sp2$eggnog
  Sp2 = Sp2[,3:ncol(Sp2)]

  # Step5: Take only transcription factors
  TFs_list <- read.table("170712_TFs_list_ensembl.txt")
  TFs_list <- as.character(TFs_list[,1])
  Sp1 <- Sp1[rownames(Sp1) %in% TFs_list,]
  Sp2 <- Sp2[rownames(Sp2) %in% TFs_list,]

  #Step6: Scale Expression Tables by gene average
  avg = rowMeans(Sp1)
  Sp1 = sweep(Sp1,1,avg,"/")
  rm(avg)
  avg = rowMeans(Sp2)
  Sp2 = sweep(Sp2,1,avg,"/")
  rm(avg)

  #Step7: Merge Expression Tables
  geTable = merge(Sp1,Sp2, by='row.names', all=F)
  rownames(geTable) = geTable$Row.names
  geTable = geTable[,2:ncol(geTable)]

  #Step8:  Correlation
  #8a:  Correlation
  Corr.Coeff.Table = cor(geTable,method=corr.method)

  #8b:  Shuffle data
  shuffled.cor.list = list()
  pb   <- txtProgressBar(1, 100, style=3)

  for (i in 1:nPermutations){
    shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
    shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
    shuffled = cbind(t(shuffled),t(shuffled2))
    shuffled.cor = cor(shuffled,method=corr.method)
    shuffled.cor.list[[i]] = shuffled.cor
    rm(list=c('shuffled','shuffled2','shuffled.cor'))
    if ((i %% 100) ==0){
      setTxtProgressBar(pb, (i*100)/nPermutations)
    }
  }

  p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
  rownames(p.value.table) = colnames(geTable)
  colnames(p.value.table) = colnames(geTable)

  shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
  rownames(shuffled.mean.table) = colnames(geTable)
  colnames(shuffled.mean.table) = colnames(geTable)

  a = combn(1:ncol(geTable),2)
  for (i in 1:ncol(a)){
    cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
    shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
    shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
    p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
    p.value.table[a[1,i],a[2,i]] = p.value
    p.value.table[a[2,i],a[1,i]] = p.value
    rm(list=c('cor.scores','p.value'))
    setTxtProgressBar(pb, (i*100)/ncol(a))
  }

  neg.log10.p = -log10(p.value.table)

  #step9 "Overlap in Markers"
  #for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells

  #from above a = combn(1:ncol(geTable),2)
  marker.overlap.list = list()
  for (i in 1:ncol(a)){
    datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]])
    markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,])
    marker.overlap.list[[i]] = markers
    names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_')
    rm(list=c('datasubset','markers'))
  }

  list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list)
  names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers')
  return(list.to.return)

}


################# NoisyGenes function ################

# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy, and can be removed before running comparative analysis
# clusters.use = character vectors of the clusters ids to include in the analysis
# Example for comparative analysis:
# object_noisy_genes <- NoisyGenes(object, min.pct=0.2, clusters.use=as.character(levels(object@ident)))
# object_avg_expr <- AverageExpression(object)
# ExpressionTableSpecies1 <- object_avg_expr[!rownames(object_avg_expr) %in% object_noisy_genes,]

NoisyGenes <- function(object, min.pct=0.2, clusters.use) {
	clusters <- clusters.use
	genes.use = rownames(object@data)
	object.raw.data <- as.matrix(object@raw.data)
	pct.matrix = matrix(data=NA, nrow=length(genes.use), ncol=length(clusters))
	rownames(pct.matrix) <- genes.use
	colnames(pct.matrix) <- clusters
	thresh.min=0
	for (i in clusters){
		cells.cluster <- names(object@ident[object@ident==i])
		data.cluster <- object.raw.data[,colnames(object.raw.data) %in% cells.cluster]
		pct.cluster <- round(apply(object.raw.data[genes.use, cells.cluster, drop = F],1,function(x)return(length(x[x>thresh.min])/length(x))),3)
		pct.matrix[,i] <- pct.cluster
	}
	pct.max <- rowMaxs(pct.matrix)
	names(pct.max) <- genes.use
	noisy.genes <- names(pct.max[pct.max < min.pct])
	return(noisy.genes)
}

	################# General Correlation (choose method), Permutation Test, all genes ################

	# requires:
	# tables with the gene functional annotations generated by eggnog

	# input:
	# ExpressionTableSpecies1 - must be a data.frame
	# species1 = c('turtle','lizard','mouse')
	# DEgenesSpecies1
	# ExpressionTableSpecies2 - must be a data.frame
	# species1 = c('turtle','lizard','mouse','human','chicken')
	# DEgenesSpecies2
	# nPermutations = 1000
	# genes.use = c('intersect','union','species1','species2')
	# corr.method = c('spearman', 'pearson') etc.

	# returns:
	# list with
	# corr.coeff
	# shuffled_correlation_score_means
	# p.value
	# negative_log10_p.value
	# DEgenes_intersect
	# DEgenes_in_analysis
	# nDEgenes_intersect
	# nDEgenes_in_analysis
	# nDEgenes_Sp1
	# nDEgenes_Sp2

	SpPermute = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){

	#Step1: Load eggnog tables
	if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species1=="mouse") {
	eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
	DEgenesSpecies1 = toupper(DEgenesSpecies1)
	rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
	}
	if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species2=="mouse") {
	eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
	DEgenesSpecies2 = toupper(DEgenesSpecies2)
	rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
	}
	if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

	colnames(eggnog1) = eggnog1[1,]
	eggnog1 = eggnog1[2:nrow(eggnog1),]
	colnames(eggnog2) = eggnog2[1,]
	eggnog2 = eggnog2[2:nrow(eggnog2),]

	#Step2: Take intersect, union, species1 or species2 of DEgenes for analysis

	DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
	nDESp1 = length(DEgenesSpecies1)
	DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
	nDESp2 = length(DEgenesSpecies2)
	#genes.use = c('intersect','union','species1','species2')
	if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
	if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
	if (genes.use=='species1') DEgenes = DEgenesSpecies1
	if (genes.use=='species2') DEgenes = DEgenesSpecies2
	DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
	DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

	#Step3: Prune Expression Tables & Remove rows with no expression
	Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
	Sp1 = Sp1[rowSums (Sp1)!=0,]
	Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
	Sp2 = Sp2[rowSums (Sp2)!=0,]

	#Step4: Replace Gene names with Eggnog name
	Sp1[,ncol(Sp1)+1] = rownames(Sp1)
	colnames(Sp1)[ncol(Sp1)] = species1
	Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
	rownames(Sp1) = Sp1$eggnog
	Sp1 = Sp1[,3:ncol(Sp1)]

	Sp2[,ncol(Sp2)+1] = rownames(Sp2)
	colnames(Sp2)[ncol(Sp2)] = species2
	Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
	rownames(Sp2) = Sp2$eggnog
	Sp2 = Sp2[,3:ncol(Sp2)]

	#Step5: Scale Expression Tables by gene average
	avg = rowMeans(Sp1)
	Sp1 = sweep(Sp1,1,avg,"/")
	rm(avg)
	avg = rowMeans(Sp2)
	Sp2 = sweep(Sp2,1,avg,"/")
	rm(avg)

	#Step6: Merge Expression Tables
	geTable = merge(Sp1,Sp2, by='row.names', all=F)
	rownames(geTable) = geTable$Row.names
	geTable = geTable[,2:ncol(geTable)]

	#Step7: Correlation
	#7a: Correlation
	Corr.Coeff.Table = cor(geTable,method=corr.method)

	#7b: Shuffle data
	shuffled.cor.list = list()
	pb <- txtProgressBar(1, 100, style=3)

	for (i in 1:nPermutations){
	shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
	shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
	shuffled = cbind(t(shuffled),t(shuffled2))
	shuffled.cor = cor(shuffled,method=corr.method)
	shuffled.cor.list[[i]] = shuffled.cor
	rm(list=c('shuffled','shuffled2','shuffled.cor'))
	if ((i %% 100) ==0){
	setTxtProgressBar(pb, (i*100)/nPermutations)
	}
	}

	p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
	rownames(p.value.table) = colnames(geTable)
	colnames(p.value.table) = colnames(geTable)

	shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
	rownames(shuffled.mean.table) = colnames(geTable)
	colnames(shuffled.mean.table) = colnames(geTable)

	a = combn(1:ncol(geTable),2)
	for (i in 1:ncol(a)){
	cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
	shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
	shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
	p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
	p.value.table[a[1,i],a[2,i]] = p.value
	p.value.table[a[2,i],a[1,i]] = p.value
	rm(list=c('cor.scores','p.value'))
	setTxtProgressBar(pb, (i*100)/ncol(a))
	}

	neg.log10.p = -log10(p.value.table)

	#step8 "Overlap in Markers"
	#for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells

	#from above a = combn(1:ncol(geTable),2)
	marker.overlap.list = list()
	for (i in 1:ncol(a)){
	datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]])
	markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,])
	marker.overlap.list[[i]] = markers
	names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_')
	rm(list=c('datasubset','markers'))
	}

	list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list)
	names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers')
	return(list.to.return)

	}


	############ General Correlation (choose method), Permutation Test, transcription factors #############

	# includes only transcription factors in the comparison. TF list from ensembl, file 170712_TFs_list_ensembl.txt, using GO terms GO:0003700 (transcription factor activity), GO:0003702 (RNA polymerase II transcription factor activity), GO:0003709 (RNA polymerase III transcription factor activity), GO:0016563 (transcriptional activator activity) and GO:0016564 (transcriptional repressor activity).

	# requires:
	# tables with the gene functional annotations generated by eggnog
	# list of transcription factors

	# input and output as above

	SpPermuteTF = function(ExpressionTableSpecies1, species1 = c('turtle','lizard','mouse'), DEgenesSpecies1, ExpressionTableSpecies2, species2 = c('turtle','lizard','mouse','human','chicken'), DEgenesSpecies2, nPermutations = 1000, genes.use='intersect', corr.method = 'spearman'){

	#Step1: Load eggnog tables
	if (species1=="turtle") eggnog1 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species1=="lizard") eggnog1 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species1=="mouse") {
	eggnog1 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	eggnog1[2:nrow(eggnog1),1] = toupper(eggnog1[2:nrow(eggnog1),1])
	DEgenesSpecies1 = toupper(DEgenesSpecies1)
	rownames(ExpressionTableSpecies1) = toupper(rownames(ExpressionTableSpecies1))
	}
	if (species2=="turtle") eggnog2 = read.table('chrysemys_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species2=="lizard") eggnog2 = read.table('pogona_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species2=="mouse") {
	eggnog2 = read.table('mus_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	eggnog2[2:nrow(eggnog2),1] = toupper(eggnog2[2:nrow(eggnog2),1])
	DEgenesSpecies2 = toupper(DEgenesSpecies2)
	rownames(ExpressionTableSpecies2) = toupper(rownames(ExpressionTableSpecies2))
	}
	if (species2=="human") eggnog2 = read.table('homo_eggnog_pruned.txt',header=F,stringsAsFactors = F)
	if (species2=="chicken") eggnog2 = read.table('gallus_eggnog_pruned.txt',header=F,stringsAsFactors = F)

	colnames(eggnog1) = eggnog1[1,]
	eggnog1 = eggnog1[2:nrow(eggnog1),]
	colnames(eggnog2) = eggnog2[1,]
	eggnog2 = eggnog2[2:nrow(eggnog2),]

	#Step2: Take intersect, union, species1 or species2 of DEgenes for analysis

	DEgenesSpecies1 = eggnog1[eggnog1[,1] %in% DEgenesSpecies1,2]
	nDESp1 = length(DEgenesSpecies1)
	DEgenesSpecies2 = eggnog2[eggnog2[,1] %in% DEgenesSpecies2,2]
	nDESp2 = length(DEgenesSpecies2)
	#genes.use = c('intersect','union','species1','species2')
	if (genes.use=='intersect') DEgenes = intersect(DEgenesSpecies1,DEgenesSpecies2)
	if (genes.use=='union') DEgenes = union(DEgenesSpecies1,DEgenesSpecies2)
	if (genes.use=='species1') DEgenes = DEgenesSpecies1
	if (genes.use=='species2') DEgenes = DEgenesSpecies2
	DEgenesSpecies1 = eggnog1[eggnog1[,2] %in% DEgenes,1]
	DEgenesSpecies2 = eggnog2[eggnog2[,2] %in% DEgenes,1]

	#Step3: Prune Expression Tables & Remove rows with no expression
	Sp1 = ExpressionTableSpecies1[rownames(ExpressionTableSpecies1) %in% DEgenesSpecies1,]
	Sp1 = Sp1[rowSums (Sp1)!=0,]
	Sp2 = ExpressionTableSpecies2[rownames(ExpressionTableSpecies2) %in% DEgenesSpecies2,]
	Sp2 = Sp2[rowSums (Sp2)!=0,]

	#Step4: Replace Gene names with Eggnog name
	Sp1[,ncol(Sp1)+1] = rownames(Sp1)
	colnames(Sp1)[ncol(Sp1)] = species1
	Sp1 = merge(eggnog1, Sp1, by =species1, all = F)
	rownames(Sp1) = Sp1$eggnog
	Sp1 = Sp1[,3:ncol(Sp1)]

	Sp2[,ncol(Sp2)+1] = rownames(Sp2)
	colnames(Sp2)[ncol(Sp2)] = species2
	Sp2 = merge(eggnog2, Sp2, by =species2, all = F)
	rownames(Sp2) = Sp2$eggnog
	Sp2 = Sp2[,3:ncol(Sp2)]

	# Step5: Take only transcription factors
	TFs_list <- read.table("170712_TFs_list_ensembl.txt")
	TFs_list <- as.character(TFs_list[,1])
	Sp1 <- Sp1[rownames(Sp1) %in% TFs_list,]
	Sp2 <- Sp2[rownames(Sp2) %in% TFs_list,]

	#Step6: Scale Expression Tables by gene average
	avg = rowMeans(Sp1)
	Sp1 = sweep(Sp1,1,avg,"/")
	rm(avg)
	avg = rowMeans(Sp2)
	Sp2 = sweep(Sp2,1,avg,"/")
	rm(avg)

	#Step7: Merge Expression Tables
	geTable = merge(Sp1,Sp2, by='row.names', all=F)
	rownames(geTable) = geTable$Row.names
	geTable = geTable[,2:ncol(geTable)]

	#Step8: Correlation
	#8a: Correlation
	Corr.Coeff.Table = cor(geTable,method=corr.method)

	#8b: Shuffle data
	shuffled.cor.list = list()
	pb <- txtProgressBar(1, 100, style=3)

	for (i in 1:nPermutations){
	shuffled = apply(geTable[,1:ncol(Sp1)],1,sample)
	shuffled2 = apply(geTable[,(ncol(Sp1)+1):ncol(geTable)],1,sample)
	shuffled = cbind(t(shuffled),t(shuffled2))
	shuffled.cor = cor(shuffled,method=corr.method)
	shuffled.cor.list[[i]] = shuffled.cor
	rm(list=c('shuffled','shuffled2','shuffled.cor'))
	if ((i %% 100) ==0){
	setTxtProgressBar(pb, (i*100)/nPermutations)
	}
	}

	p.value.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
	rownames(p.value.table) = colnames(geTable)
	colnames(p.value.table) = colnames(geTable)

	shuffled.mean.table = matrix(ncol=ncol(geTable), nrow = ncol(geTable))
	rownames(shuffled.mean.table) = colnames(geTable)
	colnames(shuffled.mean.table) = colnames(geTable)

	a = combn(1:ncol(geTable),2)
	for (i in 1:ncol(a)){
	cor.scores = sapply(shuffled.cor.list,"[",a[1,i],a[2,i])
	shuffled.mean.table[a[1,i],a[2,i]] = mean(cor.scores)
	shuffled.mean.table[a[2,i],a[1,i]] = mean(cor.scores)
	p.value = mean(abs(cor.scores)>=abs(Corr.Coeff.Table[a[1,i],a[2,i]]))
	p.value.table[a[1,i],a[2,i]] = p.value
	p.value.table[a[2,i],a[1,i]] = p.value
	rm(list=c('cor.scores','p.value'))
	setTxtProgressBar(pb, (i*100)/ncol(a))
	}

	neg.log10.p = -log10(p.value.table)

	#step9 "Overlap in Markers"
	#for all pairs of cell-types generate list of genes that are at least 1.5x avg in both cells

	#from above a = combn(1:ncol(geTable),2)
	marker.overlap.list = list()
	for (i in 1:ncol(a)){
	datasubset = cbind(geTable[,a[1,i]],geTable[,a[2,i]])
	markers = rownames(geTable[datasubset[,1]>1.5 & datasubset[,2]>1.5,])
	marker.overlap.list[[i]] = markers
	names(marker.overlap.list)[i] = paste(colnames(geTable)[a[1,i]], colnames(geTable)[a[2,i]],sep='_')
	rm(list=c('datasubset','markers'))
	}

	list.to.return = list(Corr.Coeff.Table,shuffled.mean.table,p.value.table,neg.log10.p,DEgenes,rownames(geTable),length(DEgenes),length(rownames(geTable)),nDESp1,nDESp2,geTable,marker.overlap.list)
	names(list.to.return) = c('corr.coeff','shuffled_correlation_score_means','p.value','negative_log10_p.value','DEgenes_intersect','DEgenes_in_analysis','nDEgenes_intersect','nDEgenes_in_analysis','nDEgenes_Sp1','nDEgenes_Sp2','scaled_table','overlapping_markers')
	return(list.to.return)

	}


	################# NoisyGenes function ################

	# this function finds those genes that are not expressed above a certain percentage of cells (min.pct) in all of the clusters. Genes identified by this function are too noisy, and can be removed before running comparative analysis
	# clusters.use = character vectors of the clusters ids to include in the analysis
	# Example for comparative analysis:
	# object_noisy_genes <- NoisyGenes(object, min.pct=0.2, clusters.use=as.character(levels(object@ident)))
	# object_avg_expr <- AverageExpression(object)
	# ExpressionTableSpecies1 <- object_avg_expr[!rownames(object_avg_expr) %in% object_noisy_genes,]

	NoisyGenes <- function(object, min.pct=0.2, clusters.use) {
	clusters <- clusters.use
	genes.use = rownames(object@data)
	object.raw.data <- as.matrix(object@raw.data)
	pct.matrix = matrix(data=NA, nrow=length(genes.use), ncol=length(clusters))
	rownames(pct.matrix) <- genes.use
	colnames(pct.matrix) <- clusters
	thresh.min=0
	for (i in clusters){
	cells.cluster <- names(object@ident[object@ident==i])
	data.cluster <- object.raw.data[,colnames(object.raw.data) %in% cells.cluster]
	pct.cluster <- round(apply(object.raw.data[genes.use, cells.cluster, drop = F],1,function(x)return(length(x[x>thresh.min])/length(x))),3)
	pct.matrix[,i] <- pct.cluster
	}
	pct.max <- rowMaxs(pct.matrix)
	names(pct.max) <- genes.use
	noisy.genes <- names(pct.max[pct.max < min.pct])
	return(noisy.genes)
	}