Commit
Testing network functionality
msbentsen committed Apr 11, 2019
1 parent 7d28fab commit 6c05106
Showing 2 changed files with 269 additions and 1 deletion.
2 changes: 1 addition & 1 deletion tobias/TOBIAS.py
@@ -41,7 +41,7 @@
from tobias.misc.subsample_bam import *
from tobias.misc.merge_pdfs import *
from tobias.misc.maxpos import *
-#from tobias.misc.create_network import *
+from tobias.misc.create_network import *
from tobias.misc.log2table import *

from tobias import __version__ as TOBIAS_VERSION
268 changes: 268 additions & 0 deletions tobias/misc/create_network.py
@@ -0,0 +1,268 @@
#!/usr/bin/env python

"""
CreateNetwork: Creates a TF-TF gene regulation network from transcription factor binding sites
@author: Mette Bentsen
@contact: mette.bentsen (at) mpi-bn.mpg.de
@license: MIT
"""

import os
import sys
import argparse
import pyBigWig
import numpy as np
import glob

#import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

#Utils from TOBIAS
from tobias.utils.utilities import *
from tobias.utils.logger import *


#--------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------#

def add_network_arguments(parser):

	parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90)
	description = ""
	parser.description = format_help_description("CreateNetwork", description)

	parser._action_groups.pop()	#pop -h

	#Required arguments
	required = parser.add_argument_group('Required arguments')
	required.add_argument('--TFBS', metavar="", help="TFBS folder from BINDetect (.bed coordinates with the TF name in the 4th column and a column containing target genes)")
	required.add_argument('--origin', metavar="", help="File mapping TF motif names to their origin genes")

	#Optional arguments
	optional = parser.add_argument_group('Optional arguments')
	optional.add_argument('--subset', metavar="", help="File containing a subset of motif names to filter on")
	optional.add_argument('--TFBS_columns', metavar="", nargs="*", help="Donor -> recipient columns", default=["TFBS_name", "gene_id"])
	optional.add_argument('--origin_columns', metavar="", nargs="*", help="Name, id", default=["TOBIAS_motif_name", "ensembl_gene_id"])

	optional.add_argument('--expression', metavar="", help="Expression of each gene per timepoint/condition")
	optional.add_argument('--output', metavar="", help="Path to output directory (default: tobias_network)", default="tobias_network")

	return(parser)

#--------------------------------------------------------------------------------#

def dfs(adjacency, path, timeline, all_paths=None):
	""" Depth-first search for all paths through the adjacency graph consistent with the timeline """

	#Initialize the container on the first call (a mutable default argument would persist across calls)
	if all_paths is None:
		all_paths = {"paths": [], "timelines": []}

	last_node = path[-1]
	if last_node in adjacency:
		target_nodes = adjacency[last_node].get("targets", {})
		if len(target_nodes) > 0:
			for target_node in target_nodes:
				if target_node in path:	#cyclic; go to next target without saving path
					continue

				#Only extend the path if any binding time overlaps with this or the next stage
				allowed = set([timeline[-1], timeline[-1] + 1])
				observed = adjacency[last_node]["targets"][target_node]["bound_in"]
				if len(allowed.intersection(observed)) > 0:
					new_path = path + [target_node]
					new_timeline = timeline + [min(observed)]
					all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
		else:
			#No targets for last_node - end the path here
			if len(path) > 2:
				all_paths["paths"] += [path]
				all_paths["timelines"] += [timeline]
	else:
		#last_node is not a donor in the graph - end the path here
		if len(path) > 2:
			all_paths["paths"] += [path]
			all_paths["timelines"] += [timeline]

	return all_paths
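#Minimal sketch of the adjacency structure dfs() expects (hypothetical gene ids;
#"bound_in" holds integer condition indices, as built in run_network below):
#
#  adjacency = {"geneA": {"targets": {"geneB": {"bound_in": [0]},
#                                     "geneC": {"bound_in": [1]}}},
#               "geneB": {"targets": {"geneC": {"bound_in": [1]}}}}
#  dfs(adjacency, path=["geneA"], timeline=[-1])
#  #=> {"paths": [["geneA", "geneB", "geneC"]], "timelines": [[-1, 0, 1]]}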


#--------------------------------------------------------------------------------#
def run_network(args):

	make_directory(args.output)

	#-------------------------- Origin file translating motif name -> gene origin -----------------------------------#
	#Translation file, where one motif can stem from more than one gene (e.g. dimer motifs such as JUN::FOS)
	#and several genes can encode transcription factors with the same motif (close family members with the same target sequence)
	origin_table = pd.read_csv(args.origin, sep="\t")
	origin_table = origin_table[args.origin_columns]

	TFBS_donor, TFBS_recipient = args.TFBS_columns
	origin_name, origin_gene = args.origin_columns

	#Mapping from gene id to list of motif names
	id2name = {gene_id: names_list for gene_id, names_list in origin_table.groupby(origin_gene)[origin_name].apply(list).iteritems()}

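	#Example of the expected origin file (tab-separated; values are hypothetical -
	#one motif can map to several genes, as for dimer motifs like JUN::FOS):
	#
	#  TOBIAS_motif_name	ensembl_gene_id
	#  JUN::FOS	ENSG00000177606
	#  JUN::FOS	ENSG00000170345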

	#---------------------------------------------- BINDetect results --------------------------------------------#

	#Get all overview files from TFBS dir
	print("Getting files from {0}".format(args.TFBS))
	overview_files = glob.glob(args.TFBS + "/*/*_overview.txt")
	print("- Found results from {0} motifs".format(len(overview_files)))

	#Subset on names
	if args.subset is not None:
		print("Subsetting on names from {0}".format(args.subset))
		subset_names = open(args.subset).read().rstrip().split()
		print("Subset: {0}".format(subset_names))

		overview_files = list(set(sum(match_lists([subset_names, overview_files])[0], [])))
		print("Subset to {0} files".format(len(overview_files)))

	#Read all edges to table
	print("Reading all overview tables")
	conditions = []		#ordered list of condition names across all files
	dataframes = []
	for fil in overview_files[:5]:	#NOTE: restricted to the first 5 files while testing this functionality

		print(fil)
		df = pd.read_csv(fil, sep="\t")

		#Translate donors to origin genes
		print("Translating to origin genes")
		df = pd.merge(df, origin_table, how="left", left_on=TFBS_donor, right_on=origin_name)
		df.rename(columns={TFBS_donor: "donor_name", origin_gene: "donor_id", origin_name: "origin_table_" + origin_name}, inplace=True)

		#Check whether the donor could be translated - otherwise it cannot be included in the network
		not_found = df[df["donor_id"].isnull()]
		if not_found.shape[0] > 0:
			print("- Could not find origin gene id for motif {0}".format(not_found["donor_name"].iloc[0]))
			continue

		#Translate recipient ids to names
		df = pd.merge(df, origin_table, how="left", left_on=TFBS_recipient, right_on=origin_gene)
		df.rename(columns={origin_name: "recipient_name", origin_gene: "recipient_id"}, inplace=True)

		#Only include recipients which are also TFs
		print("- Total of {0} sites".format(df.shape[0]))
		df = df[df["recipient_name"].notnull()]
		print("- {0} sites have TF targets".format(df.shape[0]))

		#Find conditions from the "_bound" columns
		bound_columns = [col for col in df.columns if "_bound" in col]
		condition_names = [col.replace("_bound", "").lower() for col in bound_columns]
		for name in condition_names:
			if name not in conditions:
				conditions.append(name)
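		#E.g. (hypothetical column names): overview columns "timepoint1_bound" and
		#"timepoint2_bound" yield the conditions ["timepoint1", "timepoint2"]; the
		#column order is assumed to reflect the chronological order of conditions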

		#Remove sites not bound in any condition
		print("- Removing unbound sites")
		df = df[df[bound_columns].any(axis=1)]

		#Rename bound_columns and create "donor_bound_in"
		print("- List of bound conditions")
		df.rename(columns=dict(zip(bound_columns, condition_names)), inplace=True)
		df["donor_bound_in"] = df[condition_names].apply(lambda x: ",".join(x.index[x == 1]), axis=1)

		#Select columns
		selected = ["TFBS_chr", "TFBS_start", "TFBS_end", "donor_name", "donor_id", "recipient_name", "recipient_id", "donor_bound_in"]
		df = df[selected]

		#Add to list
		dataframes.append(df)

	print("Joining dataframes")
	sites = pd.concat(dataframes).reset_index(drop=True)	#reset index so each site is unique for the expansion below
	print("Total of {0} sites found".format(sites.shape[0]))


	#------------------------------------- Expression info to subset edges -----------------------------------#

	#Read expression values
	print("Reading expression data")
	expression_threshold = 5	#genes with expression above this value are considered expressed
	if args.expression is not None:
		expression_table = pd.read_csv(args.expression, sep="\t", index_col=0)
		expression_table.columns = [col.lower() for col in expression_table.columns]

		#Add expression of target genes
		sites = pd.merge(sites, expression_table, how="left", left_on="recipient_id", right_index=True)

		#Set null columns to 0 and flag the conditions in which each recipient is expressed
		sites.fillna(0, inplace=True)
		sites["recipient_expressed_in"] = sites[expression_table.columns].apply(lambda x: ",".join(x.index[x > expression_threshold]), axis=1)
		sites.drop(columns=expression_table.columns, inplace=True)
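		#Example of the expected expression table (tab-separated; genes as index, one
		#column per condition; values are hypothetical):
		#
		#  gene_id	timepoint1	timepoint2
		#  ENSG00000177606	10.2	3.1
		#  ENSG00000170345	0.0	8.5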


	#--------------------------------------------#

	#Expand donor_bound_in into one row per bound condition
	sites["donor_bound_in"] = sites["donor_bound_in"].apply(lambda x: x.split(","))
	exploded = sites["donor_bound_in"].apply(pd.Series).stack().reset_index().rename(columns={0: "donor_bound_in_exploded"})
	sites = pd.merge(sites, exploded, left_index=True, right_on="level_0", how="left").drop(columns=["level_0", "level_1", "donor_bound_in"]).rename(columns={"donor_bound_in_exploded": "donor_bound_in"})

	#Subset edges to those where the donor-bound condition is among the conditions in which the recipient is expressed
	if args.expression is not None:
		sites["recipient_expressed_in"] = sites["recipient_expressed_in"].apply(lambda x: x.split(","))

		print("- {0} sites before subsetting on expression".format(sites.shape[0]))
		sites = sites[sites.apply(lambda x: x["donor_bound_in"] in x["recipient_expressed_in"], axis=1)]
		print("- {0} sites left after subsetting on expression".format(sites.shape[0]))

	##### Write out edges #####
	edges_f = os.path.join(args.output, "edges.txt")
	sites.to_csv(edges_f, sep="\t", index=False)
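	#Example rows in edges.txt (tab-separated; values are hypothetical, and
	#"recipient_expressed_in" is only present when --expression is given):
	#
	#  TFBS_chr	TFBS_start	TFBS_end	donor_name	donor_id	recipient_name	recipient_id	donor_bound_in
	#  chr1	1000	1015	JUN	ENSG00000177606	FOS	ENSG00000170345	timepoint1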

	###### Create adjacency list ######
	print("Creating adjacency list")
	all_donor_ids = set(sites["donor_id"])

	adjacency = {donor: {"targets": {}} for donor in all_donor_ids}

	for index, row in sites.iterrows():

		donor, recipient, bound_in = row["donor_id"], row["recipient_id"], row["donor_bound_in"]
		if recipient not in adjacency[donor]["targets"]:
			adjacency[donor]["targets"][recipient] = {"bound_in": []}

		#Store the bound condition as an index into the global conditions list (dfs compares integer timepoints)
		adjacency[donor]["targets"][recipient]["bound_in"].append(conditions.index(bound_in))

	#Create possible paths through the graph
	paths_f = os.path.join(args.output, "paths.txt")
	paths_out = open(paths_f, "w")
	paths_out.write("Regulatory_path\tn_nodes\tn_timepoints\n")
	for gene_id in all_donor_ids:
		print("Paths starting from {0}".format(gene_id))

		paths = dfs(adjacency=adjacency, path=[gene_id], timeline=[-1])	#first node must be bound in the first condition

		print("Writing paths")
		for i, path in enumerate(paths["paths"]):

			timeline = paths["timelines"][i]

			#String formatting of path
			str_path = ""
			for j, node in enumerate(path[:-1]):
				str_path += "{0} --({1})--> ".format(id2name[node][0], conditions[timeline[j+1]])
			str_path += id2name[path[-1]][0]

			#Number of nodes and distinct timepoints
			n_nodes = len(path)
			n_timepoints = len(set(timeline)) - 1	#-1 for the -1-entry in timeline
			paths_out.write("{0}\t{1}\t{2}\n".format(str_path, n_nodes, n_timepoints))

	paths_out.close()
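	#Example line in paths.txt (hypothetical TFs and conditions): a path JUN -> FOS -> MYC
	#bound at timepoint1 and timepoint2 would be written as:
	#
	#  JUN --(timepoint1)--> FOS --(timepoint2)--> MYC	3	2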

	#Sort paths by number of nodes and timepoints (note: the $'\t' separator requires a bash-compatible shell)
	paths_sorted_f = os.path.join(args.output, "paths_sorted.txt")
	call = "sort -k2,2nr -k3,3nr -t$'\\t' {0} | uniq -u > {1}".format(paths_f, paths_sorted_f)
	print(call)
	os.system(call)

	print("done")
