From 02c98f81dc7414c16801b296c9cf73e1df260721 Mon Sep 17 00:00:00 2001
From: msbentsen
Date: Fri, 9 Aug 2019 10:44:44 +0200
Subject: [PATCH] Progress on network; cleaned up input & output

---
 tobias/misc/create_network.py | 289 +++++++++++++---------------------
 1 file changed, 106 insertions(+), 183 deletions(-)

diff --git a/tobias/misc/create_network.py b/tobias/misc/create_network.py
index 95d26bd..de5a393 100644
--- a/tobias/misc/create_network.py
+++ b/tobias/misc/create_network.py
@@ -25,6 +25,7 @@ from tobias.utils.logger import *
 
+#--------------------------------------------------------------------------------#
 #--------------------------------------------------------------------------------#
 #--------------------------------------------------------------------------------#
@@ -38,51 +39,48 @@ def add_network_arguments(parser):
 
 	#Required arguments
 	required = parser.add_argument_group('Required arguments')
-	required.add_argument('--TFBS', metavar="", help="File(s) containing TFBS to create network from")
+	required.add_argument('--TFBS', metavar="", help="File(s) containing TFBS to create network from", nargs="*")
 	required.add_argument('--origin', metavar="", help="File containing gene origins of TF <-> gene")
 
 	#Optional arguments
 	optional = parser.add_argument_group('Optional arguments')
-	optional.add_argument('--subset', metavar="", help="File containing subset of names to filter on")
-	optional.add_argument('--TFBS_columns', metavar="", nargs="*", help="Source TF -> target gene columns", default=["TFBS_name", "gene_id"])
-	optional.add_argument('--origin_columns', metavar="", nargs="*", help="Source TF, source gene", default=["TOBIAS_motif_id", "ensembl_gene_id"])
-
+	optional.add_argument('--start', metavar="", help="Name of node to start paths from")
+	optional.add_argument('--max-len', metavar="", type=int, help="Maximum number of nodes per path (default: 3)", default=3)
 	#optional.add_argument('--additional', metavar="", help="Additional information on genes to add; for example RNA-seq")
+	#optional.add_argument('--subset', metavar="", help="File containing subset of names to filter on")
+	#optional.add_argument('--TFBS_columns', metavar="", nargs="*", help="Source TF -> target gene columns", default=["TFBS_name", "gene_id"])
+	#optional.add_argument('--origin_columns', metavar="", nargs="*", help="Source TF, source gene", default=["TOBIAS_motif_id", "ensembl_gene_id"])
+	optional.add_argument('--output', metavar="", help="Path to output directory (default: tobias_network)", default="tobias_network")
 
 	return(parser)
 
 #--------------------------------------------------------------------------------#
-def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
+def dfs(adjacency, path, all_paths=None, options={"max_length":4}):
+
+	#None-sentinel instead of a mutable default list; a shared default would leak paths across top-level calls
+	if all_paths is None:
+		all_paths = []
 
 	last_node = path[-1]
 	if last_node in adjacency:
 		target_nodes = adjacency[last_node].get("targets", [])
 		if len(target_nodes) > 0:
+
+			#Go through all targets and get paths
 			for target_node in target_nodes:
-				#print("node {0} has target node {1}".format(last_node, target_node))
 				if target_node in path:	#cyclic
-					pass	#next target without saving path
+					new_path = path + [target_node]
+					all_paths += [new_path]
 				else:
-					#if any time overlaps with this or next stage
-					allowed = set([timeline[-1], timeline[-1]+1])
-					observed = adjacency[last_node]["targets"][target_node]["bound_in"]
-					observed_in_allowed = allowed.intersection(observed)
-					if len(observed_in_allowed) > 0:
-						current_timepoint = max(observed_in_allowed)
-						if timeline.count(current_timepoint) < 2:	#only two binding events in the same timepoint is possible
-							new_path = path + [target_node]
-							new_timeline = timeline + [current_timepoint]
-							all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
+					new_path = path + [target_node]
+
+					if len(new_path) == options["max_length"]:
+						all_paths += [new_path]	#Save current path
+					else:
+						all_paths = dfs(adjacency, new_path, all_paths, options)	#Find targets of last node in path
 		else:
-			if len(path) > 2:
-				#print("no targets for {0} - ending path".format(last_node))
-				all_paths["paths"] += [path]
-				all_paths["timelines"] += [timeline]
+			all_paths += [path]
+
 	else:
-		if len(path) > 2:
-			#print("{0} doesnt have targets".format(last_node))
-			all_paths["paths"] += [path]
-			all_paths["timelines"] += [timeline]
+			all_paths += [path]
 
 	return all_paths
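The new dfs() above enumerates regulatory paths by plain depth-first search: a path is saved either when it closes a cycle or when it reaches options["max_length"] nodes. A minimal sketch of the behaviour, using a hypothetical toy adjacency dict in the same {source: {"targets": [...]}} format that run_network() builds below (node names are illustrative only):

    #Toy adjacency: A regulates B; B regulates A (cycle) and C
    adjacency = {"A": {"targets": ["B"]},
                 "B": {"targets": ["A", "C"]}}

    paths = dfs(adjacency, path=["A"], options={"max_length": 3})
    print(paths)
    #[['A', 'B', 'A'], ['A', 'B', 'C']]
    #'A -> B -> A' is saved as soon as the cycle closes; 'A -> B -> C' is cut off at max_length

Because all_paths defaults to None and a fresh list is created per top-level call, repeated calls (one per start node in run_network below) do not share state.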
@@ -97,204 +95,129 @@ def run_network(args):
 
 	#translation file, where one motif can constitute more than one gene (jun::fos)
 	#and more genes can encode transcription factors with same motifs (close family members with same target sequence)
 	origin_table = pd.read_csv(args.origin, sep="\t")
-	#origin_table = origin_table[args.origin_columns]
-	print(origin_table)
 
-	TFBS_donor, TFBS_recipient = args.TFBS_columns
-	origin_name, origin_gene = args.origin_columns
+	#------------------------ Transcription factor binding sites with mapping to target genes -----------------------#
 
-	#id2name = {gene_id: names_list for gene_id, names_list in origin_table.groupby(origin_gene)[origin_name].apply(list).iteritems()}
-
-	#---------------------------------------------- BINDetect results --------------------------------------------#
-
-	#Get all overview files from TFBS dir
-	print("Getting files from {0}".format(args.TFBS))
-	TF_folders = list(os.walk(args.TFBS))[0][1]	#list of folder names = TF names
-	overview_files = [f for f in glob.glob(args.TFBS + "/*/*_overview.txt")]
-	print("- Found results from {0} motifs".format(len(overview_files)))
-
-	#Subset on names
-	if args.subset != None:
-		print("Subsetting on names from {0}".format(args.subset))
-		subset_names = open(args.subset).read().rstrip().split()
-		print("Subset: {0}".format(subset_names))
-
-		overview_files = list(set(sum(match_lists([subset_names, overview_files])[0], [])))
-		print("Subset to {0} files".format(len(overview_files)))
-
-	#Read all edges to table
-	#find donor col in origin_table
-	#origin_name_col = 
+	print("Reading all binding sites into one table")	#todo: read in parallel
 
-	#print(overview_files)
-
-	print("Reading all overview tables")
 	dataframes = []
-	for fil in overview_files[:30]:
+	for fil in args.TFBS:
 		print("- {0}".format(fil))
-		df = pd.read_csv(fil, sep="\t")
-
-		#Translating donors to origin genes
-		print("Translating to origin genes")
-		df = pd.merge(df, origin_table, how="left", left_on=TFBS_donor, right_on=origin_name)
-		df.rename(columns={TFBS_donor: "donor_name", origin_gene: "donor_id", origin_name: "origin_table_" + origin_name}, inplace=True)
-
-		#Check if donor could be translated -> otherwise it cannot be included in network
-		not_found = df[df["donor_id"].isnull()]
-		if not_found.shape[0] > 0:
-			print("- Could not find origin gene id for motif {0}".format(not_found["donor_name"][0]))
-			continue
-
-		#Translating recipient ids to names
-		df = pd.merge(df, origin_table, how="left", left_on=TFBS_recipient, right_on=origin_gene)
-		df.rename(columns={origin_name: "recipient_name", origin_gene: "recipient_id"}, inplace=True)
-
-		#Only include recipients which are also TFS
-		print("- Total of {0} sites".format(df.shape[0]))
-		df = df[df["recipient_name"].notnull()]
-		print("- {0} sites have TF targets".format(df.shape[0]))
-
-		#Find conditions
-		bound_columns = [col for col in df.columns if "_bound" in col]
-		condition_names = [col.replace("_bound", "").lower() for col in bound_columns]
-
-		#Remove sites not bound in any condition
-		print("- Removing unbound sites")
-		df = df[df[bound_columns].any(axis=1)]
-
-		#Rename bound_columns and create "donor_bound_in"
-		print("- List of bound conditions")
-		df.rename(columns=dict(zip(bound_columns, condition_names)), inplace=True)
-		df["donor_bound_in"] = df[condition_names].apply(lambda x: ",".join(x.index[x == 1]), axis=1)
-
-		#Select columns
-		#selected = ["TFBS_chr", "TFBS_start", "TFBS_end", "donor_name", "donor_id", "recipient_name", "recipient_id", "donor_bound_in"]
-		#df = df[selected]
-
-		#Add to list
+		df = pd.read_csv(fil, sep="\t", header=None)
 		dataframes.append(df)
 
 	print("Joining dataframes")
-	sites = pd.concat(dataframes)
-	print("Total of {0} sites found".format(sites.shape))
-
-
-	#------------------------------------- Expression info to subset edges -----------------------------------#
-	"""
-	print("Reading expression data")
-	#Read expression values
-	expression_threshold = 50
-	if args.expression != None:
-		expression_table = pd.read_csv(args.expression, sep="\t", index_col=0)
-		expression_table.columns = [col.lower() for col in expression_table.columns]
-
-		#Add expression of target genes
-		sites = pd.merge(sites, expression_table, how="left", left_on="recipient_id", right_index=True)
-
-		#set null columns to 0
-		sites.fillna(0, inplace=True)
-		sites["recipient_expressed_in"] = sites[expression_table.columns].apply(lambda x: ",".join(x.index[x > expression_threshold]), axis=1)
-		sites.drop(columns=expression_table.columns, inplace=True)
-		expression_dict = expression_table.to_dict()
-	else:
-		#All are expressed in all conditions
-		all_conditions = ",".join(condition_names)
-		sites["recipient_expressed_in"] = all_conditions
+	sites_table = pd.concat(dataframes, sort=False)
+	print("Total of {0} sites found".format(sites_table.shape[0]))
 
-		#expression_table = ""	#rows, cols, values > expression_threshold
-		expression_dict = expression_table.to_dict()
-	"""
-	#--------------------------------------------#
+	#------------------------------------- Match target columns to origin ------------------------------------#
+	print("Matching target genes back to TFs")
+	origin_table_str_columns = list(origin_table.dtypes.index[origin_table.dtypes == "object"])
+	sites_table_str_columns = list(sites_table.dtypes.index[sites_table.dtypes == "object"])
+	origin_table = origin_table.apply(lambda x: x.astype(str).str.upper())
+	sites_table = sites_table.apply(lambda x: x.astype(str).str.upper())
 
+	#Establishing matched columns
+	matching = []
+	for sites_column in sites_table_str_columns:
+		sites_column_content = set(sites_table[sites_column])
-	all_source_names = set(sites["donor_name"])
-	print(all_source_names)
+		for origin_column in origin_table_str_columns:
+			origin_column_content = set(origin_table[origin_column])
-	print(sites.shape[0])
+			#Overlap
+			overlap = len(origin_column_content & sites_column_content)
+			matching.append((sites_column, origin_column, overlap))
-	sites = sites[(sites["recipient_name"].isin(all_source_names))]
-	print(sites.shape[0])
+	sorted_matching = sorted(matching, key = lambda tup: -tup[-1])
-	#--------------------------------------------#
 
+	#Columns for matching
+	source_id = 3	#the TF name is assumed to sit in column 3 (0-based) of the BED-like sites tables
+	source_id_origin = [match[1] for match in sorted_matching if match[0] == 3][0]	#source id in the origin table
+	target_id = [match[0] for match in sorted_matching if match[0] != 3][0]	#target id from sites
+	target_id_origin = [match[1] for match in sorted_matching if match[0] != 3][0]	#target id in the origin table
+	print(source_id_origin)
+	print(target_id)
+	print(target_id_origin)
+
+	#Intersect of sources and targets
+	all_source_ids = set(sites_table[source_id])
+	all_target_ids = set(origin_table[source_id_origin])
+	common_ids = all_source_ids & all_target_ids
+	print(common_ids)
-	print(condition_names)
-	conditions = condition_names
+	#Add target motif id to sites
+	origin_table_sub = origin_table[origin_table[source_id_origin].isin(common_ids)]
+	sites_table_convert = sites_table.merge(origin_table_sub[[source_id_origin, target_id_origin]], left_on=target_id, right_on=target_id_origin)
-	#Subset edges on those where donor_bound in is part of recipient_expressed_in
-	sites["donor_bound_in"] = sites["donor_bound_in"].apply(lambda x: x.split(","))
-	#sites["recipient_expressed_in"] = sites["recipient_expressed_in"].apply(lambda x: x.split(","))
-
-	#Expand the bound_in columns
-	exploded = sites["donor_bound_in"].apply(pd.Series).stack().reset_index().rename(columns={0:"donor_bound_in_exploded"})
-	sites = pd.merge(sites, exploded, left_index=True, right_on="level_0", how="left").drop(columns=["level_0", "level_1", "donor_bound_in"]).rename(columns={"donor_bound_in_exploded":"donor_bound_in"})
+	#--------------------------------------------#
-	#print(sites.shape[0])
-	#sites = sites[sites.apply(lambda x: x["donor_bound_in"] in x["recipient_expressed_in"], axis=1)]
-	#print(sites.shape[0])
-	#print(sites)
-
-	##### Write out edges
+	##### Write out edges #####
 	edges_f = os.path.join(args.output, "edges.txt")
-	sites.to_csv(edges_f, sep="\t", index=False)
+	sites_table_convert.to_csv(edges_f, sep="\t", index=False)
 
 	###### Create adjacency list ####
 	print("Creating adjacency mat")
-	all_donor_ids = set(list(sites["donor_id"]))
-
-	adjacency = {donor: {"targets":{}} for donor in all_donor_ids}
-	for index, row in sites.iterrows():
-		donor, recipient, bound_in = row["donor_id"], row["recipient_id"], row["donor_bound_in"]
-		if recipient not in adjacency[donor]["targets"]:
-			adjacency[donor]["targets"][recipient] = {"bound_in": []}
-
-		if bound_in not in adjacency[donor]["targets"][recipient]["bound_in"]:
-			adjacency[donor]["targets"][recipient]["bound_in"].append(bound_in)
-
-	#Convert donor_bound_in to integer timeline
-	#print(adjacency)
-	for donor in adjacency:
-		for recipient in adjacency[donor]["targets"]:
-			adjacency[donor]["targets"][recipient]["bound_in"] = [condition_names.index(cond) for cond in adjacency[donor]["targets"][recipient]["bound_in"]]
-	#print(adjacency)
+	adjacency = {source: {"targets":[]} for source in common_ids}
+	for index, row in sites_table_convert.iterrows():
+		source, target = row[source_id], row[source_id_origin]
+		if target not in adjacency[source]["targets"]:
+			adjacency[source]["targets"].append(target)
+
+	print("Writing out adjacency list")
+	adjacency_f = os.path.join(args.output, "adjacency.txt")
+	with open(adjacency_f, "w") as f:
+		for source in sorted(adjacency):
+			f.write("{0}\t{1}\n".format(source, ", ".join(adjacency[source]["targets"])))
 
 	#Create possible paths through graph
+	print("Create possible paths through graph")
 	paths_f = os.path.join(args.output, "paths.txt")
 	paths_out = open(paths_f, "w")
-	paths_out.write("Regulatory_path\tn_nodes\tn_timepoints\n")
+	paths_out.write("Regulatory_path\tn_nodes\n")
-	for gene_id in all_donor_ids:
-		print("paths starting from {0}".format(gene_id))
-		paths = dfs(adjacency = adjacency, path = [gene_id], timeline = [-1])	#first node must be bound in first condition
+	#Start node
+	if args.start is not None:
+		start_nodes = [one_id for one_id in common_ids if args.start.upper() in one_id]
+		print("Starting nodes are: {0}".format(start_nodes))
+	else:
+		start_nodes = common_ids
+
+	#Find paths per start node ('start_node' is kept distinct from 'source_id', which names the sites column above)
+	for start_node in start_nodes:
+		print("Paths starting from {0}".format(start_node))
+
+		paths = dfs(adjacency = adjacency, path = [start_node], options={"max_length": args.max_len})
 
 		print("Writing paths")
+		path_edges = []
-		for i, path in enumerate(paths["paths"]):
+		for path in paths:
-			timeline = paths["timelines"][i]
-
 			#String formatting of path
 			str_paths = ""
 			for i, node in enumerate(path[:-1]):
-				in_condition = conditions[timeline[i+1]]
-				str_paths += "{0} --(Bound: {1}, target_expr: {2})--> ".format(id2name[node][0], expression_dict[node], conditions[in_condition])
-			str_paths += id2name[path[-1]][0]
+				str_paths += "{0} --> ".format(node)
+			str_paths += path[-1]
 
-			#Number of nodes
 			n_nodes = len(path)
-			n_timepoints = len(set(timeline)) - 1	#-1 for the -1-entry in timeline
-			paths_out.write("{0}\t{1}\t{2}\n".format(str_paths, n_nodes, n_timepoints))
-
+			paths_out.write("{0}\t{1}\n".format(str_paths, n_nodes))
+
+			#Make pairwise edges across path
+			path_edges += ["\t".join([path[i], path[j], str(j)]) for i,j in zip(range(0, len(path)-1), range(1,len(path)))]
+
+		source_paths_f = os.path.join(args.output, "{0}_path_edges.txt".format(start_node))
+		with open(source_paths_f, "w") as f:
+			f.write("Source\tTarget\tLevel\n")
+			for edge in set(path_edges):
+				f.write(edge + "\n")
+
+	paths_out.close()
 
-	#Sort paths
-	paths_sorted_f = os.path.join(args.output, "paths_sorted.txt")
-	call = "sort -k2,2nr -k3,3nr -t$'\\t' {0} | uniq -u > {1}".format(paths_f, paths_sorted_f)
-	print(call)
-	os.system(call)
-
-	print("done")
-
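For reference, the column matching in run_network() pairs every string column of the concatenated sites table with the origin-table column whose set of values overlaps it most; the top-ranked pairs then define the source/target id columns. A self-contained sketch of that heuristic with hypothetical toy tables (the sites table carries integer column labels because the TFBS files are read with header=None; the gene ids below are invented):

    import pandas as pd

    #Toy stand-ins for the two tables; column names match the patch, values are toy data
    sites_table = pd.DataFrame({3: ["JUN", "FOS"], 9: ["ENSG01", "ENSG02"]})
    origin_table = pd.DataFrame({"TOBIAS_motif_id": ["JUN", "FOS"],
                                 "ensembl_gene_id": ["ENSG01", "ENSG02"]})

    #Score every (sites column, origin column) pair by the size of their value overlap
    matching = []
    for sites_column in sites_table.columns:
        sites_content = set(sites_table[sites_column].astype(str).str.upper())
        for origin_column in origin_table.columns:
            origin_content = set(origin_table[origin_column].astype(str).str.upper())
            matching.append((sites_column, origin_column, len(sites_content & origin_content)))

    #Best matches first; ties keep input order (sorted is stable)
    sorted_matching = sorted(matching, key=lambda tup: -tup[-1])
    print(sorted_matching)
    #[(3, 'TOBIAS_motif_id', 2), (9, 'ensembl_gene_id', 2), (3, 'ensembl_gene_id', 0), (9, 'TOBIAS_motif_id', 0)]

In this toy case, column 3 (the TFBS name) lines up with TOBIAS_motif_id and the gene column with ensembl_gene_id, which is exactly the pairing the source_id/target_id lookups in the patch select.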