This repository has been archived by the owner. It is now read-only.

Added check for number of available cores for multiprocessing
msbentsen committed Apr 26, 2019
1 parent efc5c99 commit ea51ea9
Showing 8 changed files with 113 additions and 20 deletions.
1 change: 1 addition & 0 deletions tobias/footprinting/ATACorrect.py
@@ -372,6 +372,7 @@ def run_atacorrect(args):

#Start correction/write cores
n_bigwig = len(key2file.values())
args.cores = check_cores(args.cores, logger)
writer_cores = min(n_bigwig, max(1,int(args.cores*0.1))) #at most one core per bigwig or 10% of cores (or 1)
worker_cores = max(1, args.cores - writer_cores)
logger.debug("Worker cores: {0}".format(worker_cores))
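The split above reserves at most one core per output bigwig, or 10% of the requested cores, for writers; workers get the remainder. A minimal standalone sketch of the arithmetic, assuming 8 requested cores and 4 bigwigs (values hypothetical):

n_bigwig = 4
cores = 8
writer_cores = min(n_bigwig, max(1, int(cores * 0.1)))  # min(4, max(1, 0)) -> 1
worker_cores = max(1, cores - writer_cores)             # 8 - 1 -> 7
print(writer_cores, worker_cores)                       # 1 7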
2 changes: 2 additions & 0 deletions tobias/footprinting/BINDetect.py
@@ -137,6 +137,7 @@ def run_bindetect(args):
logger.output_files(outfiles)

# Setup pool
args.cores = check_cores(args.cores, logger)
writer_cores = max(1, int(args.cores*0.1))
worker_cores = max(1, args.cores - writer_cores)
logger.debug("Worker cores: {0}".format(worker_cores))
@@ -267,6 +268,7 @@ def run_bindetect(args):
#Get threshold for motifs
logger.debug("Getting match threshold per motif")
outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.motif_pvalue]))

motif_list = MotifList(outlist)
for motif in motif_list:
logger.debug("Motif {0}: threshold {1}".format(motif.name, motif.threshold))
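itertools.product(motif_list, [args.motif_pvalue]) pairs every motif with the single p-value, so starmap can unpack each pair into the two arguments of get_threshold. A self-contained sketch of that pattern, with a stand-in function in place of OneMotif.get_threshold (motif names hypothetical):

import itertools
from multiprocessing import Pool

def get_threshold(motif, pvalue):
    # stand-in for OneMotif.get_threshold; just tags the motif with its p-value
    return (motif, pvalue)

if __name__ == "__main__":
    motifs = ["CTCF", "GATA1", "TP53"]
    with Pool(2) as pool:
        # product(motifs, [0.0001]) yields one (motif, pvalue) tuple per motif
        out = pool.starmap(get_threshold, itertools.product(motifs, [0.0001]))
    print(out)  # [('CTCF', 0.0001), ('GATA1', 0.0001), ('TP53', 0.0001)]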
1 change: 1 addition & 0 deletions tobias/footprinting/footprint_scores.py
@@ -193,6 +193,7 @@ def run_footprinting(args):
regions_chunks = regions.chunks(args.split)

#Setup pools
args.cores = check_cores(args.cores, logger)
writer_cores = 1
worker_cores = max(1, args.cores - writer_cores)
logger.debug("Worker cores: {0}".format(worker_cores))
53 changes: 34 additions & 19 deletions tobias/misc/create_network.py
@@ -38,7 +38,7 @@ def add_network_arguments(parser):

#Required arguments
required = parser.add_argument_group('Required arguments')
required.add_argument('--TFBS', metavar="", help="TFBS folder from bindetect") #coordinates in .bed format with TF name in 4th column and a column containing target genes (set using --target)")
required.add_argument('--TFBS', metavar="", help="TFBS folder from bindetect")
required.add_argument('--origin', metavar="", help="File of origins of TF to genes")

#Optional arguments
@@ -53,7 +53,6 @@ def add_network_arguments(parser):
return(parser)

#--------------------------------------------------------------------------------#

def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
last_node = path[-1]
if last_node in adjacency:
@@ -65,12 +64,15 @@ def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
pass #next target without saving path
else:
#if any time overlaps with this or next stage
allowed = set([timeline[-1], timeline[-1] +1])
allowed = set([timeline[-1], timeline[-1]+1])
observed = adjacency[last_node]["targets"][target_node]["bound_in"]
if len(allowed.intersection(observed)) > 0:
new_path = path + [target_node]
new_timeline = timeline + [min(observed)]
all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
observed_in_allowed = allowed.intersection(observed)
if len(observed_in_allowed) > 0:
current_timepoint = max(observed_in_allowed)
if timeline.count(current_timepoint) < 2: #at most two binding events are allowed per timepoint
new_path = path + [target_node]
new_timeline = timeline + [current_timepoint]
all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
else:
if len(path) > 2:
#print("no targets for {0} - ending path".format(last_node))
@@ -123,9 +125,10 @@ def run_network(args):
print("Subset to {0} files".format(len(overview_files)))

#Read all edges to table
#todo: read in parallel
print("Reading all overview tables")
dataframes = []
for fil in overview_files[:5]:
for fil in overview_files[:40]:
print(fil)

df = pd.read_csv(fil, sep="\t")
@@ -179,8 +182,7 @@ def run_network(args):

print("Reading expression data")
#Read expression values
expression_threshold = 5
expression_dict = {}
expression_threshold = 50
if args.expression != None:
expression_table = pd.read_csv(args.expression, sep="\t", index_col=0)
expression_table.columns = [col.lower() for col in expression_table.columns]
@@ -192,10 +194,20 @@ def run_network(args):
sites.fillna(0, inplace=True)
sites["recipient_expressed_in"] = sites[expression_table.columns].apply(lambda x: ",".join(x.index[x > expression_threshold]), axis=1)
sites.drop(columns=expression_table.columns, inplace=True)
expression_dict = expression_table.to_dict()
else:
#All are expressed in all conditions
all_conditions = ",".join(condition_names)
sites["recipient_expressed_in"] = all_conditions

#expression_table = "" #rows, cols, values > expression_threshold
expression_dict = expression_table.to_dict()

#--------------------------------------------#

print(condition_names)
conditions = condition_names

#Subset edges to those where donor_bound_in is part of recipient_expressed_in
sites["donor_bound_in"] = sites["donor_bound_in"].apply(lambda x: x.split(","))
sites["recipient_expressed_in"] = sites["recipient_expressed_in"].apply(lambda x: x.split(","))
@@ -207,7 +219,7 @@ def run_network(args):
print(sites.shape[0])
sites = sites[sites.apply(lambda x: x["donor_bound_in"] in x["recipient_expressed_in"], axis=1)]
print(sites.shape[0])
print(sites)
#print(sites)

##### Write out edges
edges_f = os.path.join(args.output, "edges.txt")
@@ -218,18 +230,20 @@ def run_network(args):
all_donor_ids = set(list(sites["donor_id"]))

adjacency = {donor: {"targets":{}} for donor in all_donor_ids}

for index, row in sites.iterrows():

donor, recipient, bound_in = row["donor_id"], row["recipient_id"], row["donor_bound_in"]
if recipient not in adjacency[donor]["targets"]:
adjacency[donor]["targets"][recipient] = {"bound_in": []}

#adjacency[donor]["targets"]adjacency[donor]["targets"].get(recipient, {recipient: {"bound_in":[]}})
#if "bound_in" not in adjacency[donor]["targets"][recipient]:
# adjacency[donor]["targets"][recipient]["bound_in"] = []

adjacency[donor]["targets"][recipient]["bound_in"].append(bound_in)
if bound_in not in adjacency[donor]["targets"][recipient]["bound_in"]:
adjacency[donor]["targets"][recipient]["bound_in"].append(bound_in)

#Convert donor_bound_in to integer timeline
#print(adjacency)
for donor in adjacency:
for recipient in adjacency[donor]["targets"]:
adjacency[donor]["targets"][recipient]["bound_in"] = [condition_names.index(cond) for cond in adjacency[donor]["targets"][recipient]["bound_in"]]
#print(adjacency)

#Create possible paths through graph
paths_f = os.path.join(args.output, "paths.txt")
@@ -248,7 +262,8 @@ def run_network(args):
#String formatting of path
str_paths = ""
for i, node in enumerate(path[:-1]):
str_paths += "{0} --({1})--> ".format(id2name[node][0], conditions[timeline[i+1]])
in_condition = conditions[timeline[i+1]]
str_paths += "{0} --(Bound: {1}, target_expr: {2})--> ".format(id2name[node][0], expression_dict[node], conditions[in_condition])
str_paths += id2name[path[-1]][0]

#Number of nodes
2 changes: 1 addition & 1 deletion tobias/motifs/tfbscan.py
@@ -75,7 +75,7 @@ def motif_scanning(regions, args, motifs_obj):
region_TFBS = motifs_obj.scan_sequence(seq, region) #Scan sequence

#Check format of region chromosome and convert sites if needed
m = re.match("(.+)\:([0-9]+)\-([0-9]+)\s+.+", region.chrom)
m = re.match(r"(.+)\:([0-9]+)\-([0-9]+)\s+.+", region.chrom)
if m:
reg_chrom, reg_start, reg_end = m.group(1), m.group(2), m.group(3)
for TFBS in region_TFBS:
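The r-prefix keeps \: and \- as literal backslash sequences for the regex engine; in a plain string they are invalid escapes and trigger a DeprecationWarning on newer Pythons. A quick illustration (region string hypothetical):

import re

m = re.match(r"(.+)\:([0-9]+)\-([0-9]+)\s+.+", "chr1:1000-2000 peak_1")
if m:
    print(m.group(1), m.group(2), m.group(3))  # chr1 1000 2000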
63 changes: 63 additions & 0 deletions tobias/utils/filter_important_factors.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
from argparse import ArgumentParser
import heapq

#--------------------------------------------------------------------------------------------------------------#
def get_important(args):

file = args.file_in #input file (bindetect_results.txt)
filter = args.filter #how many top binding factors per condition to select
file_out = args.file_out #name of the output file
list_file = [] #contains all lines of the input file
new_file = [] #holds the filtered lines

with open(file) as f: #open bindetect results
for i in f:
i = i.strip()
i = i.split('\t') #split the tab-separated line into fields
list_file.append(i)

index_list = [list_file[0].index(i) for i in list_file[0] if '_change' in i] #get the indices of the '_change' columns

important_values = [[max(heapq.nsmallest(filter,[float(a[i]) for a in list_file[1:]])), min(heapq.nlargest(filter,[float(a[i]) for a in list_file[1:]]))] for i in index_list]

#important_values holds, per '_change' column, the thresholds for the 'filter' smallest and largest values
for i in list_file[1:]:
for a,b in zip(index_list, important_values):
if float(i[a]) >= float(max(b)) or float(i[a]) <= float(min(b)): #keep the line if its change value passes either threshold
new_file.append(i) #important lines get appended to the new list
print(i[0]) #print to stdout for the nextflow pipeline
break #once the line is added, jump to the next input line

#build new tab-separated text file
book = {i:[] for i in list_file[0]}
[[book[key].append(value) for key,value in zip(list_file[0], i)] for i in new_file] #dict of header -> column values for the output dataframe
df = pd.DataFrame(book)
df.to_csv(file_out, sep='\t', index=False)
#--------------------------------------------------------------------------------------------------------#

def main():
parser = ArgumentParser()
parser.add_argument("-in", dest="file_in", type=str, help="Input file")
parser.add_argument("-filter", dest="filter", type=int, help="Filter")
parser.add_argument("-o", dest="file_out", type=str, help="Output file")
args = parser.parse_args()

get_important(args)

#--------------------------------------------------------------------------------------------------------#

if __name__ == '__main__':
main()
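A possible invocation against BINDetect output (file names hypothetical):

python filter_important_factors.py -in bindetect_results.txt -filter 5 -o filtered_factors.txt

For each '_change' column, heapq.nsmallest/nlargest pull the 'filter' most extreme values in either direction, and a row is kept as soon as one of its change values reaches those extremes.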
1 change: 1 addition & 0 deletions tobias/utils/motifs.py
@@ -17,6 +17,7 @@
import MOODS.parsers

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.text import TextPath
from matplotlib.patches import PathPatch
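Selecting the backend must happen before the first pyplot import, which is why mpl.use('Agg') sits between the two imports. Agg renders into raster buffers without a display, so plots can be written on headless servers. A minimal sketch of the same ordering (output file name hypothetical):

import matplotlib as mpl
mpl.use('Agg')                 # must come before importing pyplot
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])
fig.savefig("example.png")     # writes to file; no display needed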
10 changes: 10 additions & 0 deletions tobias/utils/utilities.py
@@ -28,6 +28,16 @@
#-------------------------------------------------------------------------------------------#
#----------------------------------- Multiprocessing ---------------------------------------#
#-------------------------------------------------------------------------------------------#
def check_cores(given_cores, logger):
""" Checks number of available cores and sets the used cores to <= available cores """

available_cores = mp.cpu_count()
if given_cores > available_cores:
logger.warning("Number of available cores is {0} but '--cores' is set to {1}. Setting '--cores' to {0}.\n".format(available_cores, given_cores))
return(available_cores)
else:
return(given_cores)


def run_parallel(FUNC, input_chunks, arguments, n_cores, logger):
"""
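A quick sketch of how the new guard behaves when more cores are requested than the machine has (logger here is a plain logging stand-in for TOBIAS's own logger):

import logging
import multiprocessing as mp

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("tobias")

requested = 999
cores = check_cores(requested, logger)  # warns, falls back to mp.cpu_count()
assert cores <= mp.cpu_count()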
