diff --git a/tobias/footprinting/ATACorrect.py b/tobias/footprinting/ATACorrect.py
index 3dafff8..d01b70e 100644
--- a/tobias/footprinting/ATACorrect.py
+++ b/tobias/footprinting/ATACorrect.py
@@ -372,6 +372,7 @@ def run_atacorrect(args):
 
 	#Start correction/write cores
 	n_bigwig = len(key2file.values())
+	args.cores = check_cores(args.cores, logger)
 	writer_cores = min(n_bigwig, max(1,int(args.cores*0.1)))	#at most one core per bigwig or 10% of cores (or 1)
 	worker_cores = max(1, args.cores - writer_cores)
 	logger.debug("Worker cores: {0}".format(worker_cores))
diff --git a/tobias/footprinting/BINDetect.py b/tobias/footprinting/BINDetect.py
index a121556..ee6acfc 100644
--- a/tobias/footprinting/BINDetect.py
+++ b/tobias/footprinting/BINDetect.py
@@ -137,6 +137,7 @@ def run_bindetect(args):
 	logger.output_files(outfiles)
 
 	# Setup pool
+	args.cores = check_cores(args.cores, logger)
 	writer_cores = max(1, int(args.cores*0.1))
 	worker_cores = max(1, args.cores - writer_cores)
 	logger.debug("Worker cores: {0}".format(worker_cores))
@@ -267,6 +268,7 @@ def run_bindetect(args):
 
 	#Get threshold for motifs
 	logger.debug("Getting match threshold per motif")
 	outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.motif_pvalue]))
+	motif_list = MotifList(outlist)
 
 	for motif in motif_list:
 		logger.debug("Motif {0}: threshold {1}".format(motif.name, motif.threshold))
diff --git a/tobias/footprinting/footprint_scores.py b/tobias/footprinting/footprint_scores.py
index 87d8a92..8f521f0 100644
--- a/tobias/footprinting/footprint_scores.py
+++ b/tobias/footprinting/footprint_scores.py
@@ -193,6 +193,7 @@ def run_footprinting(args):
 	regions_chunks = regions.chunks(args.split)
 
 	#Setup pools
+	args.cores = check_cores(args.cores, logger)
 	writer_cores = 1
 	worker_cores = max(1, args.cores - writer_cores)
 	logger.debug("Worker cores: {0}".format(worker_cores))
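All three tools above now run the same two steps before building their pools: cap args.cores against the machine (via the new check_cores() helper added to tobias/utils/utilities.py further down in this diff), then carve the capped total into writer and worker cores. The following standalone sketch mirrors that split; split_cores() is a hypothetical name, not part of the patch:

import multiprocessing as mp

def split_cores(requested, n_bigwig=None):
	"""Hypothetical helper mirroring the patched pool setup."""
	total = min(requested, mp.cpu_count())	#what check_cores() effectively returns
	writers = max(1, int(total * 0.1))	#reserve ~10% of cores (at least 1) for writers
	if n_bigwig is not None:	#ATACorrect additionally caps writers at one per bigwig
		writers = min(n_bigwig, writers)
	workers = max(1, total - writers)	#everything else goes to workers
	return workers, writers

print(split_cores(100))	#on a 16-core machine: (15, 1)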
diff --git a/tobias/misc/create_network.py b/tobias/misc/create_network.py
index 50d4d70..9853c74 100644
--- a/tobias/misc/create_network.py
+++ b/tobias/misc/create_network.py
@@ -38,7 +38,7 @@ def add_network_arguments(parser):
 
 	#Required arguments
 	required = parser.add_argument_group('Required arguments')
-	required.add_argument('--TFBS', metavar="", help="TFBS folder from bindetect")	#coordinates in .bed format with TF name in 4th column and a column containing target genes (set using --target)")
+	required.add_argument('--TFBS', metavar="", help="TFBS folder from bindetect")
 	required.add_argument('--origin', metavar="", help="File of origins of TF to genes")
 
 	#Optional arguments
@@ -53,7 +53,6 @@ def add_network_arguments(parser):
 	return(parser)
 
 #--------------------------------------------------------------------------------#
-
def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
 	last_node = path[-1]
 	if last_node in adjacency:
@@ -65,12 +64,15 @@ def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
 				pass	#next target without saving path
 			else:
 				#if any time overlaps with this or next stage
-				allowed = set([timeline[-1], timeline[-1] +1])
+				allowed = set([timeline[-1], timeline[-1]+1])
 				observed = adjacency[last_node]["targets"][target_node]["bound_in"]
-				if len(allowed.intersection(observed)) > 0:
-					new_path = path + [target_node]
-					new_timeline = timeline + [min(observed)]
-					all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
+				observed_in_allowed = allowed.intersection(observed)
+				if len(observed_in_allowed) > 0:
+					current_timepoint = max(observed_in_allowed)
+					if timeline.count(current_timepoint) < 2:	#at most two binding events in the same timepoint are allowed
+						new_path = path + [target_node]
+						new_timeline = timeline + [current_timepoint]
+						all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
 	else:
 		if len(path) > 2:
 			#print("no targets for {0} - ending path".format(last_node))
@@ -123,9 +125,10 @@ def run_network(args):
 		print("Subset to {0} files".format(len(overview_files)))
 
 	#Read all edges to table
+	#todo: read in parallel
 	print("Reading all overview tables")
 	dataframes = []
-	for fil in overview_files[:5]:
+	for fil in overview_files[:40]:
 		print(fil)
 
 		df = pd.read_csv(fil, sep="\t")
@@ -179,8 +182,7 @@ def run_network(args):
 	print("Reading expression data")
 
 	#Read expression values
-	expression_threshold = 5
-	expression_dict = {}
+	expression_threshold = 50
 	if args.expression != None:
 		expression_table = pd.read_csv(args.expression, sep="\t", index_col=0)
 		expression_table.columns = [col.lower() for col in expression_table.columns]
@@ -192,10 +194,18 @@ def run_network(args):
 
 		sites.fillna(0, inplace=True)
 		sites["recipient_expressed_in"] = sites[expression_table.columns].apply(lambda x: ",".join(x.index[x > expression_threshold]), axis=1)
 		sites.drop(columns=expression_table.columns, inplace=True)
+		expression_dict = expression_table.to_dict()
+	else:
+		#All TFs are considered expressed in all conditions
+		all_conditions = ",".join(condition_names)
+		sites["recipient_expressed_in"] = all_conditions
+		expression_dict = {}	#no expression data given
 
 	#--------------------------------------------#
 
+	conditions = condition_names
+
 	#Subset edges on those where donor_bound in is part of recipient_expressed_in
 	sites["donor_bound_in"] = sites["donor_bound_in"].apply(lambda x: x.split(","))
 	sites["recipient_expressed_in"] = sites["recipient_expressed_in"].apply(lambda x: x.split(","))
@@ -207,7 +217,7 @@ def run_network(args):
 	print(sites.shape[0])
 	sites = sites[sites.apply(lambda x: x["donor_bound_in"] in x["recipient_expressed_in"], axis=1)]
 	print(sites.shape[0])
-	print(sites)
+	#print(sites)
 
 	##### Write out edges
 	edges_f = os.path.join(args.output, "edges.txt")
@@ -218,18 +228,18 @@ def run_network(args):
 
 	all_donor_ids = set(list(sites["donor_id"]))
 	adjacency = {donor: {"targets":{}} for donor in all_donor_ids}
-
 	for index, row in sites.iterrows():
-
 		donor, recipient, bound_in = row["donor_id"], row["recipient_id"], row["donor_bound_in"]
 		if recipient not in adjacency[donor]["targets"]:
 			adjacency[donor]["targets"][recipient] = {"bound_in": []}
 
-		#adjacency[donor]["targets"]adjacency[donor]["targets"].get(recipient, {recipient: {"bound_in":[]}})
-		#if "bound_in" not in adjacency[donor]["targets"][recipient]:
-		#	adjacency[donor]["targets"][recipient]["bound_in"] = []
-
-		adjacency[donor]["targets"][recipient]["bound_in"].append(bound_in)
+		if bound_in not in adjacency[donor]["targets"][recipient]["bound_in"]:
+			adjacency[donor]["targets"][recipient]["bound_in"].append(bound_in)
+
+	#Convert donor_bound_in to integer timeline
+	for donor in adjacency:
+		for recipient in adjacency[donor]["targets"]:
+			adjacency[donor]["targets"][recipient]["bound_in"] = [condition_names.index(cond) for cond in adjacency[donor]["targets"][recipient]["bound_in"]]
 
 	#Create possible paths through graph
 	paths_f = os.path.join(args.output, "paths.txt")
@@ -248,7 +258,9 @@ def run_network(args):
 			#String formatting of path
 			str_paths = ""
 			for i, node in enumerate(path[:-1]):
-				str_paths += "{0} --({1})--> ".format(id2name[node][0], conditions[timeline[i+1]])
+				in_condition = conditions[timeline[i+1]]
+				target_expr = expression_dict.get(in_condition, {}).get(path[i+1], "NA")	#expression of the target in the condition where it was bound
+				str_paths += "{0} --(Bound: {1}, target_expr: {2})--> ".format(id2name[node][0], in_condition, target_expr)
 			str_paths += id2name[path[-1]][0]
 
 			#Number of nodes
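The reworked dfs() threads a timeline through each path: an edge may only be followed if its target is bound in the current or the next timepoint, the latest such timepoint is appended to the timeline, and any timepoint may occur at most twice per path. Below is a self-contained restatement of the patched function on toy data; the TF names and bound_in values are invented, and bound_in is assumed to already hold integer timepoints, as produced by the conversion loop above. Note that the mutable default argument in the original signature is shared across calls in Python, so the restatement passes a fresh dict explicitly:

def dfs(adjacency, path, timeline, all_paths):
	last_node = path[-1]
	if last_node in adjacency:
		for target_node in adjacency[last_node]["targets"]:
			if target_node in path:
				continue	#no cycles
			allowed = {timeline[-1], timeline[-1]+1}	#bound in this or the next stage
			observed = adjacency[last_node]["targets"][target_node]["bound_in"]
			observed_in_allowed = allowed.intersection(observed)
			if len(observed_in_allowed) > 0:
				current_timepoint = max(observed_in_allowed)
				if timeline.count(current_timepoint) < 2:	#at most two events per timepoint
					all_paths = dfs(adjacency, path + [target_node], timeline + [current_timepoint], all_paths)
	else:	#no outgoing edges; save the path if it is long enough
		if len(path) > 2:
			all_paths["paths"].append(path)
			all_paths["timelines"].append(timeline)
	return all_paths

adjacency = {"TF_A": {"targets": {"TF_B": {"bound_in": [0]}, "TF_C": {"bound_in": [2]}}},
             "TF_B": {"targets": {"TF_C": {"bound_in": [1]}}}}
print(dfs(adjacency, ["TF_A"], [0], {"paths": [], "timelines": []}))
#only TF_A -> TF_B -> TF_C survives (timeline [0, 0, 1]); TF_A -> TF_C is skipped because timepoint 2 is not reachable from 0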
diff --git a/tobias/motifs/tfbscan.py b/tobias/motifs/tfbscan.py
index 7170df2..828ad86 100644
--- a/tobias/motifs/tfbscan.py
+++ b/tobias/motifs/tfbscan.py
@@ -75,7 +75,7 @@ def motif_scanning(regions, args, motifs_obj):
 	region_TFBS = motifs_obj.scan_sequence(seq, region)	#Scan sequence
 
 	#Check format of region chromosome and convert sites if needed
-	m = re.match("(.+)\:([0-9]+)\-([0-9]+)\s+.+", region.chrom)
+	m = re.match(r"(.+)\:([0-9]+)\-([0-9]+)\s+.+", region.chrom)
 	if m:
 		reg_chrom, reg_start, reg_end = m.group(1), m.group(2), m.group(3)
 		for TFBS in region_TFBS:
diff --git a/tobias/utils/filter_important_factors.py b/tobias/utils/filter_important_factors.py
new file mode 100644
index 0000000..e327447
--- /dev/null
+++ b/tobias/utils/filter_important_factors.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+from argparse import ArgumentParser
+import heapq
+
+#--------------------------------------------------------------------------------------------------------------#
+def get_important(args):
+
+	file = args.file_in	#input file (bindetect_results.txt)
+	filter = args.filter	#number of top/bottom binding factors to select per condition
+	file_out = args.file_out	#name of the output file
+	list_file = []	#contains all lines of the input file
+	new_file = []	#collects the filtered lines
+
+	with open(file) as f:	#read the bindetect results
+		for i in f:
+			i = i.strip()
+			i = i.split('\t')	#the file is tab-separated
+			list_file.append(i)
+
+	index_list = [list_file[0].index(i) for i in list_file[0] if '_change' in i]	#indices of the '_change' columns
+
+	important_values = [[max(heapq.nsmallest(filter, [float(a[i]) for a in list_file[1:]])), min(heapq.nlargest(filter, [float(a[i]) for a in list_file[1:]]))] for i in index_list]
+
+	#important_values holds the cutoffs per '_change' column: the largest of the n smallest and the smallest of the n largest values
+	for i in list_file[1:]:
+		for a, b in zip(index_list, important_values):
+			if float(i[a]) >= float(max(b)) or float(i[a]) <= float(min(b)):	#keep the line if its change value passes either cutoff
+				new_file.append(i)	#important line is appended to the filtered list
+				print(i[0])	#print the TF name to stdout for the nextflow pipeline
+				break	#line is kept; continue with the next line
+
+	#build a new tab-separated text file
+	book = {i: [] for i in list_file[0]}	#dict mapping each header field to its column values
+	[[book[key].append(value) for key, value in zip(list_file[0], i)] for i in new_file]
+	df = pd.DataFrame(book)
+	df.to_csv(file_out, sep="\t", index=False)
+#--------------------------------------------------------------------------------------------------------#
+
+def main():
+	parser = ArgumentParser()
+	parser.add_argument("-in", dest="file_in", type=str, help="Input file")
+	parser.add_argument("-filter", dest="filter", type=int, help="Filter")
+	parser.add_argument("-o", dest="file_out", type=str, help="Output file")
+	args = parser.parse_args()
+
+	get_important(args)
+
+#--------------------------------------------------------------------------------------------------------#
+
+if __name__ == '__main__':
+	main()
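filter_important_factors.py keeps a row if any of its '_change' values ranks among the n largest or n smallest values of that column. The same selection can be expressed more compactly with pandas; the sketch below makes the same assumptions as the script (a tab-separated bindetect results file whose '_change' columns hold the differential binding scores), and get_important_pandas is an invented name:

import pandas as pd

def get_important_pandas(file_in, n, file_out):
	df = pd.read_csv(file_in, sep="\t")
	change_cols = [col for col in df.columns if "_change" in col]
	keep = pd.Series(False, index=df.index)
	for col in change_cols:
		keep |= df[col] >= df[col].nlargest(n).min()	#among the n largest of this column
		keep |= df[col] <= df[col].nsmallest(n).max()	#among the n smallest of this column
	df[keep].to_csv(file_out, sep="\t", index=False)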
diff --git a/tobias/utils/motifs.py b/tobias/utils/motifs.py
index b75f44b..60c5452 100644
--- a/tobias/utils/motifs.py
+++ b/tobias/utils/motifs.py
@@ -17,6 +17,7 @@
 import MOODS.parsers
 
 import matplotlib as mpl
+mpl.use('Agg')
 import matplotlib.pyplot as plt
 from matplotlib.text import TextPath
 from matplotlib.patches import PathPatch
diff --git a/tobias/utils/utilities.py b/tobias/utils/utilities.py
index 272c611..57bb242 100644
--- a/tobias/utils/utilities.py
+++ b/tobias/utils/utilities.py
@@ -28,6 +28,16 @@
 
 #-------------------------------------------------------------------------------------------#
 #----------------------------------- Multiprocessing ---------------------------------------#
 #-------------------------------------------------------------------------------------------#
+def check_cores(given_cores, logger):
+	""" Checks the number of available cores and caps the requested core count at that number """
+
+	available_cores = mp.cpu_count()
+	if given_cores > available_cores:
+		logger.warning("Number of available cores is {0} but \'--cores\' is set to {1}. Setting \'--cores\' to {0}.\n".format(available_cores, given_cores))
+		return(available_cores)
+	else:
+		return(given_cores)
+
 def run_parallel(FUNC, input_chunks, arguments, n_cores, logger):
 	"""
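On the motifs.py change: mpl.use('Agg') selects matplotlib's non-interactive Agg backend, so figures can be rendered on headless machines without a display server. In the matplotlib versions current at the time of this patch, the call only takes effect if it runs before the first import of matplotlib.pyplot, which is why it sits directly after import matplotlib as mpl. A minimal illustration:

import matplotlib as mpl
mpl.use('Agg')	#must precede the first pyplot import to take effect
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])
fig.savefig("demo.png")	#renders straight to file; no window is opened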