This repository has been archived by the owner. It is now read-only.

Added check for number of available cores for multiprocessing
msbentsen committed Apr 26, 2019
1 parent efc5c99 commit ea51ea9
Showing 8 changed files with 113 additions and 20 deletions.
1 change: 1 addition & 0 deletions tobias/footprinting/ATACorrect.py
@@ -372,6 +372,7 @@ def run_atacorrect(args):

#Start correction/write cores
n_bigwig = len(key2file.values())
args.cores = check_cores(args.cores, logger)
writer_cores = min(n_bigwig, max(1,int(args.cores*0.1))) #at most one core per bigwig or 10% of cores (or 1)
worker_cores = max(1, args.cores - writer_cores)
logger.debug("Worker cores: {0}".format(worker_cores))
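The split above reserves at most one core per output bigwig, or 10% of the requested cores, for writers; workers get the remainder. A minimal standalone sketch of the arithmetic, assuming 8 requested cores and 4 bigwigs (values hypothetical):

n_bigwig = 4
cores = 8
writer_cores = min(n_bigwig, max(1, int(cores * 0.1)))  # min(4, max(1, 0)) -> 1
worker_cores = max(1, cores - writer_cores)             # 8 - 1 -> 7
print(writer_cores, worker_cores)                       # 1 7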
2 changes: 2 additions & 0 deletions tobias/footprinting/BINDetect.py
@@ -137,6 +137,7 @@ def run_bindetect(args):
logger.output_files(outfiles)

# Setup pool
args.cores = check_cores(args.cores, logger)
writer_cores = max(1, int(args.cores*0.1))
worker_cores = max(1, args.cores - writer_cores)
logger.debug("Worker cores: {0}".format(worker_cores))
@@ -267,6 +268,7 @@ def run_bindetect(args):
#Get threshold for motifs
logger.debug("Getting match threshold per motif")
outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.motif_pvalue]))

motif_list = MotifList(outlist)
for motif in motif_list:
logger.debug("Motif {0}: threshold {1}".format(motif.name, motif.threshold))
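itertools.product(motif_list, [args.motif_pvalue]) pairs every motif with the single p-value, so starmap can unpack each pair into the two arguments of get_threshold. A self-contained sketch of that pattern, with a stand-in function in place of OneMotif.get_threshold (motif names hypothetical):

import itertools
from multiprocessing import Pool

def get_threshold(motif, pvalue):
    # stand-in for OneMotif.get_threshold; just tags the motif with its p-value
    return (motif, pvalue)

if __name__ == "__main__":
    motifs = ["CTCF", "GATA1", "TP53"]
    with Pool(2) as pool:
        # product(motifs, [0.0001]) yields one (motif, pvalue) tuple per motif
        out = pool.starmap(get_threshold, itertools.product(motifs, [0.0001]))
    print(out)  # [('CTCF', 0.0001), ('GATA1', 0.0001), ('TP53', 0.0001)]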
1 change: 1 addition & 0 deletions tobias/footprinting/footprint_scores.py
@@ -193,6 +193,7 @@ def run_footprinting(args):
regions_chunks = regions.chunks(args.split)

#Setup pools
args.cores = check_cores(args.cores, logger)
writer_cores = 1
worker_cores = max(1, args.cores - writer_cores)
logger.debug("Worker cores: {0}".format(worker_cores))
53 changes: 34 additions & 19 deletions tobias/misc/create_network.py
@@ -38,7 +38,7 @@ def add_network_arguments(parser):

#Required arguments
required = parser.add_argument_group('Required arguments')
required.add_argument('--TFBS', metavar="", help="TFBS folder from bindetect") #coordinates in .bed format with TF name in 4th column and a column containing target genes (set using --target)")
required.add_argument('--TFBS', metavar="", help="TFBS folder from bindetect")
required.add_argument('--origin', metavar="", help="File of origins of TF to genes")

#Optional arguments
@@ -53,7 +53,6 @@ def add_network_arguments(parser):
return(parser)

#--------------------------------------------------------------------------------#

def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
last_node = path[-1]
if last_node in adjacency:
@@ -65,12 +64,15 @@ def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
pass #next target without saving path
else:
#if any time overlaps with this or next stage
allowed = set([timeline[-1], timeline[-1] +1])
allowed = set([timeline[-1], timeline[-1]+1])
observed = adjacency[last_node]["targets"][target_node]["bound_in"]
if len(allowed.intersection(observed)) > 0:
new_path = path + [target_node]
new_timeline = timeline + [min(observed)]
all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
observed_in_allowed = allowed.intersection(observed)
if len(observed_in_allowed) > 0:
current_timepoint = max(observed_in_allowed)
if timeline.count(current_timepoint) < 2: #at most two binding events are allowed per timepoint
new_path = path + [target_node]
new_timeline = timeline + [current_timepoint]
all_paths = dfs(adjacency, new_path, new_timeline, all_paths)
else:
if len(path) > 2:
#print("no targets for {0} - ending path".format(last_node))
@@ -123,9 +125,10 @@ def run_network(args):
print("Subset to {0} files".format(len(overview_files)))

#Read all edges to table
#todo: read in parallel
print("Reading all overview tables")
dataframes = []
for fil in overview_files[:5]:
for fil in overview_files[:40]:
print(fil)

df = pd.read_csv(fil, sep="\t")
@@ -179,8 +182,7 @@ def run_network(args):

print("Reading expression data")
#Read expression values
expression_threshold = 5
expression_dict = {}
expression_threshold = 50
if args.expression != None:
expression_table = pd.read_csv(args.expression, sep="\t", index_col=0)
expression_table.columns = [col.lower() for col in expression_table.columns]
@@ -192,10 +194,20 @@ def run_network(args):
sites.fillna(0, inplace=True)
sites["recipient_expressed_in"] = sites[expression_table.columns].apply(lambda x: ",".join(x.index[x > expression_threshold]), axis=1)
sites.drop(columns=expression_table.columns, inplace=True)
expression_dict = expression_table.to_dict()
else:
#All are expressed in all conditions
all_conditions = ",".join(condition_names)
sites["recipient_expressed_in"] = all_conditions

#expression_table = "" #rows, cols, values > expression_threshold
expression_dict = expression_table.to_dict()

#--------------------------------------------#

print(condition_names)
conditions = condition_names

#Subset edges to those where donor_bound_in is part of recipient_expressed_in
sites["donor_bound_in"] = sites["donor_bound_in"].apply(lambda x: x.split(","))
sites["recipient_expressed_in"] = sites["recipient_expressed_in"].apply(lambda x: x.split(","))
@@ -207,7 +219,7 @@ def run_network(args):
print(sites.shape[0])
sites = sites[sites.apply(lambda x: x["donor_bound_in"] in x["recipient_expressed_in"], axis=1)]
print(sites.shape[0])
print(sites)
#print(sites)

##### Write out edges
edges_f = os.path.join(args.output, "edges.txt")
@@ -218,18 +230,20 @@ def run_network(args):
all_donor_ids = set(list(sites["donor_id"]))

adjacency = {donor: {"targets":{}} for donor in all_donor_ids}

for index, row in sites.iterrows():

donor, recipient, bound_in = row["donor_id"], row["recipient_id"], row["donor_bound_in"]
if recipient not in adjacency[donor]["targets"]:
adjacency[donor]["targets"][recipient] = {"bound_in": []}

#adjacency[donor]["targets"]adjacency[donor]["targets"].get(recipient, {recipient: {"bound_in":[]}})
#if "bound_in" not in adjacency[donor]["targets"][recipient]:
# adjacency[donor]["targets"][recipient]["bound_in"] = []

adjacency[donor]["targets"][recipient]["bound_in"].append(bound_in)
if bound_in not in adjacency[donor]["targets"][recipient]["bound_in"]:
adjacency[donor]["targets"][recipient]["bound_in"].append(bound_in)

#Convert donor_bound_in to integer timeline
#print(adjacency)
for donor in adjacency:
for recipient in adjacency[donor]["targets"]:
adjacency[donor]["targets"][recipient]["bound_in"] = [condition_names.index(cond) for cond in adjacency[donor]["targets"][recipient]["bound_in"]]
#print(adjacency)

#Create possible paths through graph
paths_f = os.path.join(args.output, "paths.txt")
@@ -248,7 +262,8 @@ def run_network(args):
#String formatting of path
str_paths = ""
for i, node in enumerate(path[:-1]):
str_paths += "{0} --({1})--> ".format(id2name[node][0], conditions[timeline[i+1]])
in_condition = conditions[timeline[i+1]]
str_paths += "{0} --(Bound: {1}, target_expr: {2})--> ".format(id2name[node][0], expression_dict[node], conditions[in_condition])
str_paths += id2name[path[-1]][0]

#Number of nodes
2 changes: 1 addition & 1 deletion tobias/motifs/tfbscan.py
@@ -75,7 +75,7 @@ def motif_scanning(regions, args, motifs_obj):
region_TFBS = motifs_obj.scan_sequence(seq, region) #Scan sequence

#Check format of region chromosome and convert sites if needed
m = re.match("(.+)\:([0-9]+)\-([0-9]+)\s+.+", region.chrom)
m = re.match(r"(.+)\:([0-9]+)\-([0-9]+)\s+.+", region.chrom)
if m:
reg_chrom, reg_start, reg_end = m.group(1), m.group(2), m.group(3)
for TFBS in region_TFBS:
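The r-prefix keeps \: and \- as literal backslash sequences for the regex engine; in a plain string they are invalid escapes and trigger a DeprecationWarning on newer Pythons. A quick illustration (region string hypothetical):

import re

m = re.match(r"(.+)\:([0-9]+)\-([0-9]+)\s+.+", "chr1:1000-2000 peak_1")
if m:
    print(m.group(1), m.group(2), m.group(3))  # chr1 1000 2000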
63 changes: 63 additions & 0 deletions tobias/utils/filter_important_factors.py
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
from argparse import ArgumentParser
import heapq

#--------------------------------------------------------------------------------------------------------------#
def get_important(args):

file = args.file_in #input file (bindetect_results.txt)
filter = args.filter #how many top binding factors per condition to select
file_out = args.file_out #name of the output file
list_file = [] #contains all lines of the input file
new_file = [] #holds the filtered lines

with open(file) as f: #open bindetect results
for i in f:
i = i.strip()
i = i.split('\t') #split the tab-separated line into fields
list_file.append(i)

index_list = [list_file[0].index(i) for i in list_file[0] if '_change' in i] #get the indices of the '_change' columns

important_values = [[max(heapq.nsmallest(filter,[float(a[i]) for a in list_file[1:]])), min(heapq.nlargest(filter,[float(a[i]) for a in list_file[1:]]))] for i in index_list]

#important_values holds, per '_change' column, the thresholds for the 'filter' smallest and largest values
for i in list_file[1:]:
for a,b in zip(index_list, important_values):
if float(i[a]) >= float(max(b)) or float(i[a]) <= float(min(b)): #keep the line if its change value passes either threshold
new_file.append(i) #important lines get appended to the new list
print(i[0]) #print to stdout for the nextflow pipeline
break #once the line is added, jump to the next input line

#build new tab-separated text file
book = {i:[] for i in list_file[0]}
[[book[key].append(value) for key,value in zip(list_file[0], i)] for i in new_file] #dict of header -> column values for the output dataframe
df = pd.DataFrame(book)
df.to_csv(file_out, sep='\t', index=False)
#--------------------------------------------------------------------------------------------------------#

def main():
parser = ArgumentParser()
parser.add_argument("-in", dest="file_in", type=str, help="Input file")
parser.add_argument("-filter", dest="filter", type=int, help="Filter")
parser.add_argument("-o", dest="file_out", type=str, help="Output file")
args = parser.parse_args()

get_important(args)

#--------------------------------------------------------------------------------------------------------#

if __name__ == '__main__':
main()
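A possible invocation against BINDetect output (file names hypothetical):

python filter_important_factors.py -in bindetect_results.txt -filter 5 -o filtered_factors.txt

For each '_change' column, heapq.nsmallest/nlargest pull the 'filter' most extreme values in either direction, and a row is kept as soon as one of its change values reaches those extremes.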
1 change: 1 addition & 0 deletions tobias/utils/motifs.py
@@ -17,6 +17,7 @@
import MOODS.parsers

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.text import TextPath
from matplotlib.patches import PathPatch
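Selecting the backend must happen before the first pyplot import, which is why mpl.use('Agg') sits between the two imports. Agg renders into raster buffers without a display, so plots can be written on headless servers. A minimal sketch of the same ordering (output file name hypothetical):

import matplotlib as mpl
mpl.use('Agg')                 # must come before importing pyplot
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])
fig.savefig("example.png")     # writes to file; no display needed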
10 changes: 10 additions & 0 deletions tobias/utils/utilities.py
@@ -28,6 +28,16 @@
#-------------------------------------------------------------------------------------------#
#----------------------------------- Multiprocessing ---------------------------------------#
#-------------------------------------------------------------------------------------------#
def check_cores(given_cores, logger):
""" Checks number of available cores and sets the used cores to <= available cores """

available_cores = mp.cpu_count()
if given_cores > available_cores:
logger.warning("Number of available cores is {0} but '--cores' is set to {1}. Setting '--cores' to {0}.\n".format(available_cores, given_cores))
return(available_cores)
else:
return(given_cores)


def run_parallel(FUNC, input_chunks, arguments, n_cores, logger):
"""
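A quick sketch of how the new guard behaves when more cores are requested than the machine has (logger here is a plain logging stand-in for TOBIAS's own logger):

import logging
import multiprocessing as mp

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("tobias")

requested = 999
cores = check_cores(requested, logger)  # warns, falls back to mp.cpu_count()
assert cores <= mp.cpu_count()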
