Skip to content
This repository has been archived by the owner. It is now read-only.

Commit

Permalink
Testing network
Browse files Browse the repository at this point in the history
  • Loading branch information
msbentsen committed Jul 17, 2019
1 parent 0222aa0 commit e7199f4
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 23 deletions.
2 changes: 1 addition & 1 deletion tobias/TOBIAS.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def main():
"MergePDF": {"help": "Merge pdf files to one", "add_arguments":add_mergepdf_arguments, "function":run_mergepdf},
"MaxPos": {"help": "Get .bed-positions of highest bigwig signal within .bed-regions", "add_arguments": add_maxpos_arguments, "function": run_maxpos},
"SubsampleBam": {"help": "Subsample a .bam-file using samtools", "add_arguments": add_subsample_arguments, "function": run_subsampling},
#"CreateNetwork": {"help": "Create TF-gene network from annotated TFBS", "add_arguments": add_network_arguments, "function": run_network, "space":"\t"},
"CreateNetwork": {"help": "Create TF-gene network from annotated TFBS", "add_arguments": add_network_arguments, "function": run_network, "space":"\t"},
"Log2Table": {"help": "Convert logs from PlotAggregate to tab-delimitered tables of footprint stats", "add_arguments": add_log2table_arguments, "function": run_log2table}
}
}
Expand Down
61 changes: 39 additions & 22 deletions tobias/misc/create_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,16 @@ def add_network_arguments(parser):

#Required arguments
required = parser.add_argument_group('Required arguments')
required.add_argument('--TFBS', metavar="", help="TFBS folder from bindetect")
required.add_argument('--origin', metavar="", help="File of origins of TF to genes")
required.add_argument('--TFBS', metavar="", help="File(s) containing TFBS to create network from")
required.add_argument('--origin', metavar="", help="File containing gene origins of TF <-> gene")

#Optional arguments
optional = parser.add_argument_group('Optional arguments')
required.add_argument('--subset', metavar="", help="File containing subset of names to filter on")
optional.add_argument('--TFBS_columns', metavar="", nargs="*", help="Donor -> recipient columns", default=["TFBS_name", "gene_id"])
optional.add_argument('--origin_columns', metavar="", nargs="*", help="Name, id", default=["TOBIAS_motif_name", "ensembl_gene_id"])
optional.add_argument('--subset', metavar="", help="File containing subset of names to filter on")
optional.add_argument('--TFBS_columns', metavar="", nargs="*", help="Source TF -> target gene columns", default=["TFBS_name", "gene_id"])
optional.add_argument('--origin_columns', metavar="", nargs="*", help="Source TF, source gene", default=["TOBIAS_motif_id", "ensembl_gene_id"])

optional.add_argument('--expression', metavar="", help="Expression of each gene per timepoint")
#optional.add_argument('--additional', metavar="", help="Additional information on genes to add; for example RNA-seq")
optional.add_argument('--output', metavar="", help="Path to output directory (default: tobias_network)", default="tobias_network")

return(parser)
Expand Down Expand Up @@ -91,27 +91,26 @@ def dfs(adjacency, path, timeline, all_paths = {"paths":[], "timelines":[]}):
def run_network(args):

make_directory(args.output)

check_required(args, ["TFBS", "origin"])

#-------------------------- Origin file translating motif name -> gene origin -----------------------------------#
#Translation file, where one motif can correspond to more than one gene (e.g. jun::fos)
#and several genes can encode transcription factors with the same motif (close family members with the same target sequence)
origin_table = pd.read_csv(args.origin, sep="\t")
origin_table = origin_table[args.origin_columns]
#origin_table = origin_table[args.origin_columns]

TFBS_donor, TFBS_recipient = args.TFBS_columns
origin_name, origin_gene = args.origin_columns
print(origin_table)

id2name = {gene_id: names_list for gene_id, names_list in origin_table.groupby(origin_gene)[origin_name].apply(list).iteritems()}

TFBS_donor, TFBS_recipient = args.TFBS_columns
origin_name, origin_gene = args.origin_columns


#id2name = {gene_id: names_list for gene_id, names_list in origin_table.groupby(origin_gene)[origin_name].apply(list).iteritems()}

#---------------------------------------------- BINDetect results --------------------------------------------#

#Get all overview files from TFBS dir
print("Getting files from {0}".format(args.TFBS))
TF_folders = list(os.walk(args.TFBS))[0][1] #list of folder names = TF names
overview_files = [f for f in glob.glob(args.TFBS + "/*/*_overview.txt")]
print("- Found results from {0} motifs".format(len(overview_files)))

Expand All @@ -125,11 +124,15 @@ def run_network(args):
print("Subset to {0} files".format(len(overview_files)))

#Read all edges to table
#find donor col in origin_table
#origin_name_col =
#todo: read in parallel
#print(overview_files)

print("Reading all overview tables")
dataframes = []
for fil in overview_files[:40]:
print(fil)
for fil in overview_files[:30]:
print("- {0}".format(fil))

df = pd.read_csv(fil, sep="\t")

Expand Down Expand Up @@ -167,8 +170,8 @@ def run_network(args):
df["donor_bound_in"] = df[condition_names].apply(lambda x: ",".join(x.index[x == 1]), axis=1)

#Select columns
selected = ["TFBS_chr", "TFBS_start", "TFBS_end", "donor_name", "donor_id", "recipient_name", "recipient_id", "donor_bound_in"]
df = df[selected]
#selected = ["TFBS_chr", "TFBS_start", "TFBS_end", "donor_name", "donor_id", "recipient_name", "recipient_id", "donor_bound_in"]
#df = df[selected]

#Add to list
dataframes.append(df)
Expand All @@ -179,7 +182,7 @@ def run_network(args):


#------------------------------------- Expression info to subset edges -----------------------------------#

"""
print("Reading expression data")
#Read expression values
expression_threshold = 50
Expand All @@ -202,23 +205,37 @@ def run_network(args):
#expression_table = "" #rows, cols, values > expression_threshold
expression_dict = expression_table.to_dict()
"""

#--------------------------------------------#



all_source_names = set(sites["donor_name"])
print(all_source_names)

print(sites.shape[0])

sites = sites[(sites["recipient_name"].isin(all_source_names))]
print(sites.shape[0])

#--------------------------------------------#


print(condition_names)
conditions = condition_names

#Subset edges to those where donor_bound_in is part of recipient_expressed_in
sites["donor_bound_in"] = sites["donor_bound_in"].apply(lambda x: x.split(","))
sites["recipient_expressed_in"] = sites["recipient_expressed_in"].apply(lambda x: x.split(","))
#sites["recipient_expressed_in"] = sites["recipient_expressed_in"].apply(lambda x: x.split(","))

#Expand the bound_in columns
exploded = sites["donor_bound_in"].apply(pd.Series).stack().reset_index().rename(columns={0:"donor_bound_in_exploded"})
sites = pd.merge(sites, exploded, left_index=True, right_on="level_0", how="left").drop(columns=["level_0", "level_1", "donor_bound_in"]).rename(columns={"donor_bound_in_exploded":"donor_bound_in"})

print(sites.shape[0])
sites = sites[sites.apply(lambda x: x["donor_bound_in"] in x["recipient_expressed_in"], axis=1)]
print(sites.shape[0])
#print(sites.shape[0])
#sites = sites[sites.apply(lambda x: x["donor_bound_in"] in x["recipient_expressed_in"], axis=1)]
#print(sites.shape[0])
#print(sites)

##### Write out edges
Expand Down

0 comments on commit e7199f4

Please sign in to comment.