This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
thesis_tm/find_superbubbles.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
215 lines (189 sloc)
8.7 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import Graph | |
import sys | |
# import find_bubbles | |
import functions | |
import pdb | |
""" | |
Pseudo code | |
for node s in nodes: | |
push s in S | |
for direction in s: | |
while len(s) > 0: | |
pop arbitrary v from S | |
v.visited = True | |
if v has no children in direction: | |
break | |
for u in v's children in direction: | |
u_parents = parents from where v connects to u | |
if u == s: | |
break | |
u.seen = True | |
if all of u_parents are visited: | |
push u into S | |
if len(S) == 1 (contains t): | |
and no edge between t and s: | |
return t | |
else | |
break | |
""" | |
def find_superbubbles_alg(graph, s, simple_bub=False): | |
for direction in [1, 0]: | |
seen = set() | |
nodes_inside = [] | |
seen.add((s.id, direction)) | |
S = {(s, direction)} | |
# print("in drection {}".format(direction)) | |
while len(S) > 0: | |
# print("##### beginning of while") | |
to_print = [] | |
for i in S: | |
to_print.append((i[0].id, i[1])) | |
# print("S is now {}".format(to_print)) | |
# print("Seen is now {}".format(seen)) | |
v = S.pop() | |
v = (v[0], v[1]) | |
# print("v is {}".format(v[0].id)) | |
v[0].visited = True | |
nodes_inside.append(v[0]) | |
# print("nodes to reset are") | |
# to_print = [] | |
# for node in nodes_inside: | |
# to_print.append(node.id) | |
# print(to_print) | |
# print("seen nodes are") | |
try: | |
seen.remove((v[0].id, v[1])) | |
except KeyError: | |
# path = functions.bfs(graph.nodes, s.id, 100) | |
# print(path) | |
# functions.write_gfa(graph.nodes, k=61, list_of_nodes=path, output_file="error_region.gfa") | |
print("We entered from {} a key error happened, {}".format(s.id, (v[0].id, v[1]))) | |
sys.exit() | |
if v[1] == 0: | |
children = v[0].start | |
else: | |
children = v[0].end | |
# print("the children of v from {} are {}".format(v[1], children)) | |
# we have a tip | |
if len(children) == 0: | |
# print("breaking because children are 0") | |
break | |
for u in children: | |
# u is a tuple of (id, direction) direction is where the parent is | |
# nodes_inside.append(u[0]) | |
if u[1] == 0: | |
u_child_direction = 1 | |
u_parents = [x[0] for x in graph.nodes[u[0]].start] | |
# print("u is {} and it's parents at start are {}".format(u[0], u_parents)) | |
else: | |
u_child_direction = 0 | |
u_parents = [x[0] for x in graph.nodes[u[0]].end] | |
# print("u is {} and it's parents at end are {}".format(u[0], u_parents)) | |
# we have a loop | |
# print("{} {}".format(u[0], s.id)) | |
if u[0] == s.id: | |
# print("started from {}, breaking because we reached {} and we have a cycle".format(s.id, u[0])) | |
# resetting os I can exit the while loop | |
S = set() | |
break | |
# graph.nodes[u[0]].seen = True | |
if u[1] == 0: | |
seen.add((u[0], 1)) | |
else: | |
seen.add((u[0], 0)) | |
# print("the seen nodes are {}".format(seen)) | |
# if all parents are visited | |
if all(graph.nodes[i].visited is True for i in u_parents): | |
S.add((graph.nodes[u[0]], u_child_direction)) | |
# print("{} is added to S".format(u[0])) | |
# if (len(S) == 1) and all(graph.nodes[i].seen == False for i in nodes_inside): | |
# pdb.set_trace() | |
if (len(S) == 1) and (len(seen) == 1): | |
# if we are here, then we found a superbubble | |
# check edge if exists or not | |
t = S.pop() | |
nodes_inside.append(t[0]) | |
# skipping empty bubbles (if graph is compacted, this should not happen anyway | |
if len(nodes_inside) == 2: | |
break | |
# if simple_bub is False, so we don't want to find simple bubbles, if nodes_inside is 4 | |
# it means we have a simple bubbles (source, sink, 2 branches) | |
if not simple_bub: | |
if len(nodes_inside) < 5: | |
for node in nodes_inside: | |
node.visited = False | |
break | |
t[0].visited = True | |
# Because the algorithm works on a directed graph and ours is bi-directed | |
# each superbubble will be found twice, this way I can avoid having it twice | |
# probably there are better ways to do it | |
# let's have the key as a sorted tuple of source and sink | |
key = [int(t[0].id), int(s.id)] | |
# if s.id in [420873, 561675, 4319621, 17748142, 21990992, 30016936, 952312, 28810446, | |
# 1613350, 18374397, 404538, 24852377, 153123, 14626409, 5603293, 23778560, | |
# 4679492, 19467073, 6627782, 20000753, 6857139, 25741059, 17475025, 21106222, | |
# 2218074, 5551865, 3469392, 8448464]: | |
# pdb.set_trace() | |
key.sort() | |
key = (key[0], key[1]) | |
if key not in graph.superbubbles: | |
nodes_inside.remove(s) | |
nodes_inside.remove(t[0]) | |
superbubble = Graph.Superbubble(source=s, sink=t[0], inside=nodes_inside) | |
# if s.id in [420873, 561675, 4319621, 17748142, 21990992, 30016936, 952312, 28810446, | |
# 1613350, 18374397, 404538, 24852377, 153123, 14626409, 5603293, 23778560, | |
# 4679492, 19467073, 6627782, 20000753, 6857139, 25741059, 17475025, 21106222, | |
# 2218074, 5551865, 3469392, 8448464]: | |
# pdb.set_trace() | |
superbubble.sb_membership(key, graph) | |
graph.superbubbles[key] = superbubble | |
# print(superbubble.list_superbubble()) | |
else: | |
break | |
# removing simple bubbles inside sbs (old stuff) | |
# print("Source and Sink were, {} , {}".format(str(s.id), str(t[0].id))) | |
# simple_bubble_to_remove = [] | |
# for n in superbubble.list_superbubble(no_ends=True): | |
# if graph.nodes[n].which_chain is not 0: | |
# if graph.nodes[n].which_chain not in simple_bubble_to_remove: | |
# simple_bubble_to_remove.append(graph.nodes[n].which_chain) | |
# for key_to_remove in simple_bubble_to_remove: | |
# # todo this was happening because of nested superbubbles, | |
# # I can keep the try except or work on removing nested superbubbles | |
# try: | |
# del graph.bubble_chains[key_to_remove] | |
# except KeyError: | |
# pdb.set_trace() | |
# This is done here in case it failed to find one so i need to reset the ones I visited | |
for node in nodes_inside: | |
node.visited = False | |
# print("failed") | |
def find_superbubbles(graph, simple_bub=False): | |
counter = 0 | |
for n in graph.nodes.values(): | |
# print("at node {}".format(n.id)) | |
counter += 1 | |
if (counter % 1000000) == 0: | |
print("{} nodes have been processed so far".format(counter)) | |
if n.which_sb is 0: | |
if not n.visited: | |
find_superbubbles_alg(graph, n, simple_bub) | |
# The idea here is that a source can be a sink of another sb (and vise versa) | |
# So I set the inside of the sb as true so I don't visit there again, but I want to visit the ends again | |
for sb in graph.superbubbles.values(): | |
sb.source.visited = False | |
sb.sink.visited = False | |
for n in sb.inside: | |
n.visited = True | |
if __name__ == "__main__": | |
start = time.time() | |
new_graph = Graph.Graph() | |
new_graph.read_gfa(sys.argv[1]) | |
new_graph.k = int(sys.argv[2]) | |
find_superbubbles(new_graph) | |
for sb in new_graph.superbubbles.values(): | |
print("source is {} and sink is {}".format(sb.source.id, sb.sink.id)) | |
sb.print_inside() | |
end = time.time() | |
print("It took {} to find Superbubbles in the graph".format(end - start)) |