Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import time
import Graph
import sys
# import find_bubbles
import functions
import pdb
"""
Pseudo code
for node s in nodes:
push s in S
for direction in s:
while len(s) > 0:
pop arbitrary v from S
v.visited = True
if v has no children in direction:
break
for u in v's children in direction:
u_parents = parents from where v connects to u
if u == s:
break
u.seen = True
if all of u_parents are visited:
push u into S
if len(S) == 1 (contains t):
and no edge between t and s:
return t
else
break
"""
def find_superbubbles_alg(graph, s, simple_bub=False):
for direction in [1, 0]:
seen = set()
nodes_inside = []
seen.add((s.id, direction))
S = {(s, direction)}
# print("in drection {}".format(direction))
while len(S) > 0:
# print("##### beginning of while")
to_print = []
for i in S:
to_print.append((i[0].id, i[1]))
# print("S is now {}".format(to_print))
# print("Seen is now {}".format(seen))
v = S.pop()
v = (v[0], v[1])
# print("v is {}".format(v[0].id))
v[0].visited = True
nodes_inside.append(v[0])
# print("nodes to reset are")
# to_print = []
# for node in nodes_inside:
# to_print.append(node.id)
# print(to_print)
# print("seen nodes are")
try:
seen.remove((v[0].id, v[1]))
except KeyError:
# path = functions.bfs(graph.nodes, s.id, 100)
# print(path)
# functions.write_gfa(graph.nodes, k=61, list_of_nodes=path, output_file="error_region.gfa")
print("We entered from {} a key error happened, {}".format(s.id, (v[0].id, v[1])))
sys.exit()
if v[1] == 0:
children = v[0].start
else:
children = v[0].end
# print("the children of v from {} are {}".format(v[1], children))
# we have a tip
if len(children) == 0:
# print("breaking because children are 0")
break
for u in children:
# u is a tuple of (id, direction) direction is where the parent is
# nodes_inside.append(u[0])
if u[1] == 0:
u_child_direction = 1
u_parents = [x[0] for x in graph.nodes[u[0]].start]
# print("u is {} and it's parents at start are {}".format(u[0], u_parents))
else:
u_child_direction = 0
u_parents = [x[0] for x in graph.nodes[u[0]].end]
# print("u is {} and it's parents at end are {}".format(u[0], u_parents))
# we have a loop
# print("{} {}".format(u[0], s.id))
if u[0] == s.id:
# print("started from {}, breaking because we reached {} and we have a cycle".format(s.id, u[0]))
# resetting os I can exit the while loop
S = set()
break
# graph.nodes[u[0]].seen = True
if u[1] == 0:
seen.add((u[0], 1))
else:
seen.add((u[0], 0))
# print("the seen nodes are {}".format(seen))
# if all parents are visited
if all(graph.nodes[i].visited is True for i in u_parents):
S.add((graph.nodes[u[0]], u_child_direction))
# print("{} is added to S".format(u[0]))
# if (len(S) == 1) and all(graph.nodes[i].seen == False for i in nodes_inside):
# pdb.set_trace()
if (len(S) == 1) and (len(seen) == 1):
# if we are here, then we found a superbubble
# check edge if exists or not
t = S.pop()
nodes_inside.append(t[0])
# skipping empty bubbles (if graph is compacted, this should not happen anyway
if len(nodes_inside) == 2:
break
# if simple_bub is False, so we don't want to find simple bubbles, if nodes_inside is 4
# it means we have a simple bubbles (source, sink, 2 branches)
if not simple_bub:
if len(nodes_inside) < 5:
for node in nodes_inside:
node.visited = False
break
t[0].visited = True
# Because the algorithm works on a directed graph and ours is bi-directed
# each superbubble will be found twice, this way I can avoid having it twice
# probably there are better ways to do it
# let's have the key as a sorted tuple of source and sink
key = [int(t[0].id), int(s.id)]
# if s.id in [420873, 561675, 4319621, 17748142, 21990992, 30016936, 952312, 28810446,
# 1613350, 18374397, 404538, 24852377, 153123, 14626409, 5603293, 23778560,
# 4679492, 19467073, 6627782, 20000753, 6857139, 25741059, 17475025, 21106222,
# 2218074, 5551865, 3469392, 8448464]:
# pdb.set_trace()
key.sort()
key = (key[0], key[1])
if key not in graph.superbubbles:
nodes_inside.remove(s)
nodes_inside.remove(t[0])
superbubble = Graph.Superbubble(source=s, sink=t[0], inside=nodes_inside)
# if s.id in [420873, 561675, 4319621, 17748142, 21990992, 30016936, 952312, 28810446,
# 1613350, 18374397, 404538, 24852377, 153123, 14626409, 5603293, 23778560,
# 4679492, 19467073, 6627782, 20000753, 6857139, 25741059, 17475025, 21106222,
# 2218074, 5551865, 3469392, 8448464]:
# pdb.set_trace()
superbubble.sb_membership(key, graph)
graph.superbubbles[key] = superbubble
# print(superbubble.list_superbubble())
else:
break
# removing simple bubbles inside sbs (old stuff)
# print("Source and Sink were, {} , {}".format(str(s.id), str(t[0].id)))
# simple_bubble_to_remove = []
# for n in superbubble.list_superbubble(no_ends=True):
# if graph.nodes[n].which_chain is not 0:
# if graph.nodes[n].which_chain not in simple_bubble_to_remove:
# simple_bubble_to_remove.append(graph.nodes[n].which_chain)
# for key_to_remove in simple_bubble_to_remove:
# # todo this was happening because of nested superbubbles,
# # I can keep the try except or work on removing nested superbubbles
# try:
# del graph.bubble_chains[key_to_remove]
# except KeyError:
# pdb.set_trace()
# This is done here in case it failed to find one so i need to reset the ones I visited
for node in nodes_inside:
node.visited = False
# print("failed")
def find_superbubbles(graph, simple_bub=False):
counter = 0
for n in graph.nodes.values():
# print("at node {}".format(n.id))
counter += 1
if (counter % 1000000) == 0:
print("{} nodes have been processed so far".format(counter))
if n.which_sb is 0:
if not n.visited:
find_superbubbles_alg(graph, n, simple_bub)
# The idea here is that a source can be a sink of another sb (and vise versa)
# So I set the inside of the sb as true so I don't visit there again, but I want to visit the ends again
for sb in graph.superbubbles.values():
sb.source.visited = False
sb.sink.visited = False
for n in sb.inside:
n.visited = True
if __name__ == "__main__":
start = time.time()
new_graph = Graph.Graph()
new_graph.read_gfa(sys.argv[1])
new_graph.k = int(sys.argv[2])
find_superbubbles(new_graph)
for sb in new_graph.superbubbles.values():
print("source is {} and sink is {}".format(sb.source.id, sb.sink.id))
sb.print_inside()
end = time.time()
print("It took {} to find Superbubbles in the graph".format(end - start))