Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import sys
import time
import os
from functions import write_gfa
from Graph import Graph
# In case I really needed a fast FIFO, this class can be used
# class Queue:
# """A sample implementation of a First-In-First-Out
# data structure."""
# def __init__(self):
# self.in_stack = []
# self.out_stack = []
#
# def append(self, obj):
# self.in_stack.append(obj)
#
# def pop(self):
# if not self.out_stack:
# while self.in_stack:
# self.out_stack.append(self.in_stack.pop())
# return self.out_stack.pop()
#
# def __len__(self):
# return len(self.in_stack)
def comp_bfs(nodes, start_node):
"""
:param nodes: is the dictionary of nodes objects.
:param start_node: The start node of BFS search.
:return:
"""
queue = []
cc = set()
queue.append(start_node)
# cc.add(start_node)
# a list of all adjacent nodes to the start_node
neighbors = list(
set(
[x[0] for x in nodes[start_node].start] # just the node id, no direction
+
[x[0] for x in nodes[start_node].end] # just the node id, no direction
)
)
# print(neighbors)
if len(neighbors) == 0:
cc.add(start_node)
return list(cc)
while len(queue) > 0:
# if not queue:
# for i in path:
# nodes[i].visited = True
# return path
# if len(cc) % 1000 == 0:
# end = time.time()
# print("To add 10,000 nodes to cc, it took {} seconds".format(end-start))
# start = time.time()
start = queue.pop()
if start not in cc:
cc.add(start)
else:
continue
nodes[start].visited = True
# a list of all adjacent nodes to the start node
neighbors = list(
set(
[x[0] for x in nodes[start].start] # just the node id, no direction
+
[x[0] for x in nodes[start].end] # just the node id, no direction
)
)
# I think the bottle neck is here in checking if items in queue
# I need to check that further, I believe that the bigger the queue the longer it takes
# to check if a node is there already.
for n in neighbors:
try:
if not nodes[n].visited:
queue.append(n)
except KeyError:
continue
return list(cc)
def connected_components(graph):
con_comp = []
for n in graph.nodes:
if not graph.nodes[n].visited:
# print("starting the component with {}".format(n))
con_comp.append(comp_bfs(graph.nodes, n))
# print(con_comp[-1])
# print("The length of the component is {}".format(len(con_comp[-1])))
# some test pring that I need to remove later
# if len(con_comp) & 1000000 == 0:
# print("Still calculating shit, take a chill pill")
return con_comp
if __name__ == "__main__":
start = time.time()
if len(sys.argv) != 4:
print("You need to give 3 arguments, <input_gfa> <k> <output_gfa>")
print("The k is just needed to write a proper output GFA file with the correct "
"number of overlap between nodes.")
sys.exit(0)
if os.path.isfile(sys.argv[1]):
print("Reading file\n")
new_g = Graph()
new_g.read_gfa(sys.argv[1])
print("Number of nodes is {}".format(len(new_g.nodes)))
con_comp = connected_components(new_g)
# print("finished finding CCs, looking for the biggest one")
# find biggest component
biggest_com = (0, 0)
for idx, con_com in enumerate(con_comp):
if len(con_com) > biggest_com[1]:
biggest_com = (idx, len(con_com))
print("the number of connected components is {}".format(len(con_comp)))
print("The biggest component is of size {}".format(biggest_com[1]))
# I am outputting biggest component
write_gfa(nodes=new_g.nodes, list_of_nodes=con_comp[biggest_com[0]],
output_file=sys.argv[3], k=sys.argv[2])
else:
print("The file doesn't exist")
sys.exit()
end = time.time()
print("it took {} seconds to finish".format(end - start))