This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
thesis_tm/find_components.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
144 lines (120 sloc)
4.29 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import time | |
import os | |
from functions import write_gfa | |
from Graph import Graph | |
# In case I really needed a fast FIFO, this class can be used | |
# class Queue: | |
# """A sample implementation of a First-In-First-Out | |
# data structure.""" | |
# def __init__(self): | |
# self.in_stack = [] | |
# self.out_stack = [] | |
# | |
# def append(self, obj): | |
# self.in_stack.append(obj) | |
# | |
# def pop(self): | |
# if not self.out_stack: | |
# while self.in_stack: | |
# self.out_stack.append(self.in_stack.pop()) | |
# return self.out_stack.pop() | |
# | |
# def __len__(self): | |
# return len(self.in_stack) | |
def comp_bfs(nodes, start_node): | |
""" | |
:param nodes: is the dictionary of nodes objects. | |
:param start_node: The start node of BFS search. | |
:return: | |
""" | |
queue = [] | |
cc = set() | |
queue.append(start_node) | |
# cc.add(start_node) | |
# a list of all adjacent nodes to the start_node | |
neighbors = list( | |
set( | |
[x[0] for x in nodes[start_node].start] # just the node id, no direction | |
+ | |
[x[0] for x in nodes[start_node].end] # just the node id, no direction | |
) | |
) | |
# print(neighbors) | |
if len(neighbors) == 0: | |
cc.add(start_node) | |
return list(cc) | |
while len(queue) > 0: | |
# if not queue: | |
# for i in path: | |
# nodes[i].visited = True | |
# return path | |
# if len(cc) % 1000 == 0: | |
# end = time.time() | |
# print("To add 10,000 nodes to cc, it took {} seconds".format(end-start)) | |
# start = time.time() | |
start = queue.pop() | |
if start not in cc: | |
cc.add(start) | |
else: | |
continue | |
nodes[start].visited = True | |
# a list of all adjacent nodes to the start node | |
neighbors = list( | |
set( | |
[x[0] for x in nodes[start].start] # just the node id, no direction | |
+ | |
[x[0] for x in nodes[start].end] # just the node id, no direction | |
) | |
) | |
# I think the bottle neck is here in checking if items in queue | |
# I need to check that further, I believe that the bigger the queue the longer it takes | |
# to check if a node is there already. | |
for n in neighbors: | |
try: | |
if not nodes[n].visited: | |
queue.append(n) | |
except KeyError: | |
continue | |
return list(cc) | |
def connected_components(graph): | |
con_comp = [] | |
for n in graph.nodes: | |
if not graph.nodes[n].visited: | |
# print("starting the component with {}".format(n)) | |
con_comp.append(comp_bfs(graph.nodes, n)) | |
# print(con_comp[-1]) | |
# print("The length of the component is {}".format(len(con_comp[-1]))) | |
# some test pring that I need to remove later | |
# if len(con_comp) & 1000000 == 0: | |
# print("Still calculating shit, take a chill pill") | |
return con_comp | |
if __name__ == "__main__": | |
start = time.time() | |
if len(sys.argv) != 4: | |
print("You need to give 3 arguments, <input_gfa> <k> <output_gfa>") | |
print("The k is just needed to write a proper output GFA file with the correct " | |
"number of overlap between nodes.") | |
sys.exit(0) | |
if os.path.isfile(sys.argv[1]): | |
print("Reading file\n") | |
new_g = Graph() | |
new_g.read_gfa(sys.argv[1]) | |
print("Number of nodes is {}".format(len(new_g.nodes))) | |
con_comp = connected_components(new_g) | |
# print("finished finding CCs, looking for the biggest one") | |
# find biggest component | |
biggest_com = (0, 0) | |
for idx, con_com in enumerate(con_comp): | |
if len(con_com) > biggest_com[1]: | |
biggest_com = (idx, len(con_com)) | |
print("the number of connected components is {}".format(len(con_comp))) | |
print("The biggest component is of size {}".format(biggest_com[1])) | |
# I am outputting biggest component | |
write_gfa(nodes=new_g.nodes, list_of_nodes=con_comp[biggest_com[0]], | |
output_file=sys.argv[3], k=sys.argv[2]) | |
else: | |
print("The file doesn't exist") | |
sys.exit() | |
end = time.time() | |
print("it took {} seconds to finish".format(end - start)) |