Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
4fad16beb9
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
262 lines (209 sloc) 10.3 KB
import os
import sys
import time
# import copy
from Graph import Graph
from optparse import OptionParser
import find_components
import functions
import compact_gfa
import find_bubbles
import find_superbubbles
import pdb
parser = OptionParser()
parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str,
help="Give the gfa file destination here")
parser.add_option("-k", "--k_mer", action="store", dest="k_mer", default=None, type=int,
help="Give the K as integer here")
parser.add_option("--compact", action="store", dest="compact", type=str, default=None,
help="This option will compact your graph and output the compacted one, "
"give the output destination after this option")
parser.add_option("--bfs", action="store_true", dest="bfs_start", default=False,
help="This option will do BFS from the start node given with a neighborhood of size n"
", give the following arguments <input_gfa> <start_node_id> <n> <output_file>")
parser.add_option("--find_bubbles", action="store_true", dest="find_bubbles", default=False,
help="This option will find bubble chains and print the statistics to terminal, "
"give the following arguments <input_gfa> <k>")
parser.add_option("--out_bubbles", action="store", dest="output_bubbles", default=None, type=str,
help="Only be used after --bubbles, will output only bubble chains in a separate GFA file "
"the output file name is given after it. e.g.: --out_bubbles <output_gfa>")
parser.add_option("--plot", action="store", dest="plot", default=None, type=str,
help="Only used after --bubbles, outputs statistics plots in the given directory, "
"e.g.: --plot <output_dir>")
parser.add_option("--components_plot", action="store_true", dest="comp_plot", default=False,
help="Only used after --bubbles --plot, gives a histogram of the distribution of connected"
" components in the graph. e.g.: --components_plot <output_file>")
parser.add_option("--plot_contigs_nobub", action="store_true", dest="plot_contigs_dist", default=False,
help="Only used after --bubbles and --components_plot, this will output the contigs "
"distribution of the graph after removing the bubble chains."
" e.g.: --plot_contigs_nobub <output_file_path.png>")
parser.add_option("--output_component", action="store", dest="output_components", default=None, type=str,
help="If used after --bubbles, then you only need to give it one argument <output_gfa>"
" .If it was used alone then you need to give <input_gfa> <k> as raw argument to main. "
"Example: python main.py <input_gfa> <k> --output_component <output_gfa>")
total_t_start = time.time()
(options, args) = parser.parse_args()
if options.in_graph is not None:
if os.path.exists(options.in_graph):
start = time.time()
new_graph = Graph()
print("Reading graph...")
new_graph.read_gfa(options.in_graph)
print("length of the graph is {}".format(len(new_graph)))
end = time.time()
print("Time took to read the file is {} seconds".format(end - start))
if options.k_mer is not None:
new_graph.k = options.k_mer
else:
print("You didn't give the value of k for the k-mers")
parser.print_help()
sys.exit()
else:
print("Could not find the input GFA file, check the path")
parser.print_help()
sys.exit()
else:
parser.print_help()
sys.exit()
if options.compact is not None:
# new_graph = Graph()
# new_graph.read_gfa(args[1])
# new_graph.k = int(args[2])
# k_1 = new_graph.k -1
new_graph.nodes = compact_gfa.compact_graph(new_graph.nodes, new_graph.k)
functions.write_gfa(nodes=new_graph.nodes, k=new_graph.k, output_file=options.compact)
#
# if len(sys.argv) == 1:
# print("No options or arguments given\n")
# parser.print_help()
# sys.exit(0)
# new_graph = Graph()
# I might want to change this later that I only read the file inside the option I want to do
# if args[0].endswith("gfa"):
# if os.path.isfile(args[0]):
# print("Reading graph...")
# new_graph.read_gfa(args[0])
# new_graph.k = int(args[1]) - 1
if options.find_bubbles:
print("Now finding Superbubbles")
# new_graph.nodes_reset()
find_superbubbles.find_superbubbles(new_graph, simple_bub=False)
print("Finished finding Superbubbles\n")
print("Calling find bubbles")
# new_graph.find_bubbles()
find_bubbles.find_bubbles(new_graph)
print("Done finding bubbles")
functions.merge_sb_with_bchains(new_graph)
print("The number of singular bubbles is {}".format(new_graph.num_single_bubbles()))
# small test
# all_nodes = []
# for chain in new_graph.bubble_chains.values():
# all_nodes += chain.list_chain()
# for sb in new_graph.superbubbles.values():
# all_nodes += sb.list_superbubble()
# print("all the nodes are {}".format(len(all_nodes)))
print("The graph has {} nodes".format(len(new_graph.nodes)))
# end of test
longest = new_graph.max_chain()
longest_chain = new_graph.longest_chain(new_graph.k)
# print(longest_chain)
print("Longest chain bubble-wise has {} bubbles and length"
" of {}\n".format(longest[1], new_graph.bubble_chains[longest[0]].chain_length(new_graph.k)))
print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest[0]].ends))
print("Longest chain sequence-wise has {} bp and"
" {} bubbles\n".format(longest_chain[1], new_graph.bubble_chains[longest_chain[0]].length()))
print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest_chain[0]].ends))
# print(new_graph.bubble_chains[longest].list_chain())
print("The bubble chains covered {}% nodes in the graph".format(
(new_graph.nodes_in_chains()*100)/len(new_graph.nodes))
)
k_1 = new_graph.k - 1
print("length of the graph is {}".format(len(new_graph)))
print(k_1)
total_seq = k_1
for n in new_graph.nodes.values():
n.visited = False
total_seq += (len(n.seq) - k_1)
sb_coverage = new_graph.k - 1
for sb in new_graph.superbubbles.values():
sb_coverage += sb.length(k=new_graph.k)
print("The percentage of sequences covered by superbubbles is {}%".format(
(sb_coverage * 100) / float(total_seq)
))
# print(all_nodes)
print(new_graph.chains_coverage(new_graph.k))
print(total_seq)
print("The percentage of sequences covered by chains is {}%\n".format(
((new_graph.chains_coverage(new_graph.k) + k_1)*100)/float(total_seq)
))
# sb_coverage = 0
# for sb in new_graph.superbubbles.values():
# sb_coverage += sb.length(k=k_1)
#
# print("The percentage of sequences covered by Superbubbles is {}%").format(
# (sb_coverage*100)/float(total_seq)
# )
if options.plot is not None:
import matplotlib.pyplot as plt
new_graph.plots(k=new_graph.k, output_dir=options.plot)
if options.comp_plot:
print("in components plot")
functions.remove_chains(new_graph)
new_graph.nodes_reset()
# pdb.set_trace()
con_comp = find_components.connected_components(new_graph)
# print(con_comp)
comp_dist = []
for cc in con_comp:
comp_dist.append(len(cc))
# print("the comp_dist set is {}".format(set(comp_dist)))
plt.hist(comp_dist)
plt.title("Connected Components Distribution", fontsize=12)
plt.xlabel("Size of Connected Component", fontsize=9)
plt.ylabel("Frequency of Connected Component", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(options.plot, "components_dis_plot.png")
plt.savefig(output_file, dpi=200)
plt.clf()
if options.plot_contigs_dist:
contigs_dist = []
for node in new_graph.nodes.values():
contigs_dist.append(node.seq_len)
len_of_contigs = sum(contigs_dist)
# checking how many nodes are bigger than 2*k
big_contigs = [x for x in contigs_dist if x > 122]
big_contigs_sum = sum(big_contigs)
print("The nodes bigger than 2k in the nodes not in bubbles make up for {}% of all the contigs "
"not in the bubbles".format((big_contigs_sum*100)/len_of_contigs))
# pdb.set_trace()
plt.hist(contigs_dist)
plt.title("Contigs distribution without bubbles", fontsize=12)
plt.xlabel("Contig Length", fontsize=9)
plt.ylabel("Frequency of Contig", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(options.plot, "contigs_dis_without_bubbles.png")
plt.savefig(output_file, dpi=200)
plt.clf()
# outputting bubble chains
if options.output_bubbles is not None:
print("in output_bubbles")
functions.write_bubbles(new_graph, options.output_bubbles)
# print(new_graph)
# debugging prints
# for chain in new_graph.bubble_chains:
# print(chain.list_chain())
if options.output_components is not None:
con_comp = find_components.connected_components(new_graph)
biggest_com = (0, 0)
for idx, cc in enumerate(con_comp):
if len(cc) > biggest_com[1]:
biggest_com = (idx, len(cc))
print("the number of connected components is {}".format(len(con_comp)))
print("The biggest component is of size {}".format(biggest_com[1]))
# need to check arguments' stuff
print("writing biggest component gfa file")
k_1 = new_graph.k - 1
functions.write_gfa(nodes=new_graph.nodes, list_of_nodes=con_comp[biggest_com[0]],
output_file=options.output_components, k=k_1)
total_t_end = time.time()
print("The total time for the whole script to run was {}".format(total_t_end - total_t_start))