import os
import sys
import time
# import copy
from Graph import Graph
from optparse import OptionParser
import find_components
import functions
import compact_gfa
import find_bubbles
import find_superbubbles
import pdb
# Command-line interface of the script.
#
# Every input is passed as an option (nothing is read from positional
# arguments): the graph comes from -g/--in_gfa and k from -k/--k_mer.
# The help texts below name the real flag (--find_bubbles) and do not show
# arguments for flags that are plain switches (action="store_true").
parser = OptionParser()

# Input graph and k-mer size.
parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str,
                  help="Give the gfa file destination here")
parser.add_option("-k", "--k_mer", action="store", dest="k_mer", default=None, type=int,
                  help="Give the K as integer here")

# Graph compaction; the value is the output GFA path.
parser.add_option("--compact", action="store", dest="compact", type=str, default=None,
                  help="This option will compact your graph and output the compacted one, "
                       "give the output destination after this option")

# NOTE(review): no handler for --bfs is visible in this script, so its help
# text is kept as it was; confirm the expected arguments where it is handled.
parser.add_option("--bfs", action="store_true", dest="bfs_start", default=False,
                  help="This option will do BFS from the start node given with a neighborhood of size n"
                       ", give the following arguments <input_gfa> <start_node_id> <n> <output_file>")

# Bubble detection and the reporting options that depend on it.
parser.add_option("--find_bubbles", action="store_true", dest="find_bubbles", default=False,
                  help="This option will find bubble chains and print the statistics to terminal, "
                       "requires -g <input_gfa> and -k <k>")
parser.add_option("--out_bubbles", action="store", dest="output_bubbles", default=None, type=str,
                  help="Only used with --find_bubbles, will output only bubble chains in a separate GFA file "
                       "the output file name is given after it. e.g.: --out_bubbles <output_gfa>")
parser.add_option("--plot", action="store", dest="plot", default=None, type=str,
                  help="Only used with --find_bubbles, outputs statistics plots in the given directory, "
                       "e.g.: --plot <output_dir>")
parser.add_option("--components_plot", action="store_true", dest="comp_plot", default=False,
                  help="Only used with --find_bubbles --plot, gives a histogram of the distribution of connected"
                       " components in the graph. Takes no argument, the plot is saved in the --plot directory")
parser.add_option("--plot_contigs_nobub", action="store_true", dest="plot_contigs_dist", default=False,
                  help="Only used with --find_bubbles and --components_plot, this will output the contigs "
                       "distribution of the graph after removing the bubble chains."
                       " Takes no argument, the plot is saved in the --plot directory")

# Export of the biggest connected component; the value is the output GFA path.
parser.add_option("--output_component", action="store", dest="output_components", default=None, type=str,
                  help="Writes the biggest connected component of the graph to <output_gfa>, "
                       "requires -g <input_gfa> and -k <k>. "
                       "Example: -g <input_gfa> -k <k> --output_component <output_gfa>")
# Start the wall-clock timer for the whole run and parse the command line.
total_t_start = time.time()
(options, args) = parser.parse_args()

# Load the input graph when -g/--in_gfa is given.
# The two error messages are real failure branches: the script stops with a
# non-zero exit status instead of printing them unconditionally and carrying
# on without a usable graph. Both checks run before the (potentially slow)
# read so a bad invocation fails immediately.
if options.in_graph is not None:
    if not os.path.exists(options.in_graph):
        print("Could not find the input GFA file, check the path")
        sys.exit(1)
    if options.k_mer is None:
        print("You didn't give the value of k for the k-mers")
        sys.exit(1)

    start = time.time()
    new_graph = Graph()
    print("Reading graph...")
    # NOTE(review): assumes Graph.read_gfa(path) fills the graph in place;
    # confirm the exact signature against the Graph class.
    new_graph.read_gfa(options.in_graph)
    print("length of the graph is {}".format(len(new_graph)))
    end = time.time()
    print("Time took to read the file is {} seconds".format(end - start))
    new_graph.k = options.k_mer
# --compact <output_gfa>: replace the graph's nodes with their compacted form
# and write the result to the requested GFA file.
compact_target = options.compact
if compact_target is not None:
    compacted_nodes = compact_gfa.compact_graph(new_graph.nodes, new_graph.k)
    new_graph.nodes = compacted_nodes
    functions.write_gfa(
        nodes=new_graph.nodes,
        k=new_graph.k,
        output_file=compact_target,
    )
# if len(sys.argv) == 1:
# print("No options or arguments given\n")
# parser.print_help()
# sys.exit(0)
# new_graph = Graph()
# TODO: consider reading the GFA file only inside the option that actually needs it
# if args[0].endswith("gfa"):
# if os.path.isfile(args[0]):
# print("Reading graph...")
# new_graph.read_gfa(args[0])
# new_graph.k = int(args[1]) - 1
# --find_bubbles: run superbubble detection on the loaded graph, print
# bubble-chain and coverage statistics, and optionally produce plots and a
# bubble-only GFA file.
# NOTE(review): indentation is not preserved in this view, so the nesting of
# the option checks further down (--plot, --components_plot,
# --plot_contigs_nobub, --out_bubbles) cannot be read off the code. Several
# statements also appear to be missing (unclosed format( calls, for loops
# without a body); those places are marked below.
if options.find_bubbles:
print("Now finding Superbubbles")
# new_graph.nodes_reset()
find_superbubbles.find_superbubbles(new_graph, simple_bub=False)
print("Finished finding Superbubbles\n")
# NOTE(review): no bubble-finding call is visible between the next two
# messages; the only candidate is commented out.
print("Calling find bubbles")
# new_graph.find_bubbles()
print("Done finding bubbles")
print("The number of singular bubbles is {}".format(new_graph.num_single_bubbles()))
# small test
# all_nodes = []
# for chain in new_graph.bubble_chains.values():
# all_nodes += chain.list_chain()
# for sb in new_graph.superbubbles.values():
# all_nodes += sb.list_superbubble()
# print("all the nodes are {}".format(len(all_nodes)))
print("The graph has {} nodes".format(len(new_graph.nodes)))
# end of test
# Both results are indexed below as (key into new_graph.bubble_chains, value):
# longest[1] is reported as a bubble count, longest_chain[1] as a length in bp.
longest = new_graph.max_chain()
longest_chain = new_graph.longest_chain(new_graph.k)
# print(longest_chain)
print("Longest chain bubble-wise has {} bubbles and length"
" of {}\n".format(longest[1], new_graph.bubble_chains[longest[0]].chain_length(new_graph.k)))
print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest[0]].ends))
print("Longest chain sequence-wise has {} bp and"
" {} bubbles\n".format(longest_chain[1], new_graph.bubble_chains[longest_chain[0]].length()))
print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest_chain[0]].ends))
# print(new_graph.bubble_chains[longest].list_chain())
# NOTE(review): the format( call below is never closed in this view; its
# argument and closing parentheses appear to be missing.
print("The bubble chains covered {}% nodes in the graph".format(
k_1 = new_graph.k - 1
print("length of the graph is {}".format(len(new_graph)))
# Total sequence length: start from k - 1 and add, for every node, its
# sequence length minus k - 1. The same loop also clears each node's
# visited flag.
total_seq = k_1
for n in new_graph.nodes.values():
n.visited = False
total_seq += (len(n.seq) - k_1)
# Sequence covered by superbubbles: k - 1 plus the sum of sb.length(k=...)
# over all superbubbles.
sb_coverage = new_graph.k - 1
for sb in new_graph.superbubbles.values():
sb_coverage += sb.length(k=new_graph.k)
# NOTE(review): the closing parentheses of the next two print calls are not
# visible in this view.
print("The percentage of sequences covered by superbubbles is {}%".format(
(sb_coverage * 100) / float(total_seq)
# print(all_nodes)
print("The percentage of sequences covered by chains is {}%\n".format(
((new_graph.chains_coverage(new_graph.k) + k_1)*100)/float(total_seq)
# sb_coverage = 0
# for sb in new_graph.superbubbles.values():
# sb_coverage += sb.length(k=k_1)
# print("The percentage of sequences covered by Superbubbles is {}%").format(
# (sb_coverage*100)/float(total_seq)
# )
# --plot <output_dir>: matplotlib is imported only when plots are requested;
# the plotting sections below rely on this import for plt.
if options.plot is not None:
import matplotlib.pyplot as plt
new_graph.plots(k=new_graph.k, output_dir=options.plot)
# --components_plot: connected-component size distribution, saved as
# components_dis_plot.png inside the --plot directory.
if options.comp_plot:
print("in components plot")
# pdb.set_trace()
con_comp = find_components.connected_components(new_graph)
# print(con_comp)
comp_dist = []
# NOTE(review): the body of this loop is not visible, so comp_dist is never
# filled here, and no plotting call is visible before savefig.
for cc in con_comp:
# print("the comp_dist set is {}".format(set(comp_dist)))
plt.title("Connected Components Distribution", fontsize=12)
plt.xlabel("Size of Connected Component", fontsize=9)
plt.ylabel("Frequency of Connected Component", fontsize=9)
# NOTE(review): newer Matplotlib releases renamed the nonposy keyword to
# nonpositive; verify against the installed version.
plt.yscale('log', nonposy='clip')
output_file = os.path.join(options.plot, "components_dis_plot.png")
plt.savefig(output_file, dpi=200)
# --plot_contigs_nobub: contig length distribution, saved as
# contigs_dis_without_bubbles.png inside the --plot directory.
if options.plot_contigs_dist:
contigs_dist = []
# NOTE(review): the body of this loop is not visible, so contigs_dist stays
# empty here and the division by len_of_contigs below would divide by zero.
for node in new_graph.nodes.values():
len_of_contigs = sum(contigs_dist)
# checking how many nodes are bigger than 2*k
# NOTE(review): the threshold 122 is hard-coded rather than derived from
# new_graph.k; presumably 2*k for one specific k. Confirm.
big_contigs = [x for x in contigs_dist if x > 122]
big_contigs_sum = sum(big_contigs)
print("The nodes bigger than 2k in the nodes not in bubbles make up for {}% of all the contigs "
"not in the bubbles".format((big_contigs_sum*100)/len_of_contigs))
# pdb.set_trace()
plt.title("Contigs distribution without bubbles", fontsize=12)
plt.xlabel("Contig Length", fontsize=9)
plt.ylabel("Frequency of Contig", fontsize=9)
# NOTE(review): same nonposy keyword concern as in the components plot.
plt.yscale('log', nonposy='clip')
output_file = os.path.join(options.plot, "contigs_dis_without_bubbles.png")
plt.savefig(output_file, dpi=200)
# outputting bubble chains
# --out_bubbles <output_gfa>: hand the graph and the output path to
# functions.write_bubbles.
if options.output_bubbles is not None:
print("in output_bubbles")
functions.write_bubbles(new_graph, options.output_bubbles)
# print(new_graph)
# debugging prints
# for chain in new_graph.bubble_chains:
# print(chain.list_chain())
# --output_component <output_gfa>: find the connected components of the
# graph, report how many there are and how big the biggest one is, then write
# that biggest component to the requested GFA file.
if options.output_components is not None:
    components = find_components.connected_components(new_graph)

    # First component with the maximal size wins; with no components at all
    # the index stays 0 and the size 0.
    largest_idx, largest_cc = max(
        enumerate(components),
        key=lambda item: len(item[1]),
        default=(0, ()),
    )
    largest_size = len(largest_cc)

    print("the number of connected components is {}".format(len(components)))
    print("The biggest component is of size {}".format(largest_size))
    # need to check arguments' stuff
    print("writing biggest component gfa file")
    # NOTE(review): k - 1 is passed as k here, whereas the --compact path
    # passes new_graph.k to the same writer; confirm which one is intended.
    functions.write_gfa(
        nodes=new_graph.nodes,
        list_of_nodes=components[largest_idx],
        output_file=options.output_components,
        k=new_graph.k - 1,
    )

# Report the wall-clock time of the whole run.
elapsed_total = time.time() - total_t_start
print("The total time for the whole script to run was {}".format(elapsed_total))