main.py

import os
import sys
import time
# import copy
from Graph import Graph
from optparse import OptionParser
import find_components
import functions
import compact_gfa
import find_bubbles
import find_superbubbles
import pdb


parser = OptionParser()

parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str,
                  help="Give the gfa file destination here")

parser.add_option("-k", "--k_mer", action="store", dest="k_mer", default=None, type=int,
                  help="Give the K as integer here")

parser.add_option("--compact", action="store", dest="compact", type=str, default=None,
                  help="This option will compact your graph and output the compacted one, "
                       "give the output destination after this option")

parser.add_option("--bfs", action="store_true", dest="bfs_start", default=False,
                  help="This option will do BFS from the start node given with a neighborhood of size n"
                       ", give the following arguments <input_gfa> <start_node_id> <n> <output_file>")

parser.add_option("--find_bubbles", action="store_true", dest="find_bubbles", default=False,
                  help="This option will find bubble chains and print the statistics to terminal, "
                       "give the following arguments <input_gfa> <k>")

parser.add_option("--out_bubbles", action="store", dest="output_bubbles", default=None, type=str,
                  help="Only be used after --bubbles, will output only bubble chains in a separate GFA file "
                       "the output file name is given after it. e.g.: --out_bubbles <output_gfa>")

parser.add_option("--plot", action="store", dest="plot", default=None, type=str,
                  help="Only used after --bubbles, outputs statistics plots in the given directory, "
                       "e.g.: --plot <output_dir>")

parser.add_option("--components_plot", action="store_true", dest="comp_plot", default=False,
                  help="Only used after --bubbles --plot, gives a histogram of the distribution of connected"
                       " components in the graph. e.g.: --components_plot <output_file>")

parser.add_option("--plot_contigs_nobub", action="store_true", dest="plot_contigs_dist", default=False,
                  help="Only used after --bubbles and --components_plot, this will output the contigs "
                       "distribution of the graph after removing the bubble chains."
                       " e.g.: --plot_contigs_nobub <output_file_path.png>")

parser.add_option("--output_component", action="store", dest="output_components", default=None, type=str,
                  help="If used after --bubbles, then you only need to give it one argument <output_gfa>"
                       " .If it was used alone then you need to give <input_gfa> <k> as raw argument to main. "
                       "Example: python main.py <input_gfa> <k> --output_component <output_gfa>")

total_t_start = time.time()
(options, args) = parser.parse_args()

if options.in_graph is not None:
    if os.path.exists(options.in_graph):
        start = time.time()
        new_graph = Graph()
        print("Reading graph...")
        new_graph.read_gfa(options.in_graph)
        print("length of the graph is {}".format(len(new_graph)))
        end = time.time()
        print("Time took to read the file is {} seconds".format(end - start))

        if options.k_mer is not None:
            new_graph.k = options.k_mer
        else:
            print("You didn't give the value of k for the k-mers")
            parser.print_help()
            sys.exit()
    else:
        print("Could not find the input GFA file, check the path")
        parser.print_help()
        sys.exit()
else:
    parser.print_help()
    sys.exit()

if options.compact is not None:
    # new_graph = Graph()
    # new_graph.read_gfa(args[1])
    # new_graph.k = int(args[2])
    # k_1 = new_graph.k -1
    new_graph.nodes = compact_gfa.compact_graph(new_graph.nodes, new_graph.k)
    functions.write_gfa(nodes=new_graph.nodes, k=new_graph.k, output_file=options.compact)

#
# if len(sys.argv) == 1:
#     print("No options or arguments given\n")
#     parser.print_help()
#     sys.exit(0)

# new_graph = Graph()

# I might want to change this later that I only read the file inside the option I want to do
# if args[0].endswith("gfa"):
#     if os.path.isfile(args[0]):
#         print("Reading graph...")
#         new_graph.read_gfa(args[0])
#         new_graph.k = int(args[1]) - 1

if options.find_bubbles:

    print("Now finding Superbubbles")
    # new_graph.nodes_reset()
    find_superbubbles.find_superbubbles(new_graph, simple_bub=False)
    print("Finished finding Superbubbles\n")

    print("Calling find bubbles")
    # new_graph.find_bubbles()
    find_bubbles.find_bubbles(new_graph)

    print("Done finding bubbles")

    functions.merge_sb_with_bchains(new_graph)

    print("The number of singular bubbles is {}".format(new_graph.num_single_bubbles()))
    # small test
    # all_nodes = []
    # for chain in new_graph.bubble_chains.values():
    #     all_nodes += chain.list_chain()

    # for sb in new_graph.superbubbles.values():
    #     all_nodes += sb.list_superbubble()
    # print("all the nodes are {}".format(len(all_nodes)))
    print("The graph has {} nodes".format(len(new_graph.nodes)))
    # end of test
    longest = new_graph.max_chain()
    longest_chain = new_graph.longest_chain(new_graph.k)
    # print(longest_chain)
    print("Longest chain bubble-wise has {} bubbles and length"
          " of {}\n".format(longest[1], new_graph.bubble_chains[longest[0]].chain_length(new_graph.k)))
    print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest[0]].ends))

    print("Longest chain sequence-wise has {} bp and"
          " {} bubbles\n".format(longest_chain[1], new_graph.bubble_chains[longest_chain[0]].length()))
    print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest_chain[0]].ends))
    # print(new_graph.bubble_chains[longest].list_chain())
    print("The bubble chains covered {}% nodes in the graph".format(
        (new_graph.nodes_in_chains()*100)/len(new_graph.nodes))
    )
    k_1 = new_graph.k - 1

    print("length of the graph is {}".format(len(new_graph)))
    print(k_1)
    total_seq = k_1
    for n in new_graph.nodes.values():
        n.visited = False
        total_seq += (len(n.seq) - k_1)

    sb_coverage = new_graph.k - 1
    for sb in new_graph.superbubbles.values():
        sb_coverage += sb.length(k=new_graph.k)

    print("The percentage of sequences covered by superbubbles is {}%".format(
        (sb_coverage * 100) / float(total_seq)
    ))
    # print(all_nodes)
    print(new_graph.chains_coverage(new_graph.k))
    print(total_seq)
    print("The percentage of sequences covered by chains is {}%\n".format(
        ((new_graph.chains_coverage(new_graph.k) + k_1)*100)/float(total_seq)
    ))


    # sb_coverage = 0
    # for sb in new_graph.superbubbles.values():
    #     sb_coverage += sb.length(k=k_1)
    #
    # print("The percentage of sequences covered by Superbubbles is {}%").format(
    #     (sb_coverage*100)/float(total_seq)
    # )

    if options.plot is not None:
        import matplotlib.pyplot as plt

        new_graph.plots(k=new_graph.k, output_dir=options.plot)

        if options.comp_plot:

            print("in components plot")

            functions.remove_chains(new_graph)
            new_graph.nodes_reset()
            # pdb.set_trace()

            con_comp = find_components.connected_components(new_graph)
            # print(con_comp)
            comp_dist = []
            for cc in con_comp:
                comp_dist.append(len(cc))

            # print("the comp_dist set is {}".format(set(comp_dist)))
            plt.hist(comp_dist)
            plt.title("Connected Components Distribution", fontsize=12)
            plt.xlabel("Size of Connected Component", fontsize=9)
            plt.ylabel("Frequency of Connected Component", fontsize=9)
            plt.yscale('log', nonposy='clip')
            output_file = os.path.join(options.plot, "components_dis_plot.png")
            plt.savefig(output_file, dpi=200)
            plt.clf()

            if options.plot_contigs_dist:

                contigs_dist = []
                for node in new_graph.nodes.values():
                    contigs_dist.append(node.seq_len)

                len_of_contigs = sum(contigs_dist)
                # checking how many nodes are bigger than 2*k
                big_contigs = [x for x in contigs_dist if x > 122]
                big_contigs_sum = sum(big_contigs)
                print("The nodes bigger than 2k in the nodes not in bubbles make up for {}% of all the contigs "
                      "not in the bubbles".format((big_contigs_sum*100)/len_of_contigs))

                # pdb.set_trace()

                plt.hist(contigs_dist)
                plt.title("Contigs distribution without bubbles", fontsize=12)
                plt.xlabel("Contig Length", fontsize=9)
                plt.ylabel("Frequency of Contig", fontsize=9)
                plt.yscale('log', nonposy='clip')
                output_file = os.path.join(options.plot, "contigs_dis_without_bubbles.png")

                plt.savefig(output_file, dpi=200)
                plt.clf()

    # outputting bubble chains
    if options.output_bubbles is not None:
        print("in output_bubbles")
        functions.write_bubbles(new_graph, options.output_bubbles)
        # print(new_graph)

    # debugging prints
    # for chain in new_graph.bubble_chains:
    #     print(chain.list_chain())


if options.output_components is not None:
    con_comp = find_components.connected_components(new_graph)
    biggest_com = (0, 0)
    for idx, cc in enumerate(con_comp):
        if len(cc) > biggest_com[1]:
            biggest_com = (idx, len(cc))

    print("the number of connected components is {}".format(len(con_comp)))
    print("The biggest component is of size {}".format(biggest_com[1]))

    # need to check arguments' stuff
    print("writing biggest component gfa file")
    k_1 = new_graph.k - 1
    functions.write_gfa(nodes=new_graph.nodes, list_of_nodes=con_comp[biggest_com[0]],
                        output_file=options.output_components, k=k_1)

total_t_end = time.time()
print("The total time for the whole script to run was {}".format(total_t_end - total_t_start))
	import os
	import sys
	import time
	# import copy
	from Graph import Graph
	from optparse import OptionParser
	import find_components
	import functions
	import compact_gfa
	import find_bubbles
	import find_superbubbles
	import pdb


	parser = OptionParser()

	parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str,
	help="Give the gfa file destination here")

	parser.add_option("-k", "--k_mer", action="store", dest="k_mer", default=None, type=int,
	help="Give the K as integer here")

	parser.add_option("--compact", action="store", dest="compact", type=str, default=None,
	help="This option will compact your graph and output the compacted one, "
	"give the output destination after this option")

	parser.add_option("--bfs", action="store_true", dest="bfs_start", default=False,
	help="This option will do BFS from the start node given with a neighborhood of size n"
	", give the following arguments <input_gfa> <start_node_id> <n> <output_file>")

	parser.add_option("--find_bubbles", action="store_true", dest="find_bubbles", default=False,
	help="This option will find bubble chains and print the statistics to terminal, "
	"give the following arguments <input_gfa> <k>")

	parser.add_option("--out_bubbles", action="store", dest="output_bubbles", default=None, type=str,
	help="Only be used after --bubbles, will output only bubble chains in a separate GFA file "
	"the output file name is given after it. e.g.: --out_bubbles <output_gfa>")

	parser.add_option("--plot", action="store", dest="plot", default=None, type=str,
	help="Only used after --bubbles, outputs statistics plots in the given directory, "
	"e.g.: --plot <output_dir>")

	parser.add_option("--components_plot", action="store_true", dest="comp_plot", default=False,
	help="Only used after --bubbles --plot, gives a histogram of the distribution of connected"
	" components in the graph. e.g.: --components_plot <output_file>")

	parser.add_option("--plot_contigs_nobub", action="store_true", dest="plot_contigs_dist", default=False,
	help="Only used after --bubbles and --components_plot, this will output the contigs "
	"distribution of the graph after removing the bubble chains."
	" e.g.: --plot_contigs_nobub <output_file_path.png>")

	parser.add_option("--output_component", action="store", dest="output_components", default=None, type=str,
	help="If used after --bubbles, then you only need to give it one argument <output_gfa>"
	" .If it was used alone then you need to give <input_gfa> <k> as raw argument to main. "
	"Example: python main.py <input_gfa> <k> --output_component <output_gfa>")

	total_t_start = time.time()
	(options, args) = parser.parse_args()

	if options.in_graph is not None:
	if os.path.exists(options.in_graph):
	start = time.time()
	new_graph = Graph()
	print("Reading graph...")
	new_graph.read_gfa(options.in_graph)
	print("length of the graph is {}".format(len(new_graph)))
	end = time.time()
	print("Time took to read the file is {} seconds".format(end - start))

	if options.k_mer is not None:
	new_graph.k = options.k_mer
	else:
	print("You didn't give the value of k for the k-mers")
	parser.print_help()
	sys.exit()
	else:
	print("Could not find the input GFA file, check the path")
	parser.print_help()
	sys.exit()
	else:
	parser.print_help()
	sys.exit()

	if options.compact is not None:
	# new_graph = Graph()
	# new_graph.read_gfa(args[1])
	# new_graph.k = int(args[2])
	# k_1 = new_graph.k -1
	new_graph.nodes = compact_gfa.compact_graph(new_graph.nodes, new_graph.k)
	functions.write_gfa(nodes=new_graph.nodes, k=new_graph.k, output_file=options.compact)

	#
	# if len(sys.argv) == 1:
	# print("No options or arguments given\n")
	# parser.print_help()
	# sys.exit(0)

	# new_graph = Graph()

	# I might want to change this later that I only read the file inside the option I want to do
	# if args[0].endswith("gfa"):
	# if os.path.isfile(args[0]):
	# print("Reading graph...")
	# new_graph.read_gfa(args[0])
	# new_graph.k = int(args[1]) - 1

	if options.find_bubbles:

	print("Now finding Superbubbles")
	# new_graph.nodes_reset()
	find_superbubbles.find_superbubbles(new_graph, simple_bub=False)
	print("Finished finding Superbubbles\n")

	print("Calling find bubbles")
	# new_graph.find_bubbles()
	find_bubbles.find_bubbles(new_graph)

	print("Done finding bubbles")

	functions.merge_sb_with_bchains(new_graph)

	print("The number of singular bubbles is {}".format(new_graph.num_single_bubbles()))
	# small test
	# all_nodes = []
	# for chain in new_graph.bubble_chains.values():
	# all_nodes += chain.list_chain()

	# for sb in new_graph.superbubbles.values():
	# all_nodes += sb.list_superbubble()
	# print("all the nodes are {}".format(len(all_nodes)))
	print("The graph has {} nodes".format(len(new_graph.nodes)))
	# end of test
	longest = new_graph.max_chain()
	longest_chain = new_graph.longest_chain(new_graph.k)
	# print(longest_chain)
	print("Longest chain bubble-wise has {} bubbles and length"
	" of {}\n".format(longest[1], new_graph.bubble_chains[longest[0]].chain_length(new_graph.k)))
	print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest[0]].ends))

	print("Longest chain sequence-wise has {} bp and"
	" {} bubbles\n".format(longest_chain[1], new_graph.bubble_chains[longest_chain[0]].length()))
	print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest_chain[0]].ends))
	# print(new_graph.bubble_chains[longest].list_chain())
	print("The bubble chains covered {}% nodes in the graph".format(
	(new_graph.nodes_in_chains()*100)/len(new_graph.nodes))
	)
	k_1 = new_graph.k - 1

	print("length of the graph is {}".format(len(new_graph)))
	print(k_1)
	total_seq = k_1
	for n in new_graph.nodes.values():
	n.visited = False
	total_seq += (len(n.seq) - k_1)

	sb_coverage = new_graph.k - 1
	for sb in new_graph.superbubbles.values():
	sb_coverage += sb.length(k=new_graph.k)

	print("The percentage of sequences covered by superbubbles is {}%".format(
	(sb_coverage * 100) / float(total_seq)
	))
	# print(all_nodes)
	print(new_graph.chains_coverage(new_graph.k))
	print(total_seq)
	print("The percentage of sequences covered by chains is {}%\n".format(
	((new_graph.chains_coverage(new_graph.k) + k_1)*100)/float(total_seq)
	))



	# sb_coverage = 0
	# for sb in new_graph.superbubbles.values():
	# sb_coverage += sb.length(k=k_1)
	#
	# print("The percentage of sequences covered by Superbubbles is {}%").format(
	# (sb_coverage*100)/float(total_seq)
	# )

	if options.plot is not None:
	import matplotlib.pyplot as plt

	new_graph.plots(k=new_graph.k, output_dir=options.plot)

	if options.comp_plot:

	print("in components plot")

	functions.remove_chains(new_graph)
	new_graph.nodes_reset()
	# pdb.set_trace()

	con_comp = find_components.connected_components(new_graph)
	# print(con_comp)
	comp_dist = []
	for cc in con_comp:
	comp_dist.append(len(cc))

	# print("the comp_dist set is {}".format(set(comp_dist)))
	plt.hist(comp_dist)
	plt.title("Connected Components Distribution", fontsize=12)
	plt.xlabel("Size of Connected Component", fontsize=9)
	plt.ylabel("Frequency of Connected Component", fontsize=9)
	plt.yscale('log', nonposy='clip')
	output_file = os.path.join(options.plot, "components_dis_plot.png")
	plt.savefig(output_file, dpi=200)
	plt.clf()

	if options.plot_contigs_dist:

	contigs_dist = []
	for node in new_graph.nodes.values():
	contigs_dist.append(node.seq_len)

	len_of_contigs = sum(contigs_dist)
	# checking how many nodes are bigger than 2*k
	big_contigs = [x for x in contigs_dist if x > 122]
	big_contigs_sum = sum(big_contigs)
	print("The nodes bigger than 2k in the nodes not in bubbles make up for {}% of all the contigs "
	"not in the bubbles".format((big_contigs_sum*100)/len_of_contigs))

	# pdb.set_trace()

	plt.hist(contigs_dist)
	plt.title("Contigs distribution without bubbles", fontsize=12)
	plt.xlabel("Contig Length", fontsize=9)
	plt.ylabel("Frequency of Contig", fontsize=9)
	plt.yscale('log', nonposy='clip')
	output_file = os.path.join(options.plot, "contigs_dis_without_bubbles.png")

	plt.savefig(output_file, dpi=200)
	plt.clf()

	# outputting bubble chains
	if options.output_bubbles is not None:
	print("in output_bubbles")
	functions.write_bubbles(new_graph, options.output_bubbles)
	# print(new_graph)

	# debugging prints
	# for chain in new_graph.bubble_chains:
	# print(chain.list_chain())


	if options.output_components is not None:
	con_comp = find_components.connected_components(new_graph)
	biggest_com = (0, 0)
	for idx, cc in enumerate(con_comp):
	if len(cc) > biggest_com[1]:
	biggest_com = (idx, len(cc))

	print("the number of connected components is {}".format(len(con_comp)))
	print("The biggest component is of size {}".format(biggest_com[1]))

	# need to check arguments' stuff
	print("writing biggest component gfa file")
	k_1 = new_graph.k - 1
	functions.write_gfa(nodes=new_graph.nodes, list_of_nodes=con_comp[biggest_com[0]],
	output_file=options.output_components, k=k_1)

	total_t_end = time.time()
	print("The total time for the whole script to run was {}".format(total_t_end - total_t_start))