This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
thesis_tm/main.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
262 lines (209 sloc)
10.3 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import time | |
# import copy | |
from Graph import Graph | |
from optparse import OptionParser | |
import find_components | |
import functions | |
import compact_gfa | |
import find_bubbles | |
import find_superbubbles | |
import pdb | |
parser = OptionParser() | |
parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str, | |
help="Give the gfa file destination here") | |
parser.add_option("-k", "--k_mer", action="store", dest="k_mer", default=None, type=int, | |
help="Give the K as integer here") | |
parser.add_option("--compact", action="store", dest="compact", type=str, default=None, | |
help="This option will compact your graph and output the compacted one, " | |
"give the output destination after this option") | |
parser.add_option("--bfs", action="store_true", dest="bfs_start", default=False, | |
help="This option will do BFS from the start node given with a neighborhood of size n" | |
", give the following arguments <input_gfa> <start_node_id> <n> <output_file>") | |
parser.add_option("--find_bubbles", action="store_true", dest="find_bubbles", default=False, | |
help="This option will find bubble chains and print the statistics to terminal, " | |
"give the following arguments <input_gfa> <k>") | |
parser.add_option("--out_bubbles", action="store", dest="output_bubbles", default=None, type=str, | |
help="Only be used after --bubbles, will output only bubble chains in a separate GFA file " | |
"the output file name is given after it. e.g.: --out_bubbles <output_gfa>") | |
parser.add_option("--plot", action="store", dest="plot", default=None, type=str, | |
help="Only used after --bubbles, outputs statistics plots in the given directory, " | |
"e.g.: --plot <output_dir>") | |
parser.add_option("--components_plot", action="store_true", dest="comp_plot", default=False, | |
help="Only used after --bubbles --plot, gives a histogram of the distribution of connected" | |
" components in the graph. e.g.: --components_plot <output_file>") | |
parser.add_option("--plot_contigs_nobub", action="store_true", dest="plot_contigs_dist", default=False, | |
help="Only used after --bubbles and --components_plot, this will output the contigs " | |
"distribution of the graph after removing the bubble chains." | |
" e.g.: --plot_contigs_nobub <output_file_path.png>") | |
parser.add_option("--output_component", action="store", dest="output_components", default=None, type=str, | |
help="If used after --bubbles, then you only need to give it one argument <output_gfa>" | |
" .If it was used alone then you need to give <input_gfa> <k> as raw argument to main. " | |
"Example: python main.py <input_gfa> <k> --output_component <output_gfa>") | |
total_t_start = time.time() | |
(options, args) = parser.parse_args() | |
if options.in_graph is not None: | |
if os.path.exists(options.in_graph): | |
start = time.time() | |
new_graph = Graph() | |
print("Reading graph...") | |
new_graph.read_gfa(options.in_graph) | |
print("length of the graph is {}".format(len(new_graph))) | |
end = time.time() | |
print("Time took to read the file is {} seconds".format(end - start)) | |
if options.k_mer is not None: | |
new_graph.k = options.k_mer | |
else: | |
print("You didn't give the value of k for the k-mers") | |
parser.print_help() | |
sys.exit() | |
else: | |
print("Could not find the input GFA file, check the path") | |
parser.print_help() | |
sys.exit() | |
else: | |
parser.print_help() | |
sys.exit() | |
if options.compact is not None: | |
# new_graph = Graph() | |
# new_graph.read_gfa(args[1]) | |
# new_graph.k = int(args[2]) | |
# k_1 = new_graph.k -1 | |
new_graph.nodes = compact_gfa.compact_graph(new_graph.nodes, new_graph.k) | |
functions.write_gfa(nodes=new_graph.nodes, k=new_graph.k, output_file=options.compact) | |
# | |
# if len(sys.argv) == 1: | |
# print("No options or arguments given\n") | |
# parser.print_help() | |
# sys.exit(0) | |
# new_graph = Graph() | |
# I might want to change this later that I only read the file inside the option I want to do | |
# if args[0].endswith("gfa"): | |
# if os.path.isfile(args[0]): | |
# print("Reading graph...") | |
# new_graph.read_gfa(args[0]) | |
# new_graph.k = int(args[1]) - 1 | |
if options.find_bubbles: | |
print("Now finding Superbubbles") | |
# new_graph.nodes_reset() | |
find_superbubbles.find_superbubbles(new_graph, simple_bub=False) | |
print("Finished finding Superbubbles\n") | |
print("Calling find bubbles") | |
# new_graph.find_bubbles() | |
find_bubbles.find_bubbles(new_graph) | |
print("Done finding bubbles") | |
functions.merge_sb_with_bchains(new_graph) | |
print("The number of singular bubbles is {}".format(new_graph.num_single_bubbles())) | |
# small test | |
# all_nodes = [] | |
# for chain in new_graph.bubble_chains.values(): | |
# all_nodes += chain.list_chain() | |
# for sb in new_graph.superbubbles.values(): | |
# all_nodes += sb.list_superbubble() | |
# print("all the nodes are {}".format(len(all_nodes))) | |
print("The graph has {} nodes".format(len(new_graph.nodes))) | |
# end of test | |
longest = new_graph.max_chain() | |
longest_chain = new_graph.longest_chain(new_graph.k) | |
# print(longest_chain) | |
print("Longest chain bubble-wise has {} bubbles and length" | |
" of {}\n".format(longest[1], new_graph.bubble_chains[longest[0]].chain_length(new_graph.k))) | |
print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest[0]].ends)) | |
print("Longest chain sequence-wise has {} bp and" | |
" {} bubbles\n".format(longest_chain[1], new_graph.bubble_chains[longest_chain[0]].length())) | |
print("And the ends of the longest chains are {}".format(new_graph.bubble_chains[longest_chain[0]].ends)) | |
# print(new_graph.bubble_chains[longest].list_chain()) | |
print("The bubble chains covered {}% nodes in the graph".format( | |
(new_graph.nodes_in_chains()*100)/len(new_graph.nodes)) | |
) | |
k_1 = new_graph.k - 1 | |
print("length of the graph is {}".format(len(new_graph))) | |
print(k_1) | |
total_seq = k_1 | |
for n in new_graph.nodes.values(): | |
n.visited = False | |
total_seq += (len(n.seq) - k_1) | |
sb_coverage = new_graph.k - 1 | |
for sb in new_graph.superbubbles.values(): | |
sb_coverage += sb.length(k=new_graph.k) | |
print("The percentage of sequences covered by superbubbles is {}%".format( | |
(sb_coverage * 100) / float(total_seq) | |
)) | |
# print(all_nodes) | |
print(new_graph.chains_coverage(new_graph.k)) | |
print(total_seq) | |
print("The percentage of sequences covered by chains is {}%\n".format( | |
((new_graph.chains_coverage(new_graph.k) + k_1)*100)/float(total_seq) | |
)) | |
# sb_coverage = 0 | |
# for sb in new_graph.superbubbles.values(): | |
# sb_coverage += sb.length(k=k_1) | |
# | |
# print("The percentage of sequences covered by Superbubbles is {}%").format( | |
# (sb_coverage*100)/float(total_seq) | |
# ) | |
if options.plot is not None: | |
import matplotlib.pyplot as plt | |
new_graph.plots(k=new_graph.k, output_dir=options.plot) | |
if options.comp_plot: | |
print("in components plot") | |
functions.remove_chains(new_graph) | |
new_graph.nodes_reset() | |
# pdb.set_trace() | |
con_comp = find_components.connected_components(new_graph) | |
# print(con_comp) | |
comp_dist = [] | |
for cc in con_comp: | |
comp_dist.append(len(cc)) | |
# print("the comp_dist set is {}".format(set(comp_dist))) | |
plt.hist(comp_dist) | |
plt.title("Connected Components Distribution", fontsize=12) | |
plt.xlabel("Size of Connected Component", fontsize=9) | |
plt.ylabel("Frequency of Connected Component", fontsize=9) | |
plt.yscale('log', nonposy='clip') | |
output_file = os.path.join(options.plot, "components_dis_plot.png") | |
plt.savefig(output_file, dpi=200) | |
plt.clf() | |
if options.plot_contigs_dist: | |
contigs_dist = [] | |
for node in new_graph.nodes.values(): | |
contigs_dist.append(node.seq_len) | |
len_of_contigs = sum(contigs_dist) | |
# checking how many nodes are bigger than 2*k | |
big_contigs = [x for x in contigs_dist if x > 122] | |
big_contigs_sum = sum(big_contigs) | |
print("The nodes bigger than 2k in the nodes not in bubbles make up for {}% of all the contigs " | |
"not in the bubbles".format((big_contigs_sum*100)/len_of_contigs)) | |
# pdb.set_trace() | |
plt.hist(contigs_dist) | |
plt.title("Contigs distribution without bubbles", fontsize=12) | |
plt.xlabel("Contig Length", fontsize=9) | |
plt.ylabel("Frequency of Contig", fontsize=9) | |
plt.yscale('log', nonposy='clip') | |
output_file = os.path.join(options.plot, "contigs_dis_without_bubbles.png") | |
plt.savefig(output_file, dpi=200) | |
plt.clf() | |
# outputting bubble chains | |
if options.output_bubbles is not None: | |
print("in output_bubbles") | |
functions.write_bubbles(new_graph, options.output_bubbles) | |
# print(new_graph) | |
# debugging prints | |
# for chain in new_graph.bubble_chains: | |
# print(chain.list_chain()) | |
if options.output_components is not None: | |
con_comp = find_components.connected_components(new_graph) | |
biggest_com = (0, 0) | |
for idx, cc in enumerate(con_comp): | |
if len(cc) > biggest_com[1]: | |
biggest_com = (idx, len(cc)) | |
print("the number of connected components is {}".format(len(con_comp))) | |
print("The biggest component is of size {}".format(biggest_com[1])) | |
# need to check arguments' stuff | |
print("writing biggest component gfa file") | |
k_1 = new_graph.k - 1 | |
functions.write_gfa(nodes=new_graph.nodes, list_of_nodes=con_comp[biggest_com[0]], | |
output_file=options.output_components, k=k_1) | |
total_t_end = time.time() | |
print("The total time for the whole script to run was {}".format(total_t_end - total_t_start)) |