Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
4fad16beb9
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
181 lines (152 sloc) 6.03 KB
# this is to produce the plots for the thesis
# num of bubbles (total, bubbles that have the same size)
# distributions
# N50
import sys
import os
import time
import editdistance
import Graph
import find_bubbles
import find_superbubbles
from functions import merge_sb_with_bchains
import matplotlib.pyplot as plt
if len(sys.argv) < 4:
print("You need to give <in_gfa> <plots_directory> <k>")
sys.exit()
graph_name = sys.argv[1]
graph = Graph.Graph()
print("reading graph {}".format(graph_name))
graph.read_gfa(graph_name)
graph.k = int(sys.argv[3])
k = graph.k
node_dist = []
for node in graph.nodes.values():
node_dist.append(node.seq_len)
plt.hist(node_dist, bins=int(len(set(node_dist))/2))
plt.title("Nodes Length Distribution", fontsize=12)
plt.xlabel("Size of Nodes", fontsize=9)
plt.ylabel("Frequency of Node's size", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(sys.argv[2],"nodes_len_dist.png")
plt.savefig(output_file, dpi=200)
plt.clf()
start = time.time()
print("Looking for superbubbles")
find_superbubbles.find_superbubbles(graph)
end = time.time()
print("There are {} superbubbles".format(len(graph.superbubbles)))
print("The time it took for superbubbles is {} seconds".format(end-start))
sb_len_dist = []
for sb in graph.superbubbles.values():
sb_len_dist.append(len(sb.inside) + 2)
plt.hist(sb_len_dist, bins=int(len(set(sb_len_dist))/2))
plt.title("Superbubble Size Distribution", fontsize=12)
plt.xlabel("Size of Superbubble", fontsize=9)
plt.ylabel("Frequency of Superbubble sizes", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(sys.argv[2],"sb_len_dist.png")
plt.savefig(output_file, dpi=200)
plt.clf()
start = time.time()
find_bubbles.find_bubbles(graph)
end = time.time()
print("The time it took for bubbles is {} seconds".format(end-start))
count_bubbles = 0
count_equal_bubbles = 0
bchain_len_dist = []
chain_seq_len_dist = []
for chain in graph.bubble_chains.values():
for b in chain.bubbles["bubbles"]:
if b.equal_branches():
count_equal_bubbles += 1
count_bubbles += len(chain.bubbles["bubbles"])
bchain_len_dist.append(len(chain.bubbles["bubbles"]))
chain_seq_len_dist.append(chain.chain_length(k))
print("The total number of simple bubbles is {}".format(count_bubbles))
print("The number of simple bubbles with equal branches is {}".format(count_equal_bubbles))
print("The longest bubble chains is {}".format(max(bchain_len_dist)))
plt.hist(bchain_len_dist, bins=int(len(set(bchain_len_dist))/2))
plt.title("Bubble chains Size Distribution", fontsize=12)
plt.xlabel("Size of Bubble chains", fontsize=9)
plt.ylabel("Frequency of Bubble chain sizes", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(sys.argv[2],"bchain_len_dist.png")
plt.savefig(output_file, dpi=200)
plt.clf()
plt.hist(chain_seq_len_dist, bins=int(len(set(chain_seq_len_dist))/2))
plt.title("Bubble chains Length Distribution", fontsize=12)
plt.xlabel("Length of Bubble chains", fontsize=9)
plt.ylabel("Frequency of Bubble chain Lengths", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(sys.argv[2],"chain_seq_len_dist.png")
plt.savefig(output_file, dpi=200)
plt.clf()
edit_dist = []
for chain in graph.bubble_chains.values():
for bubble in chain.bubbles["bubbles"]:
x = bubble.edit_distance()
if x is not None:
edit_dist.append(x)
hamming_bubbles = []
for chain in graph.bubble_chains.values():
for bubble in chain.bubbles["bubbles"]:
x = bubble.hamming_dist()
if x is not None:
hamming_bubbles.append(x)
plt.hist(edit_dist, bins=int(len(set(edit_dist))*2))
plt.title("Edit Distance Bubble Distribution", fontsize=12)
plt.xlabel("Edit Distance", fontsize=9)
plt.ylabel("Frequency of bubbles", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(sys.argv[2],"edit_dist_distribution.png")
plt.savefig(output_file, dpi=200)
plt.clf()
plt.hist(hamming_bubbles, bins=int(len(set(hamming_bubbles))*2))
plt.title("Hamming Distance Bubble Distribution", fontsize=12)
plt.xlabel("Hamming Distance", fontsize=9)
plt.ylabel("Frequency of bubbles", fontsize=9)
plt.yscale('log', nonposy='clip')
output_file = os.path.join(sys.argv[2],"hamming_dist_distribution.png")
plt.savefig(output_file, dpi=200)
plt.clf()
zipped = list(zip(chain_seq_len_dist, bchain_len_dist))
zipped.sort(reverse=True)
seqlen_dist = [x[0] for x in zipped]
bubble_dist = [x[1] for x in zipped]
plt.subplot(211)
plt.plot(seqlen_dist)
plt.title("Chains sequence lengths", fontsize=12)
plt.subplot(212)
plt.plot(bubble_dist)
plt.xlabel("Chains bubble size", fontsize=12)
output_file = os.path.join(sys.argv[2], "bubbles_and_seq.png")
plt.savefig(output_file, dpi=250)
plt.clf()
current_len = 0
previous_len = len(graph.bubble_chains)
print("before merging there are {} bubble chains".format(len(graph.bubble_chains)))
while current_len != previous_len:
previous_len = len(graph.bubble_chains)
merge_sb_with_bchains(graph)
current_len = len(graph.bubble_chains)
print("after merging there are {} bubble chains".format(len(graph.bubble_chains)))
print("after merging there are {} superbubbles".format(len(graph.superbubbles)))
total_seq = 0
for n in graph.nodes.values():
total_seq += n.seq_len
longest = graph.max_chain()
longest_chain = graph.longest_chain(graph.k)
print("longest chain bubble-wise has {} bubbles and length of {} \n".format(longest[1], graph.bubble_chains[longest[0]].chain_length(graph.k)))
print("longest chain sequence-wise has {} bp and {} bubbles \n".format(longest_chain[1], graph.bubble_chains[longest_chain[0]].chain_length(graph.k)))
print("The bubble chains covered {}% nodes in the graph".format(
(graph.nodes_in_chains()*100)/len(graph.nodes)))
k_1 = graph.k -1
print("the percentage of sequences covered by chains is {}%\n".format(
((graph.chains_coverage(graph.k) + k_1)*100)/float(total_seq)))
sb_coverage = graph.k -1
for sb in graph.superbubbles.values():
sb_coverage += sb.length(k=graph.k)
print("the percentage of sequences covered by superbubbles is {}\n".format(
(sb_coverage * 100) / float(total_seq)))
# superbubble object has source, sink, inside to count the nodes