This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
thesis_tm/plots_for_thesis_only_bubbles.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
187 lines (156 sloc)
6.08 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this is to produce the plots for the thesis | |
# num of bubbles (total, bubbles that have the same size) | |
# distributions | |
# N50 | |
import sys | |
import os | |
import time | |
import editdistance | |
import Graph | |
import find_bubbles | |
import find_superbubbles | |
from functions import merge_sb_with_bchains | |
import matplotlib.pyplot as plt | |
if len(sys.argv) < 4: | |
print("You need to give <in_gfa> <plots_directory> <k>") | |
sys.exit() | |
graph_name = sys.argv[1] | |
graph = Graph.Graph() | |
print("reading graph {}".format(graph_name)) | |
graph.read_gfa(graph_name) | |
graph.k = int(sys.argv[3]) | |
k = graph.k | |
node_dist = [] | |
for node in graph.nodes.values(): | |
node_dist.append(node.seq_len) | |
plt.hist(node_dist, bins=int(len(set(node_dist))/2)) | |
plt.title("Nodes Length Distribution", fontsize=12) | |
plt.xlabel("Size of Nodes", fontsize=9) | |
plt.ylabel("Frequency of Node's size", fontsize=9) | |
plt.yscale('log', nonposy='clip') | |
output_file = os.path.join(sys.argv[2],"nodes_len_dist.png") | |
plt.savefig(output_file, dpi=200) | |
plt.clf() | |
# start = time.time() | |
# print("Looking for superbubbles") | |
# find_superbubbles.find_superbubbles(graph) | |
# end = time.time() | |
# print("There are {} superbubbles".format(len(graph.superbubbles))) | |
# print("The time it took for superbubbles is {} seconds".format(end-start)) | |
# sb_len_dist = [] | |
# for sb in graph.superbubbles.values(): | |
# sb_len_dist.append(len(sb.inside) + 2) | |
# plt.hist(sb_len_dist, bins=int(len(set(sb_len_dist))/2)) | |
# plt.title("Superbubble Size Distribution", fontsize=12) | |
# plt.xlabel("Size of Superbubble", fontsize=9) | |
# plt.ylabel("Frequency of Superbubble sizes", fontsize=9) | |
# plt.yscale('log', nonposy='clip') | |
# output_file = os.path.join(sys.argv[2],"sb_len_dist.png") | |
# plt.savefig(output_file, dpi=200) | |
# plt.clf() | |
start = time.time() | |
find_bubbles.find_bubbles(graph) | |
end = time.time() | |
print("The time it took for bubbles is {} seconds".format(end-start)) | |
count_bubbles = 0 | |
count_equal_bubbles = 0 | |
bchain_len_dist = [] | |
chain_seq_len_dist = [] | |
for chain in graph.bubble_chains.values(): | |
for b in chain.bubbles["bubbles"]: | |
if b.equal_branches(): | |
count_equal_bubbles += 1 | |
count_bubbles += len(chain.bubbles["bubbles"]) | |
bchain_len_dist.append(len(chain.bubbles["bubbles"])) | |
chain_seq_len_dist.append(chain.chain_length(k)) | |
print("The total number of simple bubbles is {}".format(count_bubbles)) | |
print("The number of simple bubbles with equal branches is {}".format(count_equal_bubbles)) | |
print("The longest bubble chains is {}".format(max(bchain_len_dist))) | |
''' | |
plt.hist(bchain_len_dist, bins=int(len(set(bchain_len_dist))/2)) | |
plt.title("Bubble chains Size Distribution", fontsize=12) | |
plt.xlabel("Size of Bubble chains", fontsize=9) | |
plt.ylabel("Frequency of Bubble chain sizes", fontsize=9) | |
plt.yscale('log', nonposy='clip') | |
output_file = os.path.join(sys.argv[2],"bchain_len_dist.png") | |
plt.savefig(output_file, dpi=200) | |
plt.clf() | |
plt.hist(chain_seq_len_dist, bins=int(len(set(chain_seq_len_dist))/2)) | |
plt.title("Bubble chains Length Distribution", fontsize=12) | |
plt.xlabel("Length of Bubble chains", fontsize=9) | |
plt.ylabel("Frequency of Bubble chain Lengths", fontsize=9) | |
plt.yscale('log', nonposy='clip') | |
output_file = os.path.join(sys.argv[2],"chain_seq_len_dist.png") | |
plt.savefig(output_file, dpi=200) | |
plt.clf() | |
edit_dist = [] | |
for chain in graph.bubble_chains.values(): | |
for bubble in chain.bubbles["bubbles"]: | |
x = bubble.edit_distance() | |
if x is not None: | |
edit_dist.append(x) | |
hamming_bubbles = [] | |
for chain in graph.bubble_chains.values(): | |
for bubble in chain.bubbles["bubbles"]: | |
x = bubble.hamming_dist() | |
if x is not None: | |
hamming_bubbles.append(x) | |
plt.hist(edit_dist, bins=int(len(set(edit_dist))*2)) | |
plt.title("Edit Distance Bubble Distribution", fontsize=12) | |
plt.xlabel("Edit Distance", fontsize=9) | |
plt.ylabel("Frequency of bubbles", fontsize=9) | |
plt.yscale('log', nonposy='clip') | |
output_file = os.path.join(sys.argv[2],"edit_dist_distribution.png") | |
plt.savefig(output_file, dpi=200) | |
plt.clf() | |
plt.hist(hamming_bubbles, bins=int(len(set(hamming_bubbles))*2)) | |
plt.title("Hamming Distance Bubble Distribution", fontsize=12) | |
plt.xlabel("Hamming Distance", fontsize=9) | |
plt.ylabel("Frequency of bubbles", fontsize=9) | |
plt.yscale('log', nonposy='clip') | |
output_file = os.path.join(sys.argv[2],"hamming_dist_distribution.png") | |
plt.savefig(output_file, dpi=200) | |
plt.clf() | |
zipped = list(zip(chain_seq_len_dist, bchain_len_dist)) | |
zipped.sort(reverse=True) | |
seqlen_dist = [x[0] for x in zipped] | |
bubble_dist = [x[1] for x in zipped] | |
plt.subplot(211) | |
plt.plot(seqlen_dist) | |
plt.title("Chains sequence lengths", fontsize=12) | |
plt.subplot(212) | |
plt.plot(bubble_dist) | |
plt.xlabel("Chains bubble size", fontsize=12) | |
output_file = os.path.join(sys.argv[2], "bubbles_and_seq.png") | |
plt.savefig(output_file, dpi=250) | |
plt.clf() | |
''' | |
''' | |
current_len = 0 | |
previous_len = len(graph.bubble_chains) | |
print("before merging there are {} bubble chains".format(len(graph.bubble_chains))) | |
while current_len != previous_len: | |
previous_len = len(graph.bubble_chains) | |
merge_sb_with_bchains(graph) | |
current_len = len(graph.bubble_chains) | |
print("after merging there are {} bubble chains".format(len(graph.bubble_chains))) | |
print("after merging there are {} superbubbles".format(len(graph.superbubbles))) | |
total_seq = 0 | |
for n in graph.nodes.values(): | |
total_seq += n.seq_len | |
longest = graph.max_chain() | |
longest_chain = graph.longest_chain(graph.k) | |
print("longest chain bubble-wise has {} bubbles and length of {} \n".format(longest[1], graph.bubble_chains[longest[0]].chain_length(graph.k))) | |
print("longest chain sequence-wise has {} bp and {} bubbles \n".format(longest_chain[1], graph.bubble_chains[longest_chain[0]].chain_length(graph.k))) | |
print("The bubble chains covered {}% nodes in the graph".format( | |
(graph.nodes_in_chains()*100)/len(graph.nodes))) | |
k_1 = graph.k -1 | |
print("the percentage of sequences covered by chains is {}%\n".format( | |
((graph.chains_coverage(graph.k) + k_1)*100)/float(total_seq))) | |
sb_coverage = graph.k -1 | |
for sb in graph.superbubbles.values(): | |
sb_coverage += sb.length(k=graph.k) | |
print("the percentage of sequences covered by superbubbles is {}\n".format( | |
(sb_coverage * 100) / float(total_seq))) | |
# superbubble object has source, sink, inside to count the nodes | |
''' | |