This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
thesis_tm/output_aligned_reads_dict.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
272 lines (235 sloc)
9.35 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import stream | |
import pickle | |
import time | |
from vg_pb2 import Alignment | |
class Node: | |
def __init__(self, identifier): | |
self.id = identifier | |
self.seq = "" | |
self.seq_len = 0 | |
self.start = [] | |
self.end = [] | |
self.visited = False | |
self.which_chain = 0 | |
self.which_sb = 0 | |
self.which_b = 0 | |
self.which_allele = -1 | |
def read_gfa(gfa_file_path, modified=False): | |
""" | |
:param gfa_file_path: gfa graph file. | |
:param modified: if I'm reading my modified GFA with extra information for the nodes | |
:return: Dictionary of node ids and Node objects. | |
""" | |
if not os.path.exists(gfa_file_path): | |
print("the gfa file path you gave does not exists, please try again!") | |
sys.exit() | |
bubble_chains = dict() | |
nodes = dict() | |
edges = [] | |
with open(gfa_file_path, "r") as lines: | |
for line in lines: | |
if line.startswith("S"): | |
if modified: | |
line = line.split("\t") | |
n_id = int(line[1]) | |
nodes[n_id] = Node(n_id) | |
nodes[n_id].seq_len = len(line[2]) | |
nodes[n_id].seq = str(line[2]) | |
# this is my extra columns | |
# constructed as which_chain:which_sb:which_b:which_allele | |
# e.g. 5:0:3:1 (chain 5, not part of a superbubble, bubble 3, allele 1) | |
specifications = str(line[3]) | |
specifications = specifications.split(":") | |
nodes[n_id].which_chain = int(specifications[0]) | |
nodes[n_id].which_sb = int(specifications[1]) | |
nodes[n_id].which_b = int(specifications[2]) | |
nodes[n_id].which_allele = int(specifications[3]) | |
if nodes[n_id].which_chain not in bubble_chains: | |
bubble_chains[nodes[n_id].which_chain] = {nodes[n_id].which_b: [n_id]} | |
else: | |
if nodes[n_id].which_b not in bubble_chains[nodes[n_id].which_chain]: | |
bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b] = [n_id] | |
else: | |
bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b].append(n_id) | |
else: | |
line = line.split() | |
n_id = int(line[1]) | |
n_len = len(line[2]) | |
nodes[n_id] = Node(n_id) | |
nodes[n_id].seq_len = n_len | |
nodes[n_id].seq = str(line[2]) | |
elif line.startswith("L"): | |
continue | |
# for e in edges: | |
# line = e.split() | |
# | |
# k = int(line[1]) | |
# neighbor = int(line[3]) | |
# if (k not in nodes) or (neighbor not in nodes): | |
# continue | |
# | |
# if line[2] == "-": | |
# from_start = True | |
# else: | |
# from_start = False | |
# | |
# if line[4] == "-": | |
# to_end = True | |
# else: | |
# to_end = False | |
# | |
# if from_start is True and to_end is True: # from start to end L x - y - | |
# if (neighbor, 1) not in nodes[k].start: | |
# nodes[k].start.append((neighbor, 1)) | |
# if (k, 0) not in nodes[neighbor].end: | |
# nodes[neighbor].end.append((k, 0)) | |
# | |
# elif from_start is True and to_end is False: # from start to start L x - y + | |
# | |
# if (neighbor, 0) not in nodes[k].start: | |
# nodes[k].start.append((neighbor, 0)) | |
# | |
# if (k, 0) not in nodes[neighbor].start: | |
# nodes[neighbor].start.append((k, 0)) | |
# | |
# elif from_start is False and to_end is False: # from end to start L x + y + | |
# if (neighbor, 0) not in nodes[k].end: | |
# nodes[k].end.append((neighbor, 0)) | |
# | |
# if (k, 1) not in nodes[neighbor].start: | |
# nodes[neighbor].start.append((k, 1)) | |
# | |
# elif from_start is False and to_end is True: # from end to end L x + y - | |
# if (neighbor, 1) not in nodes[k].end: | |
# nodes[k].end.append((neighbor, 1)) | |
# | |
# if (k, 1) not in nodes[neighbor].end: | |
# nodes[neighbor].end.append((k, 1)) | |
return nodes, bubble_chains | |
def build_reads(nodes, gam_file_path): | |
""" | |
:param nodes: dictionary of nodes objects | |
:param gam_file_path: gam file path | |
:return: returns a dictionary of readsets, where each read set object represent a bubble chain | |
""" | |
with stream.open(str(gam_file_path), "rb") as instream: | |
chains = dict() | |
counter = 0 | |
for data in instream: | |
counter += 1 | |
if (counter % 1500000) == 0: | |
print("Processed {} reads".format(counter)) | |
g = Alignment() | |
g.ParseFromString(data) | |
# construct read object | |
# read = Read(g.name) # leave other values as default | |
# if g.name not in reads: | |
# reads[g.name] = dict() | |
for m in g.path.mapping: | |
n_id = int(m.position.node_id) | |
if n_id not in nodes: # in case one of the nodes wasn't in the graph | |
continue | |
if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1): | |
if nodes[n_id].which_chain not in chains: | |
chains[nodes[n_id].which_chain] = dict() | |
chains[nodes[n_id].which_chain][g.name] = [n_id] | |
else: | |
if g.name not in chains[nodes[n_id].which_chain]: | |
chains[nodes[n_id].which_chain][g.name] = [n_id] | |
else: | |
chains[nodes[n_id].which_chain][g.name].append(n_id) | |
# if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1): | |
# if nodes[n_id].which_chain in reads[g.name]: | |
# reads[g.name][nodes[n_id].which_chain].append(n_id) | |
# else: | |
# reads[g.name][nodes[n_id].which_chain] = [n_id] | |
return chains | |
def dump_pickled_dict(reads, file_name='aligned_reads_dict.pickle'): | |
with open(file_name, 'wb') as handle: | |
pickle.dump(reads, handle, protocol=pickle.HIGHEST_PROTOCOL) | |
def read_pickled_dict(file_name="aligned_reads_dict.pickle"): | |
with open(file_name, 'rb') as handle: | |
reads = pickle.load(handle) | |
return reads | |
def bubble_chain_length(nodes, bubble_chains, k, read): | |
all_lengths = dict() | |
maxim = 0 | |
for chain in read.keys(): | |
all_lengths[chain] = 0 | |
list_of_bubbles = set() | |
for node in read[chain]: | |
list_of_bubbles.add(nodes[node].which_b) | |
for b in list_of_bubbles: | |
for n in bubble_chains[chain][b]: | |
all_lengths[chain] += nodes[n].seq_len - k | |
if all_lengths[chain] > maxim: | |
maxim = all_lengths[chain] | |
for k in all_lengths.keys(): | |
if all_lengths[k] < maxim: | |
del read[k] | |
return read | |
def return_maximal(read): | |
all_lengths = {k: len(v) for (k, v) in read.items()} | |
maxim = 0 | |
for v in all_lengths.values(): | |
if maxim < v: | |
maxim = v | |
for k in all_lengths.keys(): | |
if all_lengths[k] < maxim: | |
del read[k] | |
return read | |
############################################################################### | |
if __name__ == "__main__": | |
if len(sys.argv) < 4: | |
print("You need to give the GFA file then the k-mer length then the GAM file as arguments") | |
sys.exit(1) | |
start = time.time() | |
print("reading nodes...") | |
nodes, bubble_chains = read_gfa(sys.argv[1], modified=True) | |
end = time.time() | |
print("took {} seconds".format(end - start)) | |
print("reading gam file and building readsets...") | |
chains = build_reads(nodes, sys.argv[3]) | |
end = time.time() | |
print("took {} seconds".format(end - start)) | |
# | |
print("filtering read sets...") | |
for chain_n, chain in chains.items(): | |
reads_to_remove = [] | |
for read_name, read in chain.items(): | |
if len(read) < 2: # read covered just one bubble or none we remove it | |
reads_to_remove.append(read_name) | |
for r in reads_to_remove: | |
del chain[r] | |
# reads_to_break_tie = [] | |
# for read_key, read in reads.items(): | |
# return_maximal(read) | |
# bubble_chain_length(nodes, bubble_chains, int(sys.argv[2]), read) | |
# if len(read) > 1: | |
# # if there's still a tie, I'll just take the first chain it belongs too | |
# reads_to_break_tie.append(read_key) | |
# elif len(read) == 0: | |
# reads_to_remove.append(read_key) | |
# | |
# for k in reads_to_break_tie: | |
# reads[k] = {list(reads[k].keys())[0]: reads[k][list(reads[k].keys())[0]]} | |
# | |
# reads_branching_dist = dict() | |
# | |
# for read in reads.values(): | |
# | |
# if len(read) not in reads_branching_dist: | |
# reads_branching_dist[len(read)] = 1 | |
# else: | |
# reads_branching_dist[len(read)] += 1 | |
# | |
# for k in reads_to_remove: | |
# del reads[k] | |
# end = time.time() | |
# print("took {} seconds".format(end - start)) | |
print("dumping pickled reads...") | |
dump_pickled_dict(chains) | |
end = time.time() | |
print("took {} seconds".format(end - start)) |