Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import os
import sys
import stream
import pickle
import time
from vg_pb2 import Alignment
class Node:
def __init__(self, identifier):
self.id = identifier
self.seq = ""
self.seq_len = 0
self.start = []
self.end = []
self.visited = False
self.which_chain = 0
self.which_sb = 0
self.which_b = 0
self.which_allele = -1
def read_gfa(gfa_file_path, modified=False):
"""
:param gfa_file_path: gfa graph file.
:param modified: if I'm reading my modified GFA with extra information for the nodes
:return: Dictionary of node ids and Node objects.
"""
if not os.path.exists(gfa_file_path):
print("the gfa file path you gave does not exists, please try again!")
sys.exit()
bubble_chains = dict()
nodes = dict()
edges = []
with open(gfa_file_path, "r") as lines:
for line in lines:
if line.startswith("S"):
if modified:
line = line.split("\t")
n_id = int(line[1])
nodes[n_id] = Node(n_id)
nodes[n_id].seq_len = len(line[2])
nodes[n_id].seq = str(line[2])
# this is my extra columns
# constructed as which_chain:which_sb:which_b:which_allele
# e.g. 5:0:3:1 (chain 5, not part of a superbubble, bubble 3, allele 1)
specifications = str(line[3])
specifications = specifications.split(":")
nodes[n_id].which_chain = int(specifications[0])
nodes[n_id].which_sb = int(specifications[1])
nodes[n_id].which_b = int(specifications[2])
nodes[n_id].which_allele = int(specifications[3])
if nodes[n_id].which_chain not in bubble_chains:
bubble_chains[nodes[n_id].which_chain] = {nodes[n_id].which_b: [n_id]}
else:
if nodes[n_id].which_b not in bubble_chains[nodes[n_id].which_chain]:
bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b] = [n_id]
else:
bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b].append(n_id)
else:
line = line.split()
n_id = int(line[1])
n_len = len(line[2])
nodes[n_id] = Node(n_id)
nodes[n_id].seq_len = n_len
nodes[n_id].seq = str(line[2])
elif line.startswith("L"):
continue
# for e in edges:
# line = e.split()
#
# k = int(line[1])
# neighbor = int(line[3])
# if (k not in nodes) or (neighbor not in nodes):
# continue
#
# if line[2] == "-":
# from_start = True
# else:
# from_start = False
#
# if line[4] == "-":
# to_end = True
# else:
# to_end = False
#
# if from_start is True and to_end is True: # from start to end L x - y -
# if (neighbor, 1) not in nodes[k].start:
# nodes[k].start.append((neighbor, 1))
# if (k, 0) not in nodes[neighbor].end:
# nodes[neighbor].end.append((k, 0))
#
# elif from_start is True and to_end is False: # from start to start L x - y +
#
# if (neighbor, 0) not in nodes[k].start:
# nodes[k].start.append((neighbor, 0))
#
# if (k, 0) not in nodes[neighbor].start:
# nodes[neighbor].start.append((k, 0))
#
# elif from_start is False and to_end is False: # from end to start L x + y +
# if (neighbor, 0) not in nodes[k].end:
# nodes[k].end.append((neighbor, 0))
#
# if (k, 1) not in nodes[neighbor].start:
# nodes[neighbor].start.append((k, 1))
#
# elif from_start is False and to_end is True: # from end to end L x + y -
# if (neighbor, 1) not in nodes[k].end:
# nodes[k].end.append((neighbor, 1))
#
# if (k, 1) not in nodes[neighbor].end:
# nodes[neighbor].end.append((k, 1))
return nodes, bubble_chains
def build_reads(nodes, gam_file_path):
"""
:param nodes: dictionary of nodes objects
:param gam_file_path: gam file path
:return: returns a dictionary of readsets, where each read set object represent a bubble chain
"""
with stream.open(str(gam_file_path), "rb") as instream:
chains = dict()
counter = 0
for data in instream:
counter += 1
if (counter % 1500000) == 0:
print("Processed {} reads".format(counter))
g = Alignment()
g.ParseFromString(data)
# construct read object
# read = Read(g.name) # leave other values as default
# if g.name not in reads:
# reads[g.name] = dict()
for m in g.path.mapping:
n_id = int(m.position.node_id)
if n_id not in nodes: # in case one of the nodes wasn't in the graph
continue
if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1):
if nodes[n_id].which_chain not in chains:
chains[nodes[n_id].which_chain] = dict()
chains[nodes[n_id].which_chain][g.name] = [n_id]
else:
if g.name not in chains[nodes[n_id].which_chain]:
chains[nodes[n_id].which_chain][g.name] = [n_id]
else:
chains[nodes[n_id].which_chain][g.name].append(n_id)
# if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1):
# if nodes[n_id].which_chain in reads[g.name]:
# reads[g.name][nodes[n_id].which_chain].append(n_id)
# else:
# reads[g.name][nodes[n_id].which_chain] = [n_id]
return chains
def dump_pickled_dict(reads, file_name='aligned_reads_dict.pickle'):
with open(file_name, 'wb') as handle:
pickle.dump(reads, handle, protocol=pickle.HIGHEST_PROTOCOL)
def read_pickled_dict(file_name="aligned_reads_dict.pickle"):
with open(file_name, 'rb') as handle:
reads = pickle.load(handle)
return reads
def bubble_chain_length(nodes, bubble_chains, k, read):
all_lengths = dict()
maxim = 0
for chain in read.keys():
all_lengths[chain] = 0
list_of_bubbles = set()
for node in read[chain]:
list_of_bubbles.add(nodes[node].which_b)
for b in list_of_bubbles:
for n in bubble_chains[chain][b]:
all_lengths[chain] += nodes[n].seq_len - k
if all_lengths[chain] > maxim:
maxim = all_lengths[chain]
for k in all_lengths.keys():
if all_lengths[k] < maxim:
del read[k]
return read
def return_maximal(read):
all_lengths = {k: len(v) for (k, v) in read.items()}
maxim = 0
for v in all_lengths.values():
if maxim < v:
maxim = v
for k in all_lengths.keys():
if all_lengths[k] < maxim:
del read[k]
return read
###############################################################################
if __name__ == "__main__":
if len(sys.argv) < 4:
print("You need to give the GFA file then the k-mer length then the GAM file as arguments")
sys.exit(1)
start = time.time()
print("reading nodes...")
nodes, bubble_chains = read_gfa(sys.argv[1], modified=True)
end = time.time()
print("took {} seconds".format(end - start))
print("reading gam file and building readsets...")
chains = build_reads(nodes, sys.argv[3])
end = time.time()
print("took {} seconds".format(end - start))
#
print("filtering read sets...")
for chain_n, chain in chains.items():
reads_to_remove = []
for read_name, read in chain.items():
if len(read) < 2: # read covered just one bubble or none we remove it
reads_to_remove.append(read_name)
for r in reads_to_remove:
del chain[r]
# reads_to_break_tie = []
# for read_key, read in reads.items():
# return_maximal(read)
# bubble_chain_length(nodes, bubble_chains, int(sys.argv[2]), read)
# if len(read) > 1:
# # if there's still a tie, I'll just take the first chain it belongs too
# reads_to_break_tie.append(read_key)
# elif len(read) == 0:
# reads_to_remove.append(read_key)
#
# for k in reads_to_break_tie:
# reads[k] = {list(reads[k].keys())[0]: reads[k][list(reads[k].keys())[0]]}
#
# reads_branching_dist = dict()
#
# for read in reads.values():
#
# if len(read) not in reads_branching_dist:
# reads_branching_dist[len(read)] = 1
# else:
# reads_branching_dist[len(read)] += 1
#
# for k in reads_to_remove:
# del reads[k]
# end = time.time()
# print("took {} seconds".format(end - start))
print("dumping pickled reads...")
dump_pickled_dict(chains)
end = time.time()
print("took {} seconds".format(end - start))