output_aligned_reads_dict.py

import os
import sys
import stream
import pickle
import time
from vg_pb2 import Alignment


class Node:
    def __init__(self, identifier):
        self.id = identifier
        self.seq = ""
        self.seq_len = 0
        self.start = []
        self.end = []
        self.visited = False
        self.which_chain = 0
        self.which_sb = 0
        self.which_b = 0
        self.which_allele = -1


def read_gfa(gfa_file_path, modified=False):
    """
    :param gfa_file_path: gfa graph file.
    :param modified: if I'm reading my modified GFA with extra information for the nodes
    :return: Dictionary of node ids and Node objects.
    """
    if not os.path.exists(gfa_file_path):
        print("the gfa file path you gave does not exists, please try again!")
        sys.exit()

    bubble_chains = dict()
    nodes = dict()
    edges = []
    with open(gfa_file_path, "r") as lines:
        for line in lines:
            if line.startswith("S"):
                if modified:
                    line = line.split("\t")
                    n_id = int(line[1])
                    nodes[n_id] = Node(n_id)
                    nodes[n_id].seq_len = len(line[2])
                    nodes[n_id].seq = str(line[2])
                    # this is my extra columns
                    # constructed as which_chain:which_sb:which_b:which_allele
                    # e.g. 5:0:3:1 (chain 5, not part of a superbubble, bubble 3, allele 1)
                    specifications = str(line[3])
                    specifications = specifications.split(":")
                    nodes[n_id].which_chain = int(specifications[0])
                    nodes[n_id].which_sb = int(specifications[1])
                    nodes[n_id].which_b = int(specifications[2])
                    nodes[n_id].which_allele = int(specifications[3])

                    if nodes[n_id].which_chain not in bubble_chains:
                        bubble_chains[nodes[n_id].which_chain] = {nodes[n_id].which_b: [n_id]}
                    else:
                        if nodes[n_id].which_b not in bubble_chains[nodes[n_id].which_chain]:
                            bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b] = [n_id]
                        else:
                            bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b].append(n_id)

                else:

                    line = line.split()
                    n_id = int(line[1])
                    n_len = len(line[2])
                    nodes[n_id] = Node(n_id)
                    nodes[n_id].seq_len = n_len
                    nodes[n_id].seq = str(line[2])

            elif line.startswith("L"):
                continue

    # for e in edges:
    #     line = e.split()
    #
    #     k = int(line[1])
    #     neighbor = int(line[3])
    #     if (k not in nodes) or (neighbor not in nodes):
    #         continue
    #
    #     if line[2] == "-":
    #         from_start = True
    #     else:
    #         from_start = False
    #
    #     if line[4] == "-":
    #         to_end = True
    #     else:
    #         to_end = False
    #
    #     if from_start is True and to_end is True:  # from start to end L x - y -
    #         if (neighbor, 1) not in nodes[k].start:
    #             nodes[k].start.append((neighbor, 1))
    #         if (k, 0) not in nodes[neighbor].end:
    #             nodes[neighbor].end.append((k, 0))
    #
    #     elif from_start is True and to_end is False:  # from start to start L x - y +
    #
    #         if (neighbor, 0) not in nodes[k].start:
    #             nodes[k].start.append((neighbor, 0))
    #
    #         if (k, 0) not in nodes[neighbor].start:
    #             nodes[neighbor].start.append((k, 0))
    #
    #     elif from_start is False and to_end is False:  # from end to start L x + y +
    #         if (neighbor, 0) not in nodes[k].end:
    #             nodes[k].end.append((neighbor, 0))
    #
    #         if (k, 1) not in nodes[neighbor].start:
    #             nodes[neighbor].start.append((k, 1))
    #
    #     elif from_start is False and to_end is True:  # from end to end L x + y -
    #         if (neighbor, 1) not in nodes[k].end:
    #             nodes[k].end.append((neighbor, 1))
    #
    #         if (k, 1) not in nodes[neighbor].end:
    #             nodes[neighbor].end.append((k, 1))

    return nodes, bubble_chains


def build_reads(nodes, gam_file_path):
    """
    :param nodes: dictionary of nodes objects
    :param gam_file_path: gam file path
    :return: returns a dictionary of readsets, where each read set object represent a bubble chain
    """

    with stream.open(str(gam_file_path), "rb") as instream:
        chains = dict()
        counter = 0
        for data in instream:
            counter += 1
            if (counter % 1500000) == 0:
                print("Processed {} reads".format(counter))
            g = Alignment()
            g.ParseFromString(data)

            # construct read object
            # read = Read(g.name)  # leave other values as default
            # if g.name not in reads:
            #     reads[g.name] = dict()

            for m in g.path.mapping:

                n_id = int(m.position.node_id)

                if n_id not in nodes:  # in case one of the nodes wasn't in the graph
                    continue

                if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1):
                    if nodes[n_id].which_chain not in chains:
                        chains[nodes[n_id].which_chain] = dict()
                        chains[nodes[n_id].which_chain][g.name] = [n_id]
                    else:
                        if g.name not in chains[nodes[n_id].which_chain]:
                            chains[nodes[n_id].which_chain][g.name] = [n_id]
                        else:
                            chains[nodes[n_id].which_chain][g.name].append(n_id)
                # if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1):
                #     if nodes[n_id].which_chain in reads[g.name]:
                #         reads[g.name][nodes[n_id].which_chain].append(n_id)
                #     else:
                #         reads[g.name][nodes[n_id].which_chain] = [n_id]

    return chains


def dump_pickled_dict(reads, file_name='aligned_reads_dict.pickle'):
    with open(file_name, 'wb') as handle:
        pickle.dump(reads, handle, protocol=pickle.HIGHEST_PROTOCOL)


def read_pickled_dict(file_name="aligned_reads_dict.pickle"):
    with open(file_name, 'rb') as handle:
        reads = pickle.load(handle)
    return reads


def bubble_chain_length(nodes, bubble_chains, k, read):

    all_lengths = dict()
    maxim = 0
    for chain in read.keys():
        all_lengths[chain] = 0
        list_of_bubbles = set()
        for node in read[chain]:
            list_of_bubbles.add(nodes[node].which_b)
        for b in list_of_bubbles:
            for n in bubble_chains[chain][b]:
                all_lengths[chain] += nodes[n].seq_len - k
        if all_lengths[chain] > maxim:
            maxim = all_lengths[chain]
    for k in all_lengths.keys():
        if all_lengths[k] < maxim:
            del read[k]

    return read


def return_maximal(read):

    all_lengths = {k: len(v) for (k, v) in read.items()}
    maxim = 0
    for v in all_lengths.values():
        if maxim < v:
            maxim = v
    for k in all_lengths.keys():
        if all_lengths[k] < maxim:
            del read[k]
    return read


###############################################################################
if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("You need to give the GFA file then the k-mer length then the GAM file as arguments")
        sys.exit(1)

    start = time.time()

    print("reading nodes...")
    nodes, bubble_chains = read_gfa(sys.argv[1], modified=True)
    end = time.time()
    print("took {} seconds".format(end - start))
    print("reading gam file and building readsets...")
    chains = build_reads(nodes, sys.argv[3])
    end = time.time()
    print("took {} seconds".format(end - start))

    #
    print("filtering read sets...")
    for chain_n, chain in chains.items():
        reads_to_remove = []
        for read_name, read in chain.items():
            if len(read) < 2:  # read covered just one bubble or none we remove it
                reads_to_remove.append(read_name)
        for r in reads_to_remove:
            del chain[r]
    # reads_to_break_tie = []
    # for read_key, read in reads.items():
    #     return_maximal(read)
    #     bubble_chain_length(nodes, bubble_chains, int(sys.argv[2]), read)
    #     if len(read) > 1:
    #         # if there's still a tie, I'll just take the first chain it belongs too
    #         reads_to_break_tie.append(read_key)
    #     elif len(read) == 0:
    #         reads_to_remove.append(read_key)
    #
    # for k in reads_to_break_tie:
    #     reads[k] = {list(reads[k].keys())[0]: reads[k][list(reads[k].keys())[0]]}
    #
    # reads_branching_dist = dict()
    #
    # for read in reads.values():
    #
    #     if len(read) not in reads_branching_dist:
    #         reads_branching_dist[len(read)] = 1
    #     else:
    #         reads_branching_dist[len(read)] += 1
    #
    # for k in reads_to_remove:
    #     del reads[k]
    # end = time.time()
    # print("took {} seconds".format(end - start))

    print("dumping pickled reads...")
    dump_pickled_dict(chains)
    end = time.time()
    print("took {} seconds".format(end - start))
	import os
	import sys
	import stream
	import pickle
	import time
	from vg_pb2 import Alignment


	class Node:
	def __init__(self, identifier):
	self.id = identifier
	self.seq = ""
	self.seq_len = 0
	self.start = []
	self.end = []
	self.visited = False
	self.which_chain = 0
	self.which_sb = 0
	self.which_b = 0
	self.which_allele = -1


	def read_gfa(gfa_file_path, modified=False):
	"""
	:param gfa_file_path: gfa graph file.
	:param modified: if I'm reading my modified GFA with extra information for the nodes
	:return: Dictionary of node ids and Node objects.
	"""
	if not os.path.exists(gfa_file_path):
	print("the gfa file path you gave does not exists, please try again!")
	sys.exit()

	bubble_chains = dict()
	nodes = dict()
	edges = []
	with open(gfa_file_path, "r") as lines:
	for line in lines:
	if line.startswith("S"):
	if modified:
	line = line.split("\t")
	n_id = int(line[1])
	nodes[n_id] = Node(n_id)
	nodes[n_id].seq_len = len(line[2])
	nodes[n_id].seq = str(line[2])
	# this is my extra columns
	# constructed as which_chain:which_sb:which_b:which_allele
	# e.g. 5:0:3:1 (chain 5, not part of a superbubble, bubble 3, allele 1)
	specifications = str(line[3])
	specifications = specifications.split(":")
	nodes[n_id].which_chain = int(specifications[0])
	nodes[n_id].which_sb = int(specifications[1])
	nodes[n_id].which_b = int(specifications[2])
	nodes[n_id].which_allele = int(specifications[3])

	if nodes[n_id].which_chain not in bubble_chains:
	bubble_chains[nodes[n_id].which_chain] = {nodes[n_id].which_b: [n_id]}
	else:
	if nodes[n_id].which_b not in bubble_chains[nodes[n_id].which_chain]:
	bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b] = [n_id]
	else:
	bubble_chains[nodes[n_id].which_chain][nodes[n_id].which_b].append(n_id)

	else:

	line = line.split()
	n_id = int(line[1])
	n_len = len(line[2])
	nodes[n_id] = Node(n_id)
	nodes[n_id].seq_len = n_len
	nodes[n_id].seq = str(line[2])

	elif line.startswith("L"):
	continue

	# for e in edges:
	# line = e.split()
	#
	# k = int(line[1])
	# neighbor = int(line[3])
	# if (k not in nodes) or (neighbor not in nodes):
	# continue
	#
	# if line[2] == "-":
	# from_start = True
	# else:
	# from_start = False
	#
	# if line[4] == "-":
	# to_end = True
	# else:
	# to_end = False
	#
	# if from_start is True and to_end is True: # from start to end L x - y -
	# if (neighbor, 1) not in nodes[k].start:
	# nodes[k].start.append((neighbor, 1))
	# if (k, 0) not in nodes[neighbor].end:
	# nodes[neighbor].end.append((k, 0))
	#
	# elif from_start is True and to_end is False: # from start to start L x - y +
	#
	# if (neighbor, 0) not in nodes[k].start:
	# nodes[k].start.append((neighbor, 0))
	#
	# if (k, 0) not in nodes[neighbor].start:
	# nodes[neighbor].start.append((k, 0))
	#
	# elif from_start is False and to_end is False: # from end to start L x + y +
	# if (neighbor, 0) not in nodes[k].end:
	# nodes[k].end.append((neighbor, 0))
	#
	# if (k, 1) not in nodes[neighbor].start:
	# nodes[neighbor].start.append((k, 1))
	#
	# elif from_start is False and to_end is True: # from end to end L x + y -
	# if (neighbor, 1) not in nodes[k].end:
	# nodes[k].end.append((neighbor, 1))
	#
	# if (k, 1) not in nodes[neighbor].end:
	# nodes[neighbor].end.append((k, 1))

	return nodes, bubble_chains


	def build_reads(nodes, gam_file_path):
	"""
	:param nodes: dictionary of nodes objects
	:param gam_file_path: gam file path
	:return: returns a dictionary of readsets, where each read set object represent a bubble chain
	"""

	with stream.open(str(gam_file_path), "rb") as instream:
	chains = dict()
	counter = 0
	for data in instream:
	counter += 1
	if (counter % 1500000) == 0:
	print("Processed {} reads".format(counter))
	g = Alignment()
	g.ParseFromString(data)

	# construct read object
	# read = Read(g.name) # leave other values as default
	# if g.name not in reads:
	# reads[g.name] = dict()

	for m in g.path.mapping:

	n_id = int(m.position.node_id)

	if n_id not in nodes: # in case one of the nodes wasn't in the graph
	continue

	if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1):
	if nodes[n_id].which_chain not in chains:
	chains[nodes[n_id].which_chain] = dict()
	chains[nodes[n_id].which_chain][g.name] = [n_id]
	else:
	if g.name not in chains[nodes[n_id].which_chain]:
	chains[nodes[n_id].which_chain][g.name] = [n_id]
	else:
	chains[nodes[n_id].which_chain][g.name].append(n_id)
	# if (int(nodes[n_id].which_allele) == 0) or (int(nodes[n_id].which_allele) == 1):
	# if nodes[n_id].which_chain in reads[g.name]:
	# reads[g.name][nodes[n_id].which_chain].append(n_id)
	# else:
	# reads[g.name][nodes[n_id].which_chain] = [n_id]

	return chains


	def dump_pickled_dict(reads, file_name='aligned_reads_dict.pickle'):
	with open(file_name, 'wb') as handle:
	pickle.dump(reads, handle, protocol=pickle.HIGHEST_PROTOCOL)


	def read_pickled_dict(file_name="aligned_reads_dict.pickle"):
	with open(file_name, 'rb') as handle:
	reads = pickle.load(handle)
	return reads


	def bubble_chain_length(nodes, bubble_chains, k, read):

	all_lengths = dict()
	maxim = 0
	for chain in read.keys():
	all_lengths[chain] = 0
	list_of_bubbles = set()
	for node in read[chain]:
	list_of_bubbles.add(nodes[node].which_b)
	for b in list_of_bubbles:
	for n in bubble_chains[chain][b]:
	all_lengths[chain] += nodes[n].seq_len - k
	if all_lengths[chain] > maxim:
	maxim = all_lengths[chain]
	for k in all_lengths.keys():
	if all_lengths[k] < maxim:
	del read[k]

	return read


	def return_maximal(read):

	all_lengths = {k: len(v) for (k, v) in read.items()}
	maxim = 0
	for v in all_lengths.values():
	if maxim < v:
	maxim = v
	for k in all_lengths.keys():
	if all_lengths[k] < maxim:
	del read[k]
	return read


	###############################################################################
	if __name__ == "__main__":
	if len(sys.argv) < 4:
	print("You need to give the GFA file then the k-mer length then the GAM file as arguments")
	sys.exit(1)

	start = time.time()

	print("reading nodes...")
	nodes, bubble_chains = read_gfa(sys.argv[1], modified=True)
	end = time.time()
	print("took {} seconds".format(end - start))
	print("reading gam file and building readsets...")
	chains = build_reads(nodes, sys.argv[3])
	end = time.time()
	print("took {} seconds".format(end - start))

	#
	print("filtering read sets...")
	for chain_n, chain in chains.items():
	reads_to_remove = []
	for read_name, read in chain.items():
	if len(read) < 2: # read covered just one bubble or none we remove it
	reads_to_remove.append(read_name)
	for r in reads_to_remove:
	del chain[r]
	# reads_to_break_tie = []
	# for read_key, read in reads.items():
	# return_maximal(read)
	# bubble_chain_length(nodes, bubble_chains, int(sys.argv[2]), read)
	# if len(read) > 1:
	# # if there's still a tie, I'll just take the first chain it belongs too
	# reads_to_break_tie.append(read_key)
	# elif len(read) == 0:
	# reads_to_remove.append(read_key)
	#
	# for k in reads_to_break_tie:
	# reads[k] = {list(reads[k].keys())[0]: reads[k][list(reads[k].keys())[0]]}
	#
	# reads_branching_dist = dict()
	#
	# for read in reads.values():
	#
	# if len(read) not in reads_branching_dist:
	# reads_branching_dist[len(read)] = 1
	# else:
	# reads_branching_dist[len(read)] += 1
	#
	# for k in reads_to_remove:
	# del reads[k]
	# end = time.time()
	# print("took {} seconds".format(end - start))

	print("dumping pickled reads...")
	dump_pickled_dict(chains)
	end = time.time()
	print("took {} seconds".format(end - start))