phasing_bubbles.py

# todo this script only needs to read the modified GFA and make a nodes dictionary for constant access
#  then reads the gam file and uses the nodes dicitonary to fill in read sets (simple)

import sys
import os
from vg_pb2 import Alignment
import stream
from optparse import OptionParser


class Node:
    def __init__(self, identifier):
        self.id = identifier  # size is between 28 and 32 bytes
        self.seq = ""
        self.seq_len = 0  # mostly 28 bytes
        self.start = []  # 96 bytes for 4 neighbors
        self.end = []  # 96 bytes
        self.visited = False  # 28 bytes (used for bubble and superbubble detection)
        self.which_chain = 0
        self.which_sb = 0
        self.which_b = 0
        self.which_allele = -1


parser = OptionParser()

parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str,
                  help="Give the modified GFA file path here")

parser.add_option("-a", "--in_gam", action="store", dest="k_mer", default=None, type=str,
                  help="Give the GAM alignment file path here")


def read_gfa(gfa_file_path, modified=False):
    """
    :param gfa_file_path: gfa graph file.
    :param modified: if I'm reading my modified GFA with extra information for the nodes
    :return: Dictionary of node ids and Node objects.
    """
    if not os.path.exists(gfa_file_path):
        print("the gfa file path you gave does not exists, please try again!")
        sys.exit()

    nodes = dict()
    edges = []
    with open(gfa_file_path, "r") as lines:
        for line in lines:
            if line.startswith("S"):
                if modified:
                    line = line.split("\t")
                    n_id = int(line[1])
                    n_len = len(line[2])
                    nodes[n_id] = Node(n_id)
                    nodes[n_id].seq_len = n_len
                    nodes[n_id].seq = str(line[2])
                    specifications = str(line[3])
                    print(specifications)
                    specifications = specifications.split(":")
                    nodes[n_id].which_chain = int(specifications[0])
                    nodes[n_id].which_sb = int(specifications[1])
                    nodes[n_id].which_b = int(specifications[2])
                    nodes[n_id].which_allele = int(specifications[3])

                else:

                    line = line.split()
                    n_id = int(line[1])
                    n_len = len(line[2])
                    nodes[n_id] = Node(n_id)
                    nodes[n_id].seq_len = n_len
                    nodes[n_id].seq = str(line[2])

            elif line.startswith("L"):
                edges.append(line)

    for e in edges:
        line = e.split()

        k = int(line[1])
        neighbor = int(line[3])
        if line[2] == "-":
            from_start = True
        else:
            from_start = False

        if line[4] == "-":
            to_end = True
        else:
            to_end = False

        if from_start is True and to_end is True:  # from start to end L x - y -
            if (neighbor, 1) not in nodes[k].start:
                nodes[k].start.append((neighbor, 1))
            if (k, 0) not in nodes[neighbor].end:
                nodes[neighbor].end.append((k, 0))

        elif from_start is True and to_end is False:  # from start to start L x - y +

            if (neighbor, 0) not in nodes[k].start:
                nodes[k].start.append((neighbor, 0))

            if (k, 0) not in nodes[neighbor].start:
                nodes[neighbor].start.append((k, 0))

        elif from_start is False and to_end is False:  # from end to start L x + y +
            if (neighbor, 0) not in nodes[k].end:
                nodes[k].end.append((neighbor, 0))

            if (k, 1) not in nodes[neighbor].start:
                nodes[neighbor].start.append((k, 1))

        elif from_start is False and to_end is True:  # from end to end L x + y -
            if (neighbor, 1) not in nodes[k].end:
                nodes[k].end.append((neighbor, 1))

            if (k, 1) not in nodes[neighbor].end:
                nodes[neighbor].end.append((k, 1))

    return nodes


def read_process_gam(nodes, gam_file_path):
    read_sets = {}
    with stream.open(str(gam_file_path), "rb") as instream:
        for data in instream:
            g = Alignment()
            g.ParseFromString(data)

            # construct read object
            read = Read(g.name)  # leave other values as default

            for m in g.path.mapping:
                n_id = int(m.position.node_id)
                if n_id not in nodes:
                    continue

                if nodes[n_id].which_chain not in read_sets:
                    read_sets[nodes[n_id].which_chain] = ReadSet()

                if 0 >= nodes[n_id].which_allel <= 1:  # otherwise it's an end node or a superbubble node
                    # bubbles are already sorted
                    read.add_variant(nodes[n_id].which_b, nodes[n_id].which_allele, 30)
                    read_sets[nodes[n_id].which_chain].add_read(read)

    return read_sets


nodes = read_gfa(sys.argv[1], modified=True)
gam_file = sys.argv[2]

all_read_sets = read_process_gam(nodes, gam_file)