Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# todo this script only needs to read the modified GFA and make a nodes dictionary for constant access
# then reads the gam file and uses the nodes dicitonary to fill in read sets (simple)
import sys
import os
from vg_pb2 import Alignment
import stream
from optparse import OptionParser
class Node:
def __init__(self, identifier):
self.id = identifier # size is between 28 and 32 bytes
self.seq = ""
self.seq_len = 0 # mostly 28 bytes
self.start = [] # 96 bytes for 4 neighbors
self.end = [] # 96 bytes
self.visited = False # 28 bytes (used for bubble and superbubble detection)
self.which_chain = 0
self.which_sb = 0
self.which_b = 0
self.which_allele = -1
parser = OptionParser()
parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str,
help="Give the modified GFA file path here")
parser.add_option("-a", "--in_gam", action="store", dest="k_mer", default=None, type=str,
help="Give the GAM alignment file path here")
def read_gfa(gfa_file_path, modified=False):
"""
:param gfa_file_path: gfa graph file.
:param modified: if I'm reading my modified GFA with extra information for the nodes
:return: Dictionary of node ids and Node objects.
"""
if not os.path.exists(gfa_file_path):
print("the gfa file path you gave does not exists, please try again!")
sys.exit()
nodes = dict()
edges = []
with open(gfa_file_path, "r") as lines:
for line in lines:
if line.startswith("S"):
if modified:
line = line.split("\t")
n_id = int(line[1])
n_len = len(line[2])
nodes[n_id] = Node(n_id)
nodes[n_id].seq_len = n_len
nodes[n_id].seq = str(line[2])
specifications = str(line[3])
print(specifications)
specifications = specifications.split(":")
nodes[n_id].which_chain = int(specifications[0])
nodes[n_id].which_sb = int(specifications[1])
nodes[n_id].which_b = int(specifications[2])
nodes[n_id].which_allele = int(specifications[3])
else:
line = line.split()
n_id = int(line[1])
n_len = len(line[2])
nodes[n_id] = Node(n_id)
nodes[n_id].seq_len = n_len
nodes[n_id].seq = str(line[2])
elif line.startswith("L"):
edges.append(line)
for e in edges:
line = e.split()
k = int(line[1])
neighbor = int(line[3])
if line[2] == "-":
from_start = True
else:
from_start = False
if line[4] == "-":
to_end = True
else:
to_end = False
if from_start is True and to_end is True: # from start to end L x - y -
if (neighbor, 1) not in nodes[k].start:
nodes[k].start.append((neighbor, 1))
if (k, 0) not in nodes[neighbor].end:
nodes[neighbor].end.append((k, 0))
elif from_start is True and to_end is False: # from start to start L x - y +
if (neighbor, 0) not in nodes[k].start:
nodes[k].start.append((neighbor, 0))
if (k, 0) not in nodes[neighbor].start:
nodes[neighbor].start.append((k, 0))
elif from_start is False and to_end is False: # from end to start L x + y +
if (neighbor, 0) not in nodes[k].end:
nodes[k].end.append((neighbor, 0))
if (k, 1) not in nodes[neighbor].start:
nodes[neighbor].start.append((k, 1))
elif from_start is False and to_end is True: # from end to end L x + y -
if (neighbor, 1) not in nodes[k].end:
nodes[k].end.append((neighbor, 1))
if (k, 1) not in nodes[neighbor].end:
nodes[neighbor].end.append((k, 1))
return nodes
def read_process_gam(nodes, gam_file_path):
read_sets = {}
with stream.open(str(gam_file_path), "rb") as instream:
for data in instream:
g = Alignment()
g.ParseFromString(data)
# construct read object
read = Read(g.name) # leave other values as default
for m in g.path.mapping:
n_id = int(m.position.node_id)
if n_id not in nodes:
continue
if nodes[n_id].which_chain not in read_sets:
read_sets[nodes[n_id].which_chain] = ReadSet()
if 0 >= nodes[n_id].which_allel <= 1: # otherwise it's an end node or a superbubble node
# bubbles are already sorted
read.add_variant(nodes[n_id].which_b, nodes[n_id].which_allele, 30)
read_sets[nodes[n_id].which_chain].add_read(read)
return read_sets
nodes = read_gfa(sys.argv[1], modified=True)
gam_file = sys.argv[2]
all_read_sets = read_process_gam(nodes, gam_file)