This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
thesis_tm/phasing_bubbles.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
151 lines (116 sloc)
5.13 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# todo this script only needs to read the modified GFA and make a nodes dictionary for constant access | |
# then reads the gam file and uses the nodes dicitonary to fill in read sets (simple) | |
import sys | |
import os | |
from vg_pb2 import Alignment | |
import stream | |
from optparse import OptionParser | |
class Node: | |
def __init__(self, identifier): | |
self.id = identifier # size is between 28 and 32 bytes | |
self.seq = "" | |
self.seq_len = 0 # mostly 28 bytes | |
self.start = [] # 96 bytes for 4 neighbors | |
self.end = [] # 96 bytes | |
self.visited = False # 28 bytes (used for bubble and superbubble detection) | |
self.which_chain = 0 | |
self.which_sb = 0 | |
self.which_b = 0 | |
self.which_allele = -1 | |
parser = OptionParser() | |
parser.add_option("-g", "--in_gfa", action="store", dest="in_graph", default=None, type=str, | |
help="Give the modified GFA file path here") | |
parser.add_option("-a", "--in_gam", action="store", dest="k_mer", default=None, type=str, | |
help="Give the GAM alignment file path here") | |
def read_gfa(gfa_file_path, modified=False): | |
""" | |
:param gfa_file_path: gfa graph file. | |
:param modified: if I'm reading my modified GFA with extra information for the nodes | |
:return: Dictionary of node ids and Node objects. | |
""" | |
if not os.path.exists(gfa_file_path): | |
print("the gfa file path you gave does not exists, please try again!") | |
sys.exit() | |
nodes = dict() | |
edges = [] | |
with open(gfa_file_path, "r") as lines: | |
for line in lines: | |
if line.startswith("S"): | |
if modified: | |
line = line.split("\t") | |
n_id = int(line[1]) | |
n_len = len(line[2]) | |
nodes[n_id] = Node(n_id) | |
nodes[n_id].seq_len = n_len | |
nodes[n_id].seq = str(line[2]) | |
specifications = str(line[3]) | |
print(specifications) | |
specifications = specifications.split(":") | |
nodes[n_id].which_chain = int(specifications[0]) | |
nodes[n_id].which_sb = int(specifications[1]) | |
nodes[n_id].which_b = int(specifications[2]) | |
nodes[n_id].which_allele = int(specifications[3]) | |
else: | |
line = line.split() | |
n_id = int(line[1]) | |
n_len = len(line[2]) | |
nodes[n_id] = Node(n_id) | |
nodes[n_id].seq_len = n_len | |
nodes[n_id].seq = str(line[2]) | |
elif line.startswith("L"): | |
edges.append(line) | |
for e in edges: | |
line = e.split() | |
k = int(line[1]) | |
neighbor = int(line[3]) | |
if line[2] == "-": | |
from_start = True | |
else: | |
from_start = False | |
if line[4] == "-": | |
to_end = True | |
else: | |
to_end = False | |
if from_start is True and to_end is True: # from start to end L x - y - | |
if (neighbor, 1) not in nodes[k].start: | |
nodes[k].start.append((neighbor, 1)) | |
if (k, 0) not in nodes[neighbor].end: | |
nodes[neighbor].end.append((k, 0)) | |
elif from_start is True and to_end is False: # from start to start L x - y + | |
if (neighbor, 0) not in nodes[k].start: | |
nodes[k].start.append((neighbor, 0)) | |
if (k, 0) not in nodes[neighbor].start: | |
nodes[neighbor].start.append((k, 0)) | |
elif from_start is False and to_end is False: # from end to start L x + y + | |
if (neighbor, 0) not in nodes[k].end: | |
nodes[k].end.append((neighbor, 0)) | |
if (k, 1) not in nodes[neighbor].start: | |
nodes[neighbor].start.append((k, 1)) | |
elif from_start is False and to_end is True: # from end to end L x + y - | |
if (neighbor, 1) not in nodes[k].end: | |
nodes[k].end.append((neighbor, 1)) | |
if (k, 1) not in nodes[neighbor].end: | |
nodes[neighbor].end.append((k, 1)) | |
return nodes | |
def read_process_gam(nodes, gam_file_path): | |
read_sets = {} | |
with stream.open(str(gam_file_path), "rb") as instream: | |
for data in instream: | |
g = Alignment() | |
g.ParseFromString(data) | |
# construct read object | |
read = Read(g.name) # leave other values as default | |
for m in g.path.mapping: | |
n_id = int(m.position.node_id) | |
if n_id not in nodes: | |
continue | |
if nodes[n_id].which_chain not in read_sets: | |
read_sets[nodes[n_id].which_chain] = ReadSet() | |
if 0 >= nodes[n_id].which_allel <= 1: # otherwise it's an end node or a superbubble node | |
# bubbles are already sorted | |
read.add_variant(nodes[n_id].which_b, nodes[n_id].which_allele, 30) | |
read_sets[nodes[n_id].which_chain].add_read(read) | |
return read_sets | |
nodes = read_gfa(sys.argv[1], modified=True) | |
gam_file = sys.argv[2] | |
all_read_sets = read_process_gam(nodes, gam_file) |