Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
sv-conflict-analysis/conflict_positions.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
121 lines (103 sloc)
4.08 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from enum import Enum
from typing import Tuple

from pyfaidx import Fasta
class ConfMode(Enum):
    """
    Input formats understood by get_conflict_position.
    """
    # Comma-separated line whose first three fields are chrom, start, end.
    CSV = 0
    # Iterable of underscore-separated SV identifier strings.
    JSON = 1
    # Plain region string such as 'chr12:12314-12412'.
    STRING = 2
class Region():
    """
    A class that saves the chromosome, start and end positions of a genomic region.

    A Region is built either from its three components
    (``Region("chr9", 123, 456)``) or from a single region string
    (``Region("chr9:123-456")``). The string form is selected when ``start``
    is left at its default of -1.
    """

    def __init__(self, s: str, start: int = -1, end: int = -1) -> None:
        if start == -1:
            # No explicit coordinates were given: treat `s` as a region string.
            self.chrom, self.start, self.end = Region.from_str(s).vals()
        else:
            self.chrom = s
            self.start = int(start)
            self.end = int(end)

    def vals(self) -> Tuple[str, int, int]:
        """ Returns all three information points as tuple. """
        return self.chrom, self.start, self.end

    def coordinates(self) -> Tuple[int, int]:
        """ Returns the start and end points as tuple. """
        return (self.start, self.end)

    def get_reference_seq(self, ref: "Fasta"):
        """
        Returns the sequence inside the Region of a given reference.

        The reference is indexed by chromosome name and then sliced with
        ``[start:end]``.
        NOTE(review): this is a plain Python slice of start/end; whether the
        stored coordinates are meant to be 0- or 1-based is not visible
        here -- confirm against the callers.
        """
        return ref[self.chrom][self.start:self.end]

    def with_padding(self, padding=-1):
        """
        Returns a Region object with a padding to both sides of self.
        If the padding is not chosen, it is set to a fifth of the region's size.
        The padded start is clamped so that it never drops below 0.
        """
        if padding == -1:
            # Call the dunder directly: len() would raise for a negative
            # result when end < start, whereas __len__() just returns it.
            padding = self.__len__() // 5  # make the padding depending on the region
        return Region(self.chrom, max(self.start - padding, 0), self.end + padding)

    def __len__(self) -> int:
        """ Returns the distance between start and end of the Region. """
        return self.end - self.start

    def __str__(self) -> str:
        """
        format region info as string
        If one of the values is None, returns 'INVALID REGION'
        """
        if None in [self.chrom, self.start, self.end]:
            return "INVALID REGION"
        return self.__format__()

    def __repr__(self) -> str:
        """ Returns an unambiguous representation for debugging. """
        return f"Region({self.chrom!r}, {self.start!r}, {self.end!r})"

    def __format__(self, f: str = "") -> str:
        """
        Format the Region object as string.
        If the format string 'f' is empty, ':-' is used.
        If the format string 'f' is of length 1, it is put between chrom, start
        and end.
        If it is of length 2, the first character of f is put between chrom and start
        and f[1] is put between start and end.
        Any longer format string prints a warning and falls back to str(self).
        Examples:
        __format__("_") --> chr9_123_456
        __format__(":-") --> chr9:123-456
        """
        if f == "":
            f = ":-"
        if len(f) == 1:
            return f"{self.chrom}{f}{self.start}{f}{self.end}"
        elif len(f) == 2:
            return f"{self.chrom}{f[0]}{self.start}{f[1]}{self.end}"
        else:
            print("Format string not viable!")
            return str(self)

    @staticmethod
    def from_str(s):
        """
        splits a region string into chromosome string and start and end ints.
        The coordinates can be comma separated and still be parsed.
        Only the last ':' separates the chromosome from the coordinates, so
        chromosome names that themselves contain ':' are kept intact.
        Example: 'chr12:12314-12412' --> ('chr12', 12314, 12412)

        Raises IndexError if the string contains no ':' and ValueError if the
        coordinate part is not exactly two integers joined by '-'.
        """
        info = s.rsplit(":", 1)
        chrom = info[0]
        start, end = (int(x.replace(',', '')) for x in info[1].split("-"))
        return Region(chrom, start, end)

    def copy(self):
        """ Returns a copy of self. """
        return Region(self.chrom, self.start, self.end)
def get_conflict_position(info, conf_mode=ConfMode.JSON) -> Region:
    """
    Returns the Region of a given conflict. The processing depends on the
    conflict mode used.

    ConfMode.JSON:   `info` is an iterable of underscore-separated SV strings.
                     The Region spans the smallest start to the largest end
                     over all SVs; the chromosome is taken from the first SV.
    ConfMode.CSV:    `info` is a comma-separated string whose first three
                     fields are chromosome, start and end.
    any other mode:  `info` is handled as a region string and parsed by
                     Region.from_str (this is the ConfMode.STRING case).

    Raises ValueError if `info` contains no SVs in JSON mode.
    """
    if conf_mode == ConfMode.JSON:
        # An SV string splits into e.g.
        # ["svim", "DEL", "-133", "232424", "323242", "chr18" ...]
        # thus, we need field 3 and 4 for start and stop, and field 5 for chromosome
        start_field, end_field, chrom_field = 3, 4, 5
        positions = []
        ends = []
        chrom = None
        for sv in info:
            # Use a separate name so the `info` argument is not rebound
            # while it is being iterated.
            fields = sv.split("_")
            positions.append(int(fields[start_field]))
            ends.append(int(fields[end_field]))
            if chrom is None:
                chrom = fields[chrom_field]
        if not positions:
            raise ValueError("Cannot determine a conflict position from an empty list of SVs")
        return Region(chrom, min(positions), max(ends))
    elif conf_mode == ConfMode.CSV:
        chrom, start, end = info.split(",")[:3]
        return Region(chrom, int(start), int(end))
    else:  # conf_mode == ConfMode.STRING:
        return Region.from_str(info)