Skip to content
Permalink
b7c80c8559
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
1759 lines (1543 sloc) 80.6 KB
#!/home/basti/miniconda3/envs/mpi-project/bin/python
'''
---------------------------------------------------------------------------------------
CrossMap: lift over genomic coordinates between genome assemblies.
Supports BED/BedGraph, GFF/GTF, BAM/SAM, BigWig/Wig, VCF format files.
------------------------------------------------------------------------------------------
'''
import os,sys
import optparse
import collections
import subprocess
import string
from textwrap import wrap
from time import strftime
import pyBigWig
import pysam
from bx.intervals import *
import numpy as np
import datetime
from cmmodule import ireader
from cmmodule import BED
from cmmodule import annoGene
from cmmodule import sam_header
from cmmodule import myutils
from cmmodule import bgrMerge
__author__ = "Liguo Wang, Hao Zhao"
__contributor__="Liguo Wang, Hao Zhao"
__copyright__ = "Copyleft"
__credits__ = []
__license__ = "GPLv2"
__version__="0.3.1"
__maintainer__ = "Liguo Wang"
__email__ = "wangliguo78@gmail.com"
__status__ = "Production"
def printlog (mesg_lst):
'''
print progress into stderr
'''
if len(mesg_lst)==1:
msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg_lst[0]
else:
msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + ' '.join(mesg_lst)
print(msg, file=sys.stderr)
def parse_header( line ):
return dict( [ field.split( '=' ) for field in line.split()[1:] ] )
def revcomp_DNA(dna):
'''
reverse complement of input DNA sequence.
'''
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A','N':'N','X':'X'}
seq = dna.upper()
return ''.join(complement[base] for base in reversed(seq))
def wiggleReader( f ):
'''
Iterator yielding chrom, start, end, strand, value.
Values are zero-based, half-open.
Regions which lack scores are ignored.
'''
current_chrom = None
current_pos = None
current_step = None
# always for wiggle data
strand = '+'
mode = "bed"
for line in ireader.reader(f):
if line.isspace() or line.startswith( "track" ) or line.startswith( "#" ) or line.startswith( "browser" ):
continue
elif line.startswith( "variableStep" ):
header = parse_header( line )
current_chrom = header['chrom']
current_pos = None
current_step = None
if 'span' in header: current_span = int( header['span'] )
else: current_span = 1
mode = "variableStep"
elif line.startswith( "fixedStep" ):
header = parse_header( line )
current_chrom = header['chrom']
current_pos = int( header['start'] ) - 1
current_step = int( header['step'] )
if 'span' in header: current_span = int( header['span'] )
else: current_span = 1
mode = "fixedStep"
elif mode == "bed":
fields = line.split()
if len( fields ) > 3:
if len( fields ) > 5:
yield fields[0], int( fields[1] ), int( fields[2] ), fields[5], float( fields[3] )
else:
yield fields[0], int( fields[1] ), int( fields[2] ), strand, float( fields[3] )
elif mode == "variableStep":
fields = line.split()
pos = int( fields[0] ) - 1
yield current_chrom, pos, pos + current_span, strand, float( fields[1] )
elif mode == "fixedStep":
yield current_chrom, current_pos, current_pos + current_span, strand, float( line.split()[0] )
current_pos += current_step
else:
raise "Unexpected input line: %s" % line.strip()
def bigwigReader(infile, bin_size = 2000):
'''
infile: bigwig format file
chromsize: chrom_name: size.
return: chrom, position (0-based), value
'''
bw = pyBigWig.open(infile)
chrom_sizes = bw.chroms()
#print (chrom_sizes)
for chr_name, chr_size in list(chrom_sizes.items()):
for chrom, st, end in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = bin_size):
try:
a = bw.stats(chrom,st,end)
except:
continue
if bw.stats(chrom,st,end)[0] is None:
continue
sig_list = bw.values(chrom,st,end)
sig_list = np.nan_to_num(sig_list) #change 'nan' into '0.'
low_bound = st
point_num = 1
score = sig_list[0]
for value in (sig_list[1:]):
if value == score:
point_num += 1
else:
yield(( chrom, low_bound, low_bound + point_num, score))
score = value
low_bound = low_bound + point_num
point_num = 1
def check_bed12(bedline):
'''
check if bed12 format is correct or not
'''
fields = bedline.strip().split()
if len(fields) !=12:
return False
if fields[5] not in ['+','-','.']:
return False
try:
chromStart = int(fields[1])
chromEnd = int(fields[2])
thickStart = int(fields[6])
thickEnd = int(fields[7])
blockCount = int(fields[9])
blockSizes = [int(i) for i in fields[10].rstrip(',').split(',')]
blockStarts = [int(i) for i in fields[11].rstrip(',').split(',')]
except:
return False
if chromStart > chromEnd or thickStart > thickEnd:
return False
if thickStart < chromStart or thickEnd > chromEnd:
return False
if len(blockSizes) != blockCount:
return False
if len(blockStarts) != blockCount:
return False
if blockCount <1:
return False
for i in blockSizes:
if i < 0: return False
for i in blockStarts:
if i < 0: return False
return True
def intersectBed(xxx_todo_changeme, xxx_todo_changeme1):
'''
return intersection of two bed regions
'''
(chr1, st1, end1) = xxx_todo_changeme
(chr2, st2, end2) = xxx_todo_changeme1
if int(st1) > int(end1) or int(st2) > int(end2):
raise Exception ("Start cannot be larger than end")
if chr1 != chr2:
return None
if int(st1) > int(end2) or int(end1) < int(st2):
return None
return (chr1, max(st1, st2), min(end1,end2))
def read_chain_file (chain_file, print_table=False):
'''
input chain_file could be either plain text, compressed file (".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"),
or a URL pointing to the chain file ("http://", "https://", "ftp://"). If url was used, chain file must be plain text
'''
printlog(["Read chain_file: ", chain_file]),
maps={}
target_chromSize={}
source_chromSize={}
if print_table:
blocks=[]
for line in ireader.reader(chain_file):
# Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1
if not line.strip():
continue
line=line.strip()
if line.startswith(('#',' ')):continue
fields = line.split()
if fields[0] == 'chain' and len(fields) in [12, 13]:
score = int(fields[1]) # Alignment score
source_name = fields[2] # E.g. chrY
source_size = int(fields[3]) # Full length of the chromosome
source_strand = fields[4] # Must be +
if source_strand != '+':
raise Exception("Source strand in an .over.chain file must be +. (%s)" % line)
source_start = int(fields[5]) # Start of source region
source_end = int(fields[6]) # End of source region
target_name = fields[7] # E.g. chr5
target_size = int(fields[8]) # Full length of the chromosome
target_strand = fields[9] # + or -
target_start = int(fields[10])
target_end = int(fields[11])
target_chromSize[target_name]= target_size
source_chromSize[source_name] = source_size
if target_strand not in ['+', '-']:
raise Exception("Target strand must be - or +. (%s)" % line)
#if target_strand == '+':
# target_start = int(fields[10])
# target_end = int(fields[11])
#if target_strand == '-':
# target_start = target_size - target_end
# target_end = target_size - target_start
chain_id = None if len(fields) == 12 else fields[12]
if source_name not in maps:
maps[source_name] = Intersecter()
sfrom, tfrom = source_start, target_start
# Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to)
elif fields[0] != 'chain' and len(fields) == 3:
size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2])
if print_table:
if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand))
elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand))
if target_strand == '+':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand)))
elif target_strand == '-':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand)))
sfrom += size + sgap
tfrom += size + tgap
elif fields[0] != 'chain' and len(fields) == 1:
size = int(fields[0])
if print_table:
if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand))
elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand))
if target_strand == '+':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand)))
elif target_strand == '-':
maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand)))
else:
raise Exception("Invalid chain format. (%s)" % line)
if (sfrom + size) != source_end or (tfrom + size) != target_end:
raise Exception("Alignment blocks do not match specified block sizes. (%s)" % header)
if print_table:
for i in blocks:
print('\t'.join([str(n) for n in i]))
return (maps,target_chromSize, source_chromSize)
def map_coordinates(mapping, q_chr, q_start, q_end, q_strand='+', print_match=False):
'''
Map coordinates from source assembly to target assembly
'''
matches=[]
complement ={'+':'-','-':'+'}
if q_chr not in mapping:
return None
else:
targets = mapping[q_chr].find(q_start, q_end)
if len(targets)==0:
return None
elif len(targets)==1:
s_start = targets[0].start
s_end = targets[0].end
t_chrom = targets[0].value[0]
t_start = targets[0].value[1]
t_end = targets[0].value[2]
t_strand = targets[0].value[3]
(chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end))
l_offset = abs(real_start - s_start)
#r_offset = real_end - s_end
size = abs(real_end - real_start)
matches.append( (chr, real_start, real_end,q_strand))
if t_strand == '+':
i_start = t_start + l_offset
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
elif t_strand == '-':
i_start = t_end - l_offset - size
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
else:
raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand)
elif len(targets) > 1:
for t in targets:
s_start = t.start
s_end = t.end
t_chrom = t.value[0]
t_start = t.value[1]
t_end = t.value[2]
t_strand = t.value[3]
(chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end))
l_offset = abs(real_start - s_start)
#r_offset = abs(real_end - s_end)
size = abs(real_end - real_start)
matches.append( (chr, real_start, real_end,q_strand) )
if t_strand == '+':
i_start = t_start + l_offset
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
elif t_strand == '-':
i_start = t_end - l_offset - size
if q_strand == '+':
matches.append( (t_chrom, i_start, i_start + size, t_strand))
else:
matches.append( (t_chrom, i_start, i_start + size, complement[t_strand]))
else:
raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand)
if print_match:
print(matches)
# input: 'chr1',246974830,247024835
# output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')]
# [('chr1', 246974830, 246974833), ('chr1', 248908207, 248908210)]
return matches
def crossmap_vcf_file(mapping, infile,outfile, liftoverfile, refgenome):
'''
Convert genome coordinates in VCF format.
'''
#index refegenome file if it hasn't been done
if not os.path.exists(refgenome + '.fai'):
printlog(["Creating index for", refgenome])
pysam.faidx(refgenome)
refFasta = pysam.Fastafile(refgenome)
FILE_OUT = open(outfile ,'w')
UNMAP = open(outfile + '.unmap','w')
total = 0
fail = 0
withChr = False # check if the VCF data lines use 'chr1' or '1'
for line in ireader.reader(infile):
if not line.strip():
continue
line=line.strip()
#deal with meta-information lines.
#meta-information lines needed in both mapped and unmapped files
if line.startswith('##fileformat'):
print(line, file=FILE_OUT)
print(line, file=UNMAP)
elif line.startswith('##INFO'):
print(line, file=FILE_OUT)
print(line, file=UNMAP)
elif line.startswith('##FILTER'):
print(line, file=FILE_OUT)
print(line, file=UNMAP)
elif line.startswith('##FORMAT'):
print(line, file=FILE_OUT)
print(line, file=UNMAP)
elif line.startswith('##ALT'):
print(line, file=FILE_OUT)
print(line, file=UNMAP)
elif line.startswith('##SAMPLE'):
print(line, file=FILE_OUT)
print(line, file=UNMAP)
elif line.startswith('##PEDIGREE'):
print(line, file=FILE_OUT)
print(line, file=UNMAP)
#meta-information lines needed in unmapped files
elif line.startswith('##assembly'):
print(line, file=UNMAP)
elif line.startswith('##contig'):
print(line, file=UNMAP)
if 'ID=chr' in line:
withChr = True
#update contig information
elif line.startswith('#CHROM'):
printlog(["Updating contig field ... "])
target_gsize = dict(list(zip(refFasta.references, refFasta.lengths)))
for chr_id in sorted(target_gsize):
if chr_id.startswith('chr'):
if withChr is True:
print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
else:
print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id.replace('chr',''), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
else:
if withChr is True:
print("##contig=<ID=%s,length=%d,assembly=%s>" % ('chr' + chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
else:
print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
print("##liftOverProgram=CrossMap(https://sourceforge.net/projects/crossmap/)", file=FILE_OUT)
print("##liftOverFile=" + liftoverfile, file=FILE_OUT)
print("##new_reference_genome=" + refgenome, file=FILE_OUT)
print("##liftOverTime=" + datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
print(line, file=FILE_OUT)
print(line, file=UNMAP)
else:
if line.startswith('#'):continue
fields = str.split(line,maxsplit=7)
total += 1
if fields[0].startswith('chr'):
chrom = fields[0]
else:
chrom = 'chr' + fields[0]
start = int(fields[1])-1 # 0 based
end = start + len(fields[3])
a = map_coordinates(mapping, chrom, start, end,'+')
if a is None:
print(line, file=UNMAP)
fail += 1
continue
if len(a) == 2:
# update chrom
target_chr = str(a[1][0]) #target_chr is from chain file, could be 'chr1' or '1'
target_start = a[1][1]
target_end = a[1][2]
if withChr is False:
fields[0] = target_chr.replace('chr','')
else:
fields[0] = target_chr
# update start coordinate
fields[1] = target_start + 1
if refFasta.references[0].startswith('chr'):
if target_chr.startswith('chr'):
fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper()
else:
fields[3] = refFasta.fetch('chr'+target_chr,target_start,target_end).upper()
else:
if target_chr.startswith('chr'):
fields[3] = refFasta.fetch(target_chr.replace('chr',''),target_start,target_end).upper()
else:
fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper()
if fields[3] != fields[4]:
print('\t'.join(map(str, fields)), file=FILE_OUT)
else:
print(line, file=UNMAP)
fail += 1
else:
print(line, file=UNMAP)
fail += 1
continue
FILE_OUT.close()
UNMAP.close()
printlog (["Total entries:", str(total)])
printlog (["Failed to map:", str(fail)])
def crossmap_bed_file(mapping, inbed,outfile=None):
'''
Convert genome coordinates (in bed format) between assemblies.
BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1
'''
# check if 'outfile' was set. If not set, print to screen, if set, print to file
if outfile is not None:
FILE_OUT = open(outfile,'w')
UNMAP = open(outfile + '.unmap','w')
else:
pass
for line in ireader.reader(inbed):
if line.startswith('#'):continue
if line.startswith('track'):continue
if line.startswith('browser'):continue
if not line.strip():continue
line=line.strip()
fields=line.split()
strand = '+'
# filter out line less than 3 columns
if len(fields)<3:
print("Less than 3 fields. skip " + line, file=sys.stderr)
if outfile:
print(line, file=UNMAP)
continue
try:
int(fields[1])
except:
print("Start corrdinate is not an integer. skip " + line, file=sys.stderr)
if outfile:
print(line, file=UNMAP)
continue
try:
int(fields[2])
except:
print("End corrdinate is not an integer. skip " + line, file=sys.stderr)
if outfile:
print(line, file=UNMAP)
continue
if int(fields[1]) > int(fields[2]):
print("\"Start\" is larger than \"End\" corrdinate is not an integer. skip " + line, file=sys.stderr)
if outfile:
print(line, file=UNMAP)
continue
# deal with bed less than 12 columns
if len(fields)<12:
# try to reset strand
try:
for f in fields:
if f in ['+','-']:
strand = f
except:
pass
a = map_coordinates(mapping, fields[0], int(fields[1]),int(fields[2]),strand)
# example of a: [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')]
try:
if len(a) % 2 != 0:
if outfile is None:
print(line + '\tFail')
else:
print(line, file=UNMAP)
continue
if len(a) == 2:
#reset fields
fields[0] = a[1][0]
fields[1] = a[1][1]
fields[2] = a[1][2]
for i in range(0,len(fields)): #update the strand information
if fields[i] in ['+','-']:
fields[i] = a[1][3]
if outfile is None:
print(line + '\t->\t' + '\t'.join([str(i) for i in fields]))
else:
print('\t'.join([str(i) for i in fields]), file=FILE_OUT)
if len(a) >2 :
count=0
for j in range(1,len(a),2):
count += 1
fields[0] = a[j][0]
fields[1] = a[j][1]
fields[2] = a[j][2]
for i in range(0,len(fields)): #update the strand information
if fields[i] in ['+','-']:
fields[i] = a[j][3]
if outfile is None:
print(line + '\t'+ '(split.' + str(count) + ':' + ':'.join([str(i) for i in a[j-1]]) + ')\t' + '\t'.join([str(i) for i in fields]))
else:
print('\t'.join([str(i) for i in fields]), file=FILE_OUT)
except:
if outfile is None:
print(line + '\tFail')
else:
print(line, file=UNMAP)
continue
# deal with bed12 and bed12+8 (genePred format)
if len(fields)==12 or len(fields)==20:
strand = fields[5]
if strand not in ['+','-']:
raise Exception("Unknown strand: %s. Can only be '+' or '-'." % strand)
fail_flag=False
exons_old_pos = annoGene.getExonFromLine(line) #[[chr,st,end],[chr,st,end],...]
#print exons_old_pos
exons_new_pos = []
for e_chr, e_start, e_end in exons_old_pos:
a = map_coordinates(mapping, e_chr, e_start, e_end, strand)
if a is None:
fail_flag =True
break
if len(a) == 2:
exons_new_pos.append(a[1])
# a has two elements, first is query, 2nd is target. # [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')]
else:
fail_flag =True
break
if not fail_flag:
# check if all exons were mapped to the same chromosome the same strand
chr_id = set()
exon_strand = set()
for e_chr, e_start, e_end, e_strand in exons_new_pos:
chr_id.add(e_chr)
exon_strand.add(e_strand)
if len(chr_id) != 1 or len(exon_strand) != 1:
fail_flag = True
if not fail_flag:
# build new bed
cds_start_offset = int(fields[6]) - int(fields[1])
cds_end_offset = int(fields[2]) - int(fields[7])
new_chrom = exons_new_pos[0][0]
new_chrom_st = exons_new_pos[0][1]
new_chrom_end = exons_new_pos[-1][2]
new_name = fields[3]
new_score = fields[4]
new_strand = exons_new_pos[0][3]
new_thickStart = new_chrom_st + cds_start_offset
new_thickEnd = new_chrom_end - cds_end_offset
new_ittemRgb = fields[8]
new_blockCount = len(exons_new_pos)
new_blockSizes = ','.join([str(o - n) for m,n,o,p in exons_new_pos])
new_blockStarts = ','.join([str(n - new_chrom_st) for m,n,o,p in exons_new_pos])
new_bedline = '\t'.join(str(i) for i in (new_chrom,new_chrom_st,new_chrom_end,new_name,new_score,new_strand,new_thickStart,new_thickEnd,new_ittemRgb,new_blockCount,new_blockSizes,new_blockStarts))
if check_bed12(new_bedline) is False:
fail_flag = True
else:
if outfile is None:
print(line + '\t->\t' + new_bedline)
else:
print(new_bedline, file=FILE_OUT)
if fail_flag:
if outfile is None:
print(line + '\tFail')
else:
print(line, file=UNMAP)
def crossmap_gff_file(mapping, ingff,outfile=None):
'''
Convert genome coordinates (in GFF/GTF format) between assemblies.
GFF (General Feature Format) lines have nine required fields that must be Tab-separated:
1. seqname - The name of the sequence. Must be a chromosome or scaffold.
2. source - The program that generated this feature.
3. feature - The name of this type of feature. Some examples of standard feature types
are "CDS", "start_codon", "stop_codon", and "exon".
4. start - The starting position of the feature in the sequence. The first base is numbered 1.
5. end - The ending position of the feature (inclusive).
6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1
for this annotation data set, the score value will determine the level of gray in
which this feature is displayed (higher numbers = darker gray). If there is no score
value, enter ".".
7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
8. frame - If the feature is a coding exon, frame should be a number between 0-2 that
represents the reading frame of the first base. If the feature is not a coding exon,
the value should be '.'.
9. group - All lines with the same group are linked together into a single item.
GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3
GTF (Gene Transfer Format) is a refinement to GFF that tightens the specification. The
first eight GTF fields are the same as GFF. The group field has been expanded into a
list of attributes. Each attribute consists of a type/value pair. Attributes must end
in a semi-colon, and be separated from any following attribute by exactly one space.
GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4
We do NOT check if features (exon, CDS, etc) belonging to the same gene originally were
converted into the same chromosome/strand.
'''
if outfile is not None:
FILE_OUT = open(outfile,'w')
UNMAP = open(outfile + '.unmap', 'w')
for line in ireader.reader(ingff):
if line.startswith('#'):continue
if line.startswith('track'):continue
if line.startswith('browser'):continue
if line.startswith('visibility'):continue
if not line.strip():continue
line=line.strip()
fields=line.split('\t')
# check GFF file
#if len(fields) != 9:
# print >>sys.stderr, 'GFF file must has 9 columns. Skip ' + line
# continue
try:
start = int(fields[3]) - 1 #0-based
end = int(fields[4])/1
feature_size = end - start
except:
print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' + line, file=sys.stderr)
if outfile:
print(line, file=UNMAP)
continue
if fields[6] not in ['+','-','.']:
print('Cannot recognize \"strand\". Skip ' + line, file=sys.stderr)
if outfile:
print(line, file=UNMAP)
continue
strand = '-' if fields[6] == '-' else '+'
chrom = fields[0]
a = map_coordinates(mapping, chrom,start,end,strand)
if a is None:
if outfile is None:
print(line + '\tfail (no match to target assembly)')
else:
print(line, file=UNMAP)
continue
if len(a) !=2:
if outfile is None:
print(line + '\tfail (multpile match to target assembly)')
else:
print(line, file=UNMAP)
else:
if (int(a[1][2]) - int(a[1][1])) != feature_size: # check if it is exact match
if outfile is None:
print(line + '\tfail (not exact match)')
else:
print(line, file=UNMAP)
fields[0] = a[1][0] # chrom
fields[3] = a[1][1] + 1 # start, 1-based
fields[4] = a[1][2]
fields[6] = a[1][3]
if outfile is None:
print(line + '\t->\t' + '\t'.join([str(i) for i in fields]))
else:
print('\t'.join([str(i) for i in fields]), file=FILE_OUT)
def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3, addtag = True):
'''
Convert genome coordinates (in BAM/SAM format) between assemblies.
BAM/SAM format: http://samtools.sourceforge.net/
chrom_size is target chromosome size
if addtag is set to True, will add tags to each alignmnet:
Q = QC (QC failed)
N = unmapped (originally unmapped or originally mapped but failed to liftover to new assembly)
M = multiple mapped (alignment can be liftover to multiple places)
U = unique mapped (alignment can be liftover to only 1 place)
tags for pair-end sequencing include:
QF: QC failed
NN: both read1 and read2 unmapped
NU: read1 unmapped, read2 unique mapped
NM: read1 unmapped, multiple mapped
UN: read1 uniquely mapped, read2 unmap
UU: both read1 and read2 uniquely mapped
UM: read1 uniquely mapped, read2 multiple mapped
MN: read1 multiple mapped, read2 unmapped
MU: read1 multiple mapped, read2 unique mapped
MM: both read1 and read2 multiple mapped
tags for single-end sequencing include:
QF: QC failed
SN: unmaped
SM: multiple mapped
SU: uniquely mapped
'''
# determine the input file format (BAM or SAM)
try:
samfile = pysam.Samfile(infile,'rb')
if len(samfile.header) == 0:
print("BAM file has no header section. Exit!", file=sys.stderr)
sys.exit(1)
bam_format = True
except:
samfile = pysam.Samfile(infile,'r')
if len(samfile.header) == 0:
print("SAM file has no header section. Exit!", file=sys.stderr)
sys.exit(1)
bam_format = False
# add comments into BAM/SAM header section
if bam_format:
comments=['Liftover from original BAM file: ' + infile]
else:
comments=['Liftover from original SAM file: ' + infile]
comments.append('Liftover is based on the chain file: ' + chainfile)
sam_ori_header = samfile.header
(new_header, name_to_id) = sam_header.bam_header_generator(orig_header = sam_ori_header, chrom_size = chrom_size, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=comments)
# write to file
if outfile_prefix is not None:
if bam_format:
OUT_FILE = pysam.Samfile( outfile_prefix + '.bam', "wb", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.bam', "wb", template=samfile )
printlog (["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam'])
else:
OUT_FILE = pysam.Samfile( outfile_prefix + '.sam', "wh", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.sam', "wh", template=samfile )
printlog (["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam'])
# write to screen
else:
if bam_format:
OUT_FILE = pysam.Samfile( '-', "wb", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.bam','') + '.unmap.bam', "wb", template=samfile )
printlog (["Liftover BAM file:", infile])
else:
OUT_FILE = pysam.Samfile( '-', "w", header = new_header )
#OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.sam','') + '.unmap.sam', "wh", template=samfile )
printlog (["Liftover SAM file:", infile])
QF = 0
NN = 0
NU = 0
NM = 0
UN = 0
UU = 0
UM = 0
MN = 0
MU = 0
MM = 0
SN = 0
SM = 0
SU = 0
total_item = 0
try:
while(1):
total_item += 1
new_alignment = pysam.AlignedRead() # create AlignedRead object
old_alignment = next(samfile)
# qcfailed reads will be written to OUT_FILE_UNMAP for both SE and PE reads
if old_alignment.is_qcfail:
QF += 1
if addtag: old_alignment.set_tag(tag="QF", value=0)
OUT_FILE.write(old_alignment)
continue
# new_alignment.qqual = old_alignment.qqual # quality string of read, exclude soft clipped part
# new_alignment.qlen = old_alignment.qlen # length of the "aligned part of read"
# new_alignment. aligned_pairs = old_alignment. aligned_pairs #read coordinates <=> ref coordinates
# new_alignment_aend = old_alignment.aend # reference End position of the aligned part (of read)
#new_alignment.qname = old_alignment.qname # 1st column. read name.
#new_alignment.flag = old_alignment.flag # 2nd column. subject to change. flag value
#new_alignment.tid = old_alignment.tid # 3rd column. samfile.getrname(tid) == chrom name
#new_alignment.pos = old_alignment.pos # 4th column. reference Start position of the aligned part (of read) [0-based]
#new_alignment.mapq = old_alignment.mapq # 5th column. mapping quality
#new_alignment.cigar= old_alignment.cigar # 6th column. subject to change.
#new_alignment.rnext = old_alignment.rnext # 7th column. tid of the reference (mate read mapped to)
#new_alignment.pnext = old_alignment.pnext # 8th column. position of the reference (0 based, mate read mapped to)
#new_alignment.tlen = old_alignment.tlen # 9th column. insert size
#new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases.
#new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases.
#new_alignment.tags = old_alignment.tags # 12 - columns
new_alignment.qname = old_alignment.qname # 1st column. read name.
new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases.
new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases.
new_alignment.tags = old_alignment.tags # 12 - columns
# by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
# Thanks Wolfgang Resch <wresch@helix.nih.gov> identified this bug and provided solution.
try:
rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
except KeyError:
pass
else:
new_alignment.set_tag("RG", str(rg), rgt)
if old_alignment.is_paired:
new_alignment.flag = 0x0001 #pair-end
if old_alignment.is_read1:
new_alignment.flag = new_alignment.flag | 0x40
elif old_alignment.is_read2:
new_alignment.flag = new_alignment.flag | 0x80
#==================================
# R1 is originally unmapped
#==================================
if old_alignment.is_unmapped:
#This line will set the unmapped read flag for the first read in pair (in case it is unmapped).
#Thanks Amir Keivan Mohtashami reporting this bug.
new_alignment.flag = new_alignment.flag | 0x4
#------------------------------------
# both R1 and R2 unmapped
#------------------------------------
if old_alignment.mate_is_unmapped:
NN += 1
if addtag: old_alignment.set_tag(tag="NN", value=0)
OUT_FILE.write(old_alignment)
continue
else: # originally, read-1 unmapped, read-2 is mapped
try:
read2_chr = samfile.getrname(old_alignment.rnext)
read2_strand = '-' if old_alignment.mate_is_reverse else '+'
read2_start = old_alignment.pnext
read2_end = read2_start + 1
read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand)
# [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )]
except:
read2_maps = None
#------------------------------------
# both R1 and R2 unmapped
#------------------------------------
if read2_maps is None:
NN += 1
if addtag: old_alignment.set_tag(tag="NN", value=0)
OUT_FILE.write(old_alignment)
continue
#------------------------------------
# R1 unmapped, R2 unique
#------------------------------------
elif len(read2_maps) == 2:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 3
new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's
# 4
new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's
# 5
new_alignment.mapq = old_alignment.mapq
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]]
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = 0
NU += 1
if addtag: new_alignment.set_tag(tag="NU", value=0)
OUT_FILE.write(new_alignment)
continue
#------------------------------------
# R1 unmapped, R2 multiple
#------------------------------------
else:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 2
new_alignment.flag = new_alignment.flag | 0x100
# 3
new_alignment.tid = name_to_id[read2_maps[1][0]]
# 4
new_alignment.pos = read2_maps[1][1]
# 5
new_alignment.mapq = old_alignment.mapq
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]]
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = 0
NM += 1
if addtag: new_alignment.set_tag(tag="NM", value=0)
OUT_FILE.write(new_alignment)
continue
#==================================
# R1 is originally mapped
#==================================
else:
try:
read1_chr = samfile.getrname(old_alignment.tid)
read1_strand = '-' if old_alignment.is_reverse else '+'
read1_start = old_alignment.pos
read1_end = old_alignment.aend
read1_maps = map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_strand)
except:
read1_maps = None
if not old_alignment.mate_is_unmapped:
try:
read2_chr = samfile.getrname(old_alignment.rnext)
read2_strand = '-' if old_alignment.mate_is_reverse else '+'
read2_start = old_alignment.pnext
read2_end = read2_start + 1
read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand)
# [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )]
except:
read2_maps = None
#------------------------------------
# R1 unmapped (failed to liftover)
#------------------------------------
if read1_maps is None:
# 2 update flag (0x4: segment unmapped)
new_alignment.flag = new_alignment.flag | 0x4
# 3
new_alignment.tid = -1
# 4
new_alignment.pos = 0
# 5
new_alignment.mapq = 255
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = -1
# 8
new_alignment.pnext = 0
# 9
new_alignment.tlen = 0
# (1) read2 is unmapped before conversion
if old_alignment.mate_is_unmapped:
NN += 1
if addtag: new_alignment.set_tag(tag="NN", value=0)
OUT_FILE.write(new_alignment)
continue
# (2) read2 is unmapped after conversion
elif read2_maps is None:
# 2
new_alignment.flag = new_alignment.flag | 0x8
NN += 1
if addtag: new_alignment.set_tag(tag="NN", value=0)
OUT_FILE.write(new_alignment)
continue
# (3) read2 is unique mapped
elif len(read2_maps) == 2:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 3
new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's
# 4
new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's
# 5
new_alignment.mapq = old_alignment.mapq
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]]
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = 0
NU += 1
if addtag: new_alignment.set_tag(tag="NU", value=0)
OUT_FILE.write(new_alignment)
continue
# (4) read2 is multiple mapped
else:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 2
new_alignment.flag = new_alignment.flag | 0x100
# 3
new_alignment.tid = name_to_id[read2_maps[1][0]]
# 4
new_alignment.pos = read2_maps[1][1]
# 5
new_alignment.mapq = 255 # mapq not available
# 6
new_alignment.cigar = old_alignment.cigar
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]]
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = 0
NM += 1
if addtag:new_alignment.set_tag(tag="NM", value=0)
OUT_FILE.write(new_alignment)
continue
#------------------------------------
# R1 uniquely mapped
#------------------------------------
elif len(read1_maps) == 2:
if read1_maps[1][3] == '-':
new_alignment.flag = new_alignment.flag | 0x10
if read1_maps[0][3] != read1_maps[1][3]: # opposite strand
# 6
new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple
# 10
new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence
# 11
new_alignment.qual = old_alignment.qual[::-1] #reverse quality string
elif read1_maps[0][3] == read1_maps[1][3]: # same strand
# 6
new_alignment.cigar = old_alignment.cigar
# 3
new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom
# 4
new_alignment.pos = read1_maps[1][1] #start
# 5
new_alignment.mapq = old_alignment.mapq
# (1) R2 unmapped before or after conversion
if (old_alignment.mate_is_unmapped) or (read2_maps is None):
# 2
new_alignment.flag = new_alignment.flag | 0x8
# 7
new_alignment.rnext = name_to_id[read1_maps[1][0]]
# 8
new_alignment.pnext = read1_maps[1][1]
# 9
new_alignment.tlen = 0
UN += 1
if addtag: new_alignment.set_tag(tag="UN", value=0)
OUT_FILE.write(new_alignment)
continue
# (2) R2 is unique mapped
elif len(read2_maps)==2:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom
# 8
new_alignment.pnext = read2_maps[1][1]
# 9
new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen
# 2
if (read2_maps[1][3] != read1_maps[1][3]) and (new_alignment.tlen <= IS_size + fold * IS_std) and (new_alignment.tlen >= IS_size - fold * IS_std):
new_alignment.flag = new_alignment.flag | 0x2
UU += 1
if addtag: new_alignment.set_tag(tag="UU", value=0)
OUT_FILE.write(new_alignment)
continue
# (3) R2 is multiple mapped
else:
# 2 (strand)
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 2 (secondary alignment)
new_alignment.flag = new_alignment.flag | 0x100
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen
UM += 1
if addtag: new_alignment.set_tag(tag="UM", value=0)
OUT_FILE.write(new_alignment)
continue
#------------------------------------
# R1 multiple mapped
#-----------------------------------
elif len(read1_maps) > 2 and len(read1_maps) % 2 ==0:
# 2
new_alignment.flag = new_alignment.flag | 0x100
# 2
if read1_maps[1][3] == '-':
new_alignment.flag = new_alignment.flag | 0x10
if read1_maps[0][3] != read1_maps[1][3]:
# 6
new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple
# 10
new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence
# 11
new_alignment.qual = old_alignment.qual[::-1] #reverse quality string
elif read1_maps[0][3] == read1_maps[1][3]:
new_alignment.cigar = old_alignment.cigar
# 3
new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom
# 4
new_alignment.pos = read1_maps[1][1] #start
# 5
new_alignment.mapq = 255
# (1) R2 is unmapped
if (old_alignment.mate_is_unmapped) or (read2_maps is None):
# 2
new_alignment.flag = new_alignment.flag | 0x8
# 7
new_alignment.rnext = name_to_id[read1_maps[1][0]]
# 8
new_alignment.pnext = read1_maps[1][1]
# 9
new_alignment.tlen = 0
MN += 1
if addtag: new_alignment.set_tag(tag="MN", value=0)
OUT_FILE.write(new_alignment)
continue
# (2) read2 is unique mapped
elif len(read2_maps)==2:
# 2
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom
# 8
new_alignment.pnext = read2_maps[1][1]
# 9
new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen
MU += 1
if addtag: new_alignment.set_tag(tag="MU", value=0)
OUT_FILE.write(new_alignment)
continue
# (3) R2 is multiple mapped
else:
# 2 (strand)
if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20
# 2 (secondary alignment)
new_alignment.flag = new_alignment.flag | 0x100
# 7
new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom
# 8
new_alignment.pnext = read2_maps[1][1] #start
# 9
new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen
MM += 1
if addtag: new_alignment.set_tag(tag="MM", value=0)
OUT_FILE.write(new_alignment)
continue
else:
#old_alignment.tid = name_to_id[samfile.getrname(old_alignment.tid)]
OUT_FILE_UNMAP.write(old_alignment)
failed += 1
# Singel end sequencing
else:
# (1) originally unmapped
if old_alignment.is_unmapped:
SN += 1
if addtag: old_alignment.set_tag(tag="SN",value=0)
OUT_FILE.write(old_alignment)
continue
else:
new_alignment.flag = 0x0
read_chr = samfile.getrname(old_alignment.tid)
read_strand = '-' if old_alignment.is_reverse else '+'
read_start = old_alignment.pos
read_end = old_alignment.aend
read_maps = map_coordinates(mapping, read_chr, read_start, read_end, read_strand)
# (2) unmapped afte liftover
if read_maps is None:
# 1
new_alignment.qname = old_alignment.qname
# 2
new_alignment.flag = new_alignment.flag | 0x4
# 3
new_alignment.tid = -1
# 4
new_alignment.pos = 0
# 5
new_alignment.mapq = 255
# 6
new_alignment.cigar= old_alignment.cigar
# 7
new_alignment.rnext = -1
# 8
new_alignment.pnext = 0
# 9
new_alignment.tlen = 0
# 10
new_alignment.seq = old_alignment.seq
# 11
new_alignment.qual = old_alignment.qual
# 12
new_alignment.tags = old_alignment.tags
SN += 1
if addtag: new_alignment.set_tag(tag="SN",value=0)
OUT_FILE.write(new_alignment)
continue
# (3) unique mapped
if len(read_maps)==2:
# 1
new_alignment.qname = old_alignment.qname
# 2
if read_maps[1][3] == '-':
new_alignment.flag = new_alignment.flag | 0x10
if read_maps[0][3] != read_maps[1][3]:
# 6
new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple
# 10
new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence
# 11
new_alignment.qual = old_alignment.qual[::-1] #reverse quality string
else:
# 6
new_alignment.cigar = old_alignment.cigar
# 10
new_alignment.seq = old_alignment.seq
# 11
new_alignment.qual = old_alignment.qual
# 3
new_alignment.tid = name_to_id[read_maps[1][0]]
# 4
new_alignment.pos = read_maps[1][1]
# 5
new_alignment.mapq = old_alignment.mapq
# 7
new_alignment.rnext = -1
# 8
new_alignment.pnext = 0
# 9
new_alignment.tlen = 0
SU += 1
if addtag: new_alignment.set_tag(tag="SU",value=0)
OUT_FILE.write(new_alignment)
continue
# (4) multiple mapped
if len(read_maps) > 2 and len(read_maps) % 2 ==0:
# 1
new_alignment.qname = old_alignment.qname
new_alignment.flag = new_alignment.flag | 0x100
# 2
if read_maps[1][3] == '-':
new_alignment.flag = new_alignment.flag | 0x10
if read_maps[0][3] != read_maps[1][3]:
# 6
new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple
# 10
new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence
# 11
new_alignment.qual = old_alignment.qual[::-1] #reverse quality string
else:
# 6
new_alignment.cigar = old_alignment.cigar
# 10
new_alignment.seq = old_alignment.seq
# 11
new_alignment.qual = old_alignment.qual
# 3
new_alignment.tid = name_to_id[read_maps[1][0]]
# 4
new_alignment.pos = read_maps[1][1]
# 5
new_alignment.mapq = old_alignment.mapq
# 7
new_alignment.rnext = -1
# 8
new_alignment.pnext = 0
# 9
new_alignment.tlen = 0
SM += 1
if addtag: new_alignment.set_tag(tag="SM",value=0)
OUT_FILE.write(new_alignment)
continue
except StopIteration:
printlog(["Done!"])
OUT_FILE.close()
if outfile_prefix is not None:
if bam_format:
try:
printlog (['Sort "%s" and save as "%s"' % (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')])
pysam.sort("-o", outfile_prefix + '.sorted.bam', outfile_prefix + '.bam')
except:
printlog(["Warning: ","output BAM file was NOT sorted"])
try:
printlog (['Index "%s" ...' % (outfile_prefix + '.sorted.bam')])
pysam.index(outfile_prefix + '.sorted.bam',outfile_prefix + '.sorted.bam.bai')
except:
printlog(["Warning: ","output BAM file was NOT indexed."])
print("Total alignments:" + str(total_item-1))
print("\tQC failed: " + str(QF))
if max(NN,NU, NM, UN, UU, UM, MN, MU, MM) > 0:
print("\tR1 unique, R2 unique (UU): " + str(UU))
print("\tR1 unique, R2 unmapp (UN): " + str(UN))
print("\tR1 unique, R2 multiple (UM): " + str(UM))
print("\tR1 multiple, R2 multiple (MM): " + str(MM))
print("\tR1 multiple, R2 unique (MU): " + str(MU))
print("\tR1 multiple, R2 unmapped (MN): " + str(MN))
print("\tR1 unmap, R2 unmap (NN): " + str(NN))
print("\tR1 unmap, R2 unique (NU): " + str(NU))
print("\tR1 unmap, R2 multiple (NM): " + str(NM))
if max(SN,SU,SM) > 0:
print("\tUniquley mapped (SU): " + str(SU))
print("\tMultiple mapped (SM): " + str(SM))
print("\tUnmapped (SN): " + str(SN))
def crossmap_wig_file(mapping, in_file, out_prefix, taget_chrom_size, in_format, binSize=100000):
'''
Convert genome coordinates (in wiggle/bigwig format) between assemblies.
wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html
bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html
'''
OUT_FILE1 = open(out_prefix + '.bgr','w') # original bgr file
OUT_FILE2 = open(out_prefix + '.sorted.bgr','w') # sorted bgr file
OUT_FILE3 = pyBigWig.open(out_prefix + '.bw', "w") # bigwig file
if in_format.upper() == "WIGGLE":
printlog (["Liftover wiggle file:", in_file, '==>', out_prefix + '.bgr'])
for chr, start, end, strand, score in wiggleReader (in_file):
#print chr,start,end,score
# map_coordinates(mapping, q_chr, q_start, q_end, q_strand = '+', print_match=False):
maps = map_coordinates(mapping, chr, start, end, '+')
if maps is None:
continue
if len(maps) == 2:
print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1)
else:
continue
maps[:]=[]
OUT_FILE1.close()
printlog (["Merging overlapped entries in bedGraph file ..."])
for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'):
print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2)
OUT_FILE2.close()
os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr
# add header to bigwig file
printlog (["Writing header to \"%s\" ..." % (out_prefix + '.bw') ])
target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())]
OUT_FILE3.addHeader(target_chroms_sorted)
printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ])
# add entries to bigwig file
for line in ireader.reader(out_prefix + '.sorted.bgr'):
r_chr,r_st,r_end,r_value = line.split()
OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)])
OUT_FILE3.close()
elif in_format.upper() == "BIGWIG":
printlog (["Liftover bigwig file:", in_file, '==>', out_prefix + '.bgr'])
for chr, start, end, score in bigwigReader(in_file, bin_size = binSize ):
maps = map_coordinates(mapping, chr, start, end, '+')
try:
if maps is None: continue
if len(maps) == 2:
print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1)
else:
continue
except:
continue
maps[:]=[]
OUT_FILE1.close()
printlog (["Merging overlapped entries in bedGraph file ..."])
for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'):
print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2)
OUT_FILE2.close()
os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr
# add header to bigwig file
printlog (["Writing header to \"%s\" ..." % (out_prefix + '.bw') ])
target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())]
OUT_FILE3.addHeader(target_chroms_sorted)
# add entries to bigwig file
printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ])
for line in ireader.reader(out_prefix + '.sorted.bgr'):
r_chr,r_st,r_end,r_value = line.split()
OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)])
OUT_FILE3.close()
else:
raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'")
def general_help():
desc="""CrossMap is a program for convenient conversion of genome coordinates and genome \
annotation files between assemblies (eg. lift from human hg18 to hg19 or vice versa). \
It supports file in BAM, SAM, BED, Wiggle, BigWig, GFF, GTF and VCF format."""
print("Program: %s (v%s)" % ("CrossMap", __version__), file=sys.stderr)
print("\nDescription: \n%s" % '\n'.join(' '+i for i in wrap(desc,width=80)), file=sys.stderr)
print("\nUsage: CrossMap.py <command> [options]\n", file=sys.stderr)
for k in sorted(commands):
print(' ' + k + '\t' + commands[k], file=sys.stderr)
print(file=sys.stderr)
def bed_help():
msg =[
('Usage:', "CrossMap.py bed input_chain_file input_bed_file [output_file]"),
('Description:', "\"input_chain_file\" and \"input_bed_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. BED format file must have at least 3 columns (chrom, start, end) and no more than 12 columns. If no \"output_file\" is specified, output will be directed to the screen (console). BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1"),
('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed test.hg19.bed # write output to \"test.hg19.bed\""),
('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed # write output to screen"),
]
for i,j in msg:
print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr)
def gff_help():
msg =[
('Usage:', "CrossMap.py gff input_chain_file input_gff_file output_file"),
('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Input file must be in GFF or GTF format. GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3 GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4"),
('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf test.hg18.gtf # write output to \"test.hg18.gff\""),
('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf # write output to screen"),
]
for i,j in msg:
print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr)
def wig_help():
msg =[
('Usage:', "CrossMap.py wig input_chain_file input_wig_file output_prefix"),
('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Both \"variableStep\" and \"fixedStep\" wiggle lines are supported. Wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html"),
('Example:', "CrossMapy.py wig hg18ToHg19.over.chain.gz test.hg18.wig test.hg19"),
]
for i,j in msg:
print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr)
def bigwig_help():
msg =[
('Usage:', "CrossMap.py bigwig input_chain_file input__bigwig_file output_prefix"),
('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html"),
('Example:', "CrossMapy.py bigwig hg18ToHg19.over.chain.gz test.hg18.bw test.hg19"),
]
for i,j in msg:
print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr)
def bam_help():
usage="CrossMap.py bam -a input_chain_file input_bam_file output_file [options] "
import optparse
parser.print_help()
def vcf_help():
msg =[
("usage:","CrossMap.py vcf input_chain_file input_VCF_file ref_genome_file output_file"),
("Description:", "\"input_chain_file\" and \"input_VCF_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. \"ref_genome_file\" is genome sequence file of 'target assembly' in FASTA format."),
("Example:", " CrossMap.py vcf hg19ToHg18.over.chain.gz test.hg19.vcf hg18.fa test.hg18.vcf"),
]
for i,j in msg:
print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr)
if __name__=='__main__':
# test read_chain_file()
#(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(sys.argv[1], print_table=True)
#q_chr, q_start, q_end, q_strand = '+', print_match=False):
#map_coordinates(mapTree, 'chr1', 45953452, 45953456, '+', print_match=True)
#a= sam_header.bam_header_generator(chrom_size = targetChromSizes, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=['a','b','c'])
#crossmap_bed_file(mapTree, 'test.hg18.bed3',outfile='aa')
#crossmap_bed_file(mapTree, 'test.bed')
#crossmap_gff_file(mapTree,'test.hg18.gtf',outfile = 'aa')
#crossmap_bam_file(mapTree, targetChromSizes, sys.argv[1], 'eg.abnormal.sam','out')
#printlog (['Sort output bam file ...'])
#pysam.sort('-m','1000000000','out.bam','out.sorted')
#printlog (['Index bam file ...'])
#pysam.index('out.sorted.bam','out.sorted.bam.bai')
#crossmap_wig_file(mapTree, 'GSM686950_reverse.hg18.bw', 'GSM686950_reverse.hg19', sourceChromSizes, targetChromSizes, in_format = 'bigwig')
#parser = optparse.OptionParser()
#parser.add_option("-i","--input-sam",action="store",type="string",dest="input_sam_file", help="BAM/SAM format file.")
#parser.add_option("-o","--out-file",action="store",type="string",dest="output_sam_file", help="Prefix of output files. Foramt is determined from input: SAM <=> SAM, BAM <=> BAM")
#parser.add_option("-m","--insert-mean",action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). Used to tell wheather a mapped pair is \"proper pair\" or not. [default=%default]")
#parser.add_option("-s","--insert-stdev",action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" )
#parser.add_option("-t","--times-of-stdev",action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. [default=%default]")
#(options,args)=parser.parse_args()
#print options
#print args
commands = {
'bed':'convert genome cooridnate or annotation file in BED, bedGraph or other BED-like format.',
'bam':'convert alignment file in BAM or SAM format.',
'gff':'convert genome cooridnate or annotation file in GFF or GTF format.',
'wig':'convert genome coordinate file in Wiggle, or bedGraph format.',
'bigwig':'convert genome coordinate file in BigWig format.',
'vcf':'convert genome coordinate file in VCF format.'
}
kwds = list(commands.keys())
if len(sys.argv) == 1:
general_help()
sys.exit(0)
elif len(sys.argv) >=2:
# deal with bed input
if sys.argv[1].lower() == 'bed':
if len(sys.argv) == 4:
chain_file = sys.argv[2]
in_file = sys.argv[3]
out_file = None
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file, print_table = False)
crossmap_bed_file(mapTree, in_file, out_file)
elif len(sys.argv) == 5:
chain_file = sys.argv[2]
in_file = sys.argv[3]
out_file = sys.argv[4]
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
crossmap_bed_file(mapTree, in_file, out_file)
else:
bed_help()
sys.exit(0)
elif sys.argv[1].lower() == 'gff':
if len(sys.argv) == 4:
chain_file = sys.argv[2]
in_file = sys.argv[3]
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
crossmap_gff_file(mapTree, in_file, None)
elif len(sys.argv) == 5:
chain_file = sys.argv[2]
in_file = sys.argv[3]
out_file = sys.argv[4]
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
crossmap_gff_file(mapTree, in_file, out_file)
else:
gff_help()
sys.exit(0)
elif sys.argv[1].lower() == 'wig':
if len(sys.argv) == 5:
chain_file = sys.argv[2]
in_file = sys.argv[3]
out_file = sys.argv[4]
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'wiggle')
else:
wig_help()
sys.exit(0)
elif sys.argv[1].lower() == 'bigwig':
if len(sys.argv) == 5:
chain_file = sys.argv[2]
in_file = sys.argv[3]
try:
bw = pyBigWig.open(in_file)
except:
print ("\nPlease check if \"%s\" is in bigWig format!\n" % in_file, file=sys.stderr)
sys.exit(0)
out_file = sys.argv[4]
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'bigwig')
else:
bigwig_help()
sys.exit(0)
elif sys.argv[1].lower() == 'bam':
#def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3):
usage="CrossMap.py bam input_chain_file input_bam_file output_file [options]\nNote: If output_file == STDOUT or -, CrossMap will write BAM file to the screen"
parser = optparse.OptionParser(usage, add_help_option=False)
parser.add_option("-m", "--mean", action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). [default=%default]")
parser.add_option("-s", "--stdev", action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" )
parser.add_option("-t", "--times", action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. [default=%default]")
parser.add_option("-a","--append-tags",action="store_true",dest="add_tags",help="Add tag to each alignment.")
(options,args)=parser.parse_args()
if len(args) >= 3:
print("Insert size = %f" % (options.insert_size), file=sys.stderr)
print("Insert size stdev = %f" % (options.insert_size_stdev), file=sys.stderr)
print("Number of stdev from the mean = %f" % (options.insert_size_fold), file=sys.stderr)
if options.add_tags:
print("Add tags to each alignment = %s" % ( options.add_tags), file=sys.stderr)
else:
print("Add tags to each alignment = %s" % ( False), file=sys.stderr)
chain_file = args[1]
in_file = args[2]
out_file = args[3] if len(args) >= 4 else None
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
if out_file in ["STDOUT","-"]:
out_file = None
if options.add_tags:
crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=True)
else:
crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=False)
else:
parser.print_help()
elif sys.argv[1].lower() == 'vcf':
if len(sys.argv) == 6:
chain_file = sys.argv[2]
in_file = sys.argv[3]
genome_file = sys.argv[4]
out_file = sys.argv[5]
(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
crossmap_vcf_file(mapping = mapTree, infile= in_file, outfile = out_file, liftoverfile = sys.argv[2], refgenome = genome_file)
else:
vcf_help()
sys.exit(0)
else:
general_help()
sys.exit(0)