diff --git a/README.md b/README.md index 02f4ec4..7eb9268 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ Required arguments: --bed Path to BED-file --genome_fasta Path to genome in FASTA-format --jaspar_db Path to motif-database in MEME-format + --organism STRING Source organism: [ hg19 | hg 38 or mm9 | mm10 ] Optional arguments: @@ -86,9 +87,10 @@ Optional arguments: --motif_similarity_thresh FLOAT threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [homo_sapiens | mus_musculus] - --tissues - + --tissues STRING Filter for tissue/category activity categorys as in JSON config (Default: None) + --wd PATH current working directory for the script (default: ".") + --dir PATH directory for saved Ensembl / UCSC file (default: "./data/") + --out PATH directory for the output GTF (default: ".") All arguments can be set in the configuration files. ``` diff --git a/bin/Modules/CrossMap.py b/bin/Modules/CrossMap.py new file mode 100644 index 0000000..97cfe0b --- /dev/null +++ b/bin/Modules/CrossMap.py @@ -0,0 +1,1759 @@ +#!/home/basti/miniconda3/envs/mpi-project/bin/python +''' +--------------------------------------------------------------------------------------- +CrossMap: lift over genomic coordinates between genome assemblies. +Supports BED/BedGraph, GFF/GTF, BAM/SAM, BigWig/Wig, VCF format files. +------------------------------------------------------------------------------------------ +''' + +import os,sys +import optparse +import collections +import subprocess +import string +from textwrap import wrap +from time import strftime + +import pyBigWig +import pysam +from bx.intervals import * +import numpy as np +import datetime +from cmmodule import ireader +from cmmodule import BED +from cmmodule import annoGene +from cmmodule import sam_header +from cmmodule import myutils +from cmmodule import bgrMerge + +__author__ = "Liguo Wang, Hao Zhao" +__contributor__="Liguo Wang, Hao Zhao" +__copyright__ = "Copyleft" +__credits__ = [] +__license__ = "GPLv2" +__version__="0.3.1" +__maintainer__ = "Liguo Wang" +__email__ = "wangliguo78@gmail.com" +__status__ = "Production" + +def printlog (mesg_lst): + ''' + print progress into stderr + ''' + if len(mesg_lst)==1: + msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg_lst[0] + else: + msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + ' '.join(mesg_lst) + print(msg, file=sys.stderr) + +def parse_header( line ): + return dict( [ field.split( '=' ) for field in line.split()[1:] ] ) + +def revcomp_DNA(dna): + ''' + reverse complement of input DNA sequence. + ''' + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A','N':'N','X':'X'} + seq = dna.upper() + return ''.join(complement[base] for base in reversed(seq)) + +def wiggleReader( f ): + ''' + Iterator yielding chrom, start, end, strand, value. + Values are zero-based, half-open. + Regions which lack scores are ignored. + ''' + current_chrom = None + current_pos = None + current_step = None + + # always for wiggle data + strand = '+' + + mode = "bed" + for line in ireader.reader(f): + if line.isspace() or line.startswith( "track" ) or line.startswith( "#" ) or line.startswith( "browser" ): + continue + elif line.startswith( "variableStep" ): + header = parse_header( line ) + current_chrom = header['chrom'] + current_pos = None + current_step = None + if 'span' in header: current_span = int( header['span'] ) + else: current_span = 1 + mode = "variableStep" + elif line.startswith( "fixedStep" ): + header = parse_header( line ) + current_chrom = header['chrom'] + current_pos = int( header['start'] ) - 1 + current_step = int( header['step'] ) + if 'span' in header: current_span = int( header['span'] ) + else: current_span = 1 + mode = "fixedStep" + elif mode == "bed": + fields = line.split() + if len( fields ) > 3: + if len( fields ) > 5: + yield fields[0], int( fields[1] ), int( fields[2] ), fields[5], float( fields[3] ) + else: + yield fields[0], int( fields[1] ), int( fields[2] ), strand, float( fields[3] ) + elif mode == "variableStep": + fields = line.split() + pos = int( fields[0] ) - 1 + yield current_chrom, pos, pos + current_span, strand, float( fields[1] ) + elif mode == "fixedStep": + yield current_chrom, current_pos, current_pos + current_span, strand, float( line.split()[0] ) + current_pos += current_step + else: + raise "Unexpected input line: %s" % line.strip() + +def bigwigReader(infile, bin_size = 2000): + ''' + infile: bigwig format file + chromsize: chrom_name: size. + return: chrom, position (0-based), value + ''' + + bw = pyBigWig.open(infile) + chrom_sizes = bw.chroms() + #print (chrom_sizes) + + for chr_name, chr_size in list(chrom_sizes.items()): + for chrom, st, end in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = bin_size): + try: + a = bw.stats(chrom,st,end) + except: + continue + + if bw.stats(chrom,st,end)[0] is None: + continue + sig_list = bw.values(chrom,st,end) + sig_list = np.nan_to_num(sig_list) #change 'nan' into '0.' + low_bound = st + point_num = 1 + score = sig_list[0] + for value in (sig_list[1:]): + if value == score: + point_num += 1 + else: + yield(( chrom, low_bound, low_bound + point_num, score)) + score = value + low_bound = low_bound + point_num + point_num = 1 + + +def check_bed12(bedline): + ''' + check if bed12 format is correct or not + ''' + fields = bedline.strip().split() + if len(fields) !=12: + return False + if fields[5] not in ['+','-','.']: + return False + try: + chromStart = int(fields[1]) + chromEnd = int(fields[2]) + thickStart = int(fields[6]) + thickEnd = int(fields[7]) + blockCount = int(fields[9]) + blockSizes = [int(i) for i in fields[10].rstrip(',').split(',')] + blockStarts = [int(i) for i in fields[11].rstrip(',').split(',')] + except: + return False + if chromStart > chromEnd or thickStart > thickEnd: + return False + if thickStart < chromStart or thickEnd > chromEnd: + return False + if len(blockSizes) != blockCount: + return False + if len(blockStarts) != blockCount: + return False + if blockCount <1: + return False + for i in blockSizes: + if i < 0: return False + for i in blockStarts: + if i < 0: return False + return True + +def intersectBed(xxx_todo_changeme, xxx_todo_changeme1): + ''' + return intersection of two bed regions + ''' + (chr1, st1, end1) = xxx_todo_changeme + (chr2, st2, end2) = xxx_todo_changeme1 + if int(st1) > int(end1) or int(st2) > int(end2): + raise Exception ("Start cannot be larger than end") + if chr1 != chr2: + return None + if int(st1) > int(end2) or int(end1) < int(st2): + return None + return (chr1, max(st1, st2), min(end1,end2)) + +def read_chain_file (chain_file, print_table=False): + ''' + input chain_file could be either plain text, compressed file (".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"), + or a URL pointing to the chain file ("http://", "https://", "ftp://"). If url was used, chain file must be plain text + ''' + + printlog(["Read chain_file: ", chain_file]), + maps={} + target_chromSize={} + source_chromSize={} + if print_table: + blocks=[] + + for line in ireader.reader(chain_file): + # Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1 + if not line.strip(): + continue + line=line.strip() + if line.startswith(('#',' ')):continue + fields = line.split() + + if fields[0] == 'chain' and len(fields) in [12, 13]: + score = int(fields[1]) # Alignment score + source_name = fields[2] # E.g. chrY + source_size = int(fields[3]) # Full length of the chromosome + source_strand = fields[4] # Must be + + if source_strand != '+': + raise Exception("Source strand in an .over.chain file must be +. (%s)" % line) + source_start = int(fields[5]) # Start of source region + source_end = int(fields[6]) # End of source region + + target_name = fields[7] # E.g. chr5 + target_size = int(fields[8]) # Full length of the chromosome + target_strand = fields[9] # + or - + target_start = int(fields[10]) + target_end = int(fields[11]) + target_chromSize[target_name]= target_size + source_chromSize[source_name] = source_size + + if target_strand not in ['+', '-']: + raise Exception("Target strand must be - or +. (%s)" % line) + #if target_strand == '+': + # target_start = int(fields[10]) + # target_end = int(fields[11]) + #if target_strand == '-': + # target_start = target_size - target_end + # target_end = target_size - target_start + chain_id = None if len(fields) == 12 else fields[12] + if source_name not in maps: + maps[source_name] = Intersecter() + + sfrom, tfrom = source_start, target_start + + # Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to) + elif fields[0] != 'chain' and len(fields) == 3: + size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2]) + if print_table: + if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand)) + elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand)) + + if target_strand == '+': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand))) + elif target_strand == '-': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand))) + + sfrom += size + sgap + tfrom += size + tgap + + elif fields[0] != 'chain' and len(fields) == 1: + size = int(fields[0]) + if print_table: + if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand)) + elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand)) + + if target_strand == '+': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand))) + elif target_strand == '-': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand))) + else: + raise Exception("Invalid chain format. (%s)" % line) + if (sfrom + size) != source_end or (tfrom + size) != target_end: + raise Exception("Alignment blocks do not match specified block sizes. (%s)" % header) + + if print_table: + for i in blocks: + print('\t'.join([str(n) for n in i])) + + return (maps,target_chromSize, source_chromSize) + + +def map_coordinates(mapping, q_chr, q_start, q_end, q_strand='+', print_match=False): + ''' + Map coordinates from source assembly to target assembly + ''' + + matches=[] + complement ={'+':'-','-':'+'} + if q_chr not in mapping: + return None + else: + targets = mapping[q_chr].find(q_start, q_end) + if len(targets)==0: + return None + elif len(targets)==1: + s_start = targets[0].start + s_end = targets[0].end + t_chrom = targets[0].value[0] + t_start = targets[0].value[1] + t_end = targets[0].value[2] + t_strand = targets[0].value[3] + + (chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end)) + l_offset = abs(real_start - s_start) + #r_offset = real_end - s_end + size = abs(real_end - real_start) + + matches.append( (chr, real_start, real_end,q_strand)) + if t_strand == '+': + i_start = t_start + l_offset + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + elif t_strand == '-': + i_start = t_end - l_offset - size + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + else: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand) + + elif len(targets) > 1: + for t in targets: + s_start = t.start + s_end = t.end + t_chrom = t.value[0] + t_start = t.value[1] + t_end = t.value[2] + t_strand = t.value[3] + + (chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end)) + + l_offset = abs(real_start - s_start) + #r_offset = abs(real_end - s_end) + size = abs(real_end - real_start) + + matches.append( (chr, real_start, real_end,q_strand) ) + if t_strand == '+': + i_start = t_start + l_offset + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + elif t_strand == '-': + i_start = t_end - l_offset - size + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + else: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand) + + if print_match: + print(matches) + # input: 'chr1',246974830,247024835 + # output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')] + # [('chr1', 246974830, 246974833), ('chr1', 248908207, 248908210)] + + return matches + +def crossmap_vcf_file(mapping, infile,outfile, liftoverfile, refgenome): + ''' + Convert genome coordinates in VCF format. + ''' + + #index refegenome file if it hasn't been done + if not os.path.exists(refgenome + '.fai'): + printlog(["Creating index for", refgenome]) + pysam.faidx(refgenome) + + refFasta = pysam.Fastafile(refgenome) + + FILE_OUT = open(outfile ,'w') + UNMAP = open(outfile + '.unmap','w') + + total = 0 + fail = 0 + withChr = False # check if the VCF data lines use 'chr1' or '1' + + for line in ireader.reader(infile): + if not line.strip(): + continue + line=line.strip() + + #deal with meta-information lines. + + #meta-information lines needed in both mapped and unmapped files + if line.startswith('##fileformat'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##INFO'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##FILTER'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##FORMAT'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##ALT'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##SAMPLE'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##PEDIGREE'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + + #meta-information lines needed in unmapped files + elif line.startswith('##assembly'): + print(line, file=UNMAP) + elif line.startswith('##contig'): + print(line, file=UNMAP) + if 'ID=chr' in line: + withChr = True + + #update contig information + elif line.startswith('#CHROM'): + printlog(["Updating contig field ... "]) + target_gsize = dict(list(zip(refFasta.references, refFasta.lengths))) + for chr_id in sorted(target_gsize): + if chr_id.startswith('chr'): + if withChr is True: + print("##contig=" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + print("##contig=" % (chr_id.replace('chr',''), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + if withChr is True: + print("##contig=" % ('chr' + chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + print("##contig=" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + + print("##liftOverProgram=CrossMap(https://sourceforge.net/projects/crossmap/)", file=FILE_OUT) + print("##liftOverFile=" + liftoverfile, file=FILE_OUT) + print("##new_reference_genome=" + refgenome, file=FILE_OUT) + print("##liftOverTime=" + datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT) + print(line, file=FILE_OUT) + print(line, file=UNMAP) + + else: + if line.startswith('#'):continue + fields = str.split(line,maxsplit=7) + total += 1 + + if fields[0].startswith('chr'): + chrom = fields[0] + else: + chrom = 'chr' + fields[0] + + start = int(fields[1])-1 # 0 based + end = start + len(fields[3]) + a = map_coordinates(mapping, chrom, start, end,'+') + if a is None: + print(line, file=UNMAP) + fail += 1 + continue + + if len(a) == 2: + # update chrom + target_chr = str(a[1][0]) #target_chr is from chain file, could be 'chr1' or '1' + target_start = a[1][1] + target_end = a[1][2] + if withChr is False: + fields[0] = target_chr.replace('chr','') + else: + fields[0] = target_chr + + # update start coordinate + fields[1] = target_start + 1 + + if refFasta.references[0].startswith('chr'): + if target_chr.startswith('chr'): + fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper() + else: + fields[3] = refFasta.fetch('chr'+target_chr,target_start,target_end).upper() + else: + if target_chr.startswith('chr'): + fields[3] = refFasta.fetch(target_chr.replace('chr',''),target_start,target_end).upper() + else: + fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper() + + + + if fields[3] != fields[4]: + print('\t'.join(map(str, fields)), file=FILE_OUT) + else: + print(line, file=UNMAP) + fail += 1 + else: + print(line, file=UNMAP) + fail += 1 + continue + FILE_OUT.close() + UNMAP.close() + printlog (["Total entries:", str(total)]) + printlog (["Failed to map:", str(fail)]) + + +def crossmap_bed_file(mapping, inbed,outfile=None): + ''' + Convert genome coordinates (in bed format) between assemblies. + BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 + ''' + + # check if 'outfile' was set. If not set, print to screen, if set, print to file + if outfile is not None: + FILE_OUT = open(outfile,'w') + UNMAP = open(outfile + '.unmap','w') + else: + pass + + for line in ireader.reader(inbed): + if line.startswith('#'):continue + if line.startswith('track'):continue + if line.startswith('browser'):continue + if not line.strip():continue + line=line.strip() + fields=line.split() + strand = '+' + + # filter out line less than 3 columns + if len(fields)<3: + print("Less than 3 fields. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + try: + int(fields[1]) + except: + print("Start corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + try: + int(fields[2]) + except: + print("End corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + if int(fields[1]) > int(fields[2]): + print("\"Start\" is larger than \"End\" corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + + # deal with bed less than 12 columns + if len(fields)<12: + + # try to reset strand + try: + for f in fields: + if f in ['+','-']: + strand = f + except: + pass + + a = map_coordinates(mapping, fields[0], int(fields[1]),int(fields[2]),strand) + # example of a: [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')] + + try: + if len(a) % 2 != 0: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + continue + if len(a) == 2: + + #reset fields + fields[0] = a[1][0] + fields[1] = a[1][1] + fields[2] = a[1][2] + for i in range(0,len(fields)): #update the strand information + if fields[i] in ['+','-']: + fields[i] = a[1][3] + + if outfile is None: + print(line + '\t->\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + if len(a) >2 : + count=0 + for j in range(1,len(a),2): + count += 1 + fields[0] = a[j][0] + fields[1] = a[j][1] + fields[2] = a[j][2] + for i in range(0,len(fields)): #update the strand information + if fields[i] in ['+','-']: + fields[i] = a[j][3] + + if outfile is None: + print(line + '\t'+ '(split.' + str(count) + ':' + ':'.join([str(i) for i in a[j-1]]) + ')\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + except: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + continue + + # deal with bed12 and bed12+8 (genePred format) + if len(fields)==12 or len(fields)==20: + strand = fields[5] + if strand not in ['+','-']: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % strand) + fail_flag=False + exons_old_pos = annoGene.getExonFromLine(line) #[[chr,st,end],[chr,st,end],...] + #print exons_old_pos + exons_new_pos = [] + for e_chr, e_start, e_end in exons_old_pos: + a = map_coordinates(mapping, e_chr, e_start, e_end, strand) + if a is None: + fail_flag =True + break + + if len(a) == 2: + exons_new_pos.append(a[1]) + # a has two elements, first is query, 2nd is target. # [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')] + else: + fail_flag =True + break + + if not fail_flag: + # check if all exons were mapped to the same chromosome the same strand + chr_id = set() + exon_strand = set() + + for e_chr, e_start, e_end, e_strand in exons_new_pos: + chr_id.add(e_chr) + exon_strand.add(e_strand) + if len(chr_id) != 1 or len(exon_strand) != 1: + fail_flag = True + + if not fail_flag: + # build new bed + cds_start_offset = int(fields[6]) - int(fields[1]) + cds_end_offset = int(fields[2]) - int(fields[7]) + new_chrom = exons_new_pos[0][0] + new_chrom_st = exons_new_pos[0][1] + new_chrom_end = exons_new_pos[-1][2] + new_name = fields[3] + new_score = fields[4] + new_strand = exons_new_pos[0][3] + new_thickStart = new_chrom_st + cds_start_offset + new_thickEnd = new_chrom_end - cds_end_offset + new_ittemRgb = fields[8] + new_blockCount = len(exons_new_pos) + new_blockSizes = ','.join([str(o - n) for m,n,o,p in exons_new_pos]) + new_blockStarts = ','.join([str(n - new_chrom_st) for m,n,o,p in exons_new_pos]) + + new_bedline = '\t'.join(str(i) for i in (new_chrom,new_chrom_st,new_chrom_end,new_name,new_score,new_strand,new_thickStart,new_thickEnd,new_ittemRgb,new_blockCount,new_blockSizes,new_blockStarts)) + if check_bed12(new_bedline) is False: + fail_flag = True + else: + if outfile is None: + print(line + '\t->\t' + new_bedline) + else: + print(new_bedline, file=FILE_OUT) + + if fail_flag: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + +def crossmap_gff_file(mapping, ingff,outfile=None): + ''' + Convert genome coordinates (in GFF/GTF format) between assemblies. + GFF (General Feature Format) lines have nine required fields that must be Tab-separated: + + 1. seqname - The name of the sequence. Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types + are "CDS", "start_codon", "stop_codon", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 + for this annotation data set, the score value will determine the level of gray in + which this feature is displayed (higher numbers = darker gray). If there is no score + value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care). + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that + represents the reading frame of the first base. If the feature is not a coding exon, + the value should be '.'. + 9. group - All lines with the same group are linked together into a single item. + + GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3 + + GTF (Gene Transfer Format) is a refinement to GFF that tightens the specification. The + first eight GTF fields are the same as GFF. The group field has been expanded into a + list of attributes. Each attribute consists of a type/value pair. Attributes must end + in a semi-colon, and be separated from any following attribute by exactly one space. + + GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4 + + We do NOT check if features (exon, CDS, etc) belonging to the same gene originally were + converted into the same chromosome/strand. + ''' + + if outfile is not None: + FILE_OUT = open(outfile,'w') + UNMAP = open(outfile + '.unmap', 'w') + + for line in ireader.reader(ingff): + if line.startswith('#'):continue + if line.startswith('track'):continue + if line.startswith('browser'):continue + if line.startswith('visibility'):continue + if not line.strip():continue + + line=line.strip() + fields=line.split('\t') + # check GFF file + #if len(fields) != 9: + # print >>sys.stderr, 'GFF file must has 9 columns. Skip ' + line + # continue + try: + start = int(fields[3]) - 1 #0-based + end = int(fields[4])/1 + feature_size = end - start + except: + print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + if fields[6] not in ['+','-','.']: + print('Cannot recognize \"strand\". Skip ' + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + + strand = '-' if fields[6] == '-' else '+' + + chrom = fields[0] + a = map_coordinates(mapping, chrom,start,end,strand) + if a is None: + if outfile is None: + print(line + '\tfail (no match to target assembly)') + else: + print(line, file=UNMAP) + continue + if len(a) !=2: + if outfile is None: + print(line + '\tfail (multpile match to target assembly)') + else: + print(line, file=UNMAP) + else: + if (int(a[1][2]) - int(a[1][1])) != feature_size: # check if it is exact match + if outfile is None: + print(line + '\tfail (not exact match)') + else: + print(line, file=UNMAP) + fields[0] = a[1][0] # chrom + fields[3] = a[1][1] + 1 # start, 1-based + fields[4] = a[1][2] + fields[6] = a[1][3] + + if outfile is None: + print(line + '\t->\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + +def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3, addtag = True): + ''' + Convert genome coordinates (in BAM/SAM format) between assemblies. + BAM/SAM format: http://samtools.sourceforge.net/ + chrom_size is target chromosome size + + if addtag is set to True, will add tags to each alignmnet: + Q = QC (QC failed) + N = unmapped (originally unmapped or originally mapped but failed to liftover to new assembly) + M = multiple mapped (alignment can be liftover to multiple places) + U = unique mapped (alignment can be liftover to only 1 place) + + tags for pair-end sequencing include: + + QF: QC failed + + NN: both read1 and read2 unmapped + NU: read1 unmapped, read2 unique mapped + NM: read1 unmapped, multiple mapped + + UN: read1 uniquely mapped, read2 unmap + UU: both read1 and read2 uniquely mapped + UM: read1 uniquely mapped, read2 multiple mapped + + MN: read1 multiple mapped, read2 unmapped + MU: read1 multiple mapped, read2 unique mapped + MM: both read1 and read2 multiple mapped + + tags for single-end sequencing include: + + QF: QC failed + SN: unmaped + SM: multiple mapped + SU: uniquely mapped + + ''' + + # determine the input file format (BAM or SAM) + try: + samfile = pysam.Samfile(infile,'rb') + if len(samfile.header) == 0: + print("BAM file has no header section. Exit!", file=sys.stderr) + sys.exit(1) + bam_format = True + except: + samfile = pysam.Samfile(infile,'r') + if len(samfile.header) == 0: + print("SAM file has no header section. Exit!", file=sys.stderr) + sys.exit(1) + bam_format = False + + # add comments into BAM/SAM header section + if bam_format: + comments=['Liftover from original BAM file: ' + infile] + else: + comments=['Liftover from original SAM file: ' + infile] + comments.append('Liftover is based on the chain file: ' + chainfile) + + sam_ori_header = samfile.header + (new_header, name_to_id) = sam_header.bam_header_generator(orig_header = sam_ori_header, chrom_size = chrom_size, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=comments) + + # write to file + if outfile_prefix is not None: + if bam_format: + OUT_FILE = pysam.Samfile( outfile_prefix + '.bam', "wb", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.bam', "wb", template=samfile ) + printlog (["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam']) + else: + OUT_FILE = pysam.Samfile( outfile_prefix + '.sam', "wh", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.sam', "wh", template=samfile ) + printlog (["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam']) + # write to screen + else: + if bam_format: + OUT_FILE = pysam.Samfile( '-', "wb", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.bam','') + '.unmap.bam', "wb", template=samfile ) + printlog (["Liftover BAM file:", infile]) + else: + OUT_FILE = pysam.Samfile( '-', "w", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.sam','') + '.unmap.sam', "wh", template=samfile ) + printlog (["Liftover SAM file:", infile]) + + QF = 0 + NN = 0 + NU = 0 + NM = 0 + UN = 0 + UU = 0 + UM = 0 + MN = 0 + MU = 0 + MM = 0 + SN = 0 + SM = 0 + SU = 0 + total_item = 0 + try: + while(1): + total_item += 1 + new_alignment = pysam.AlignedRead() # create AlignedRead object + old_alignment = next(samfile) + + # qcfailed reads will be written to OUT_FILE_UNMAP for both SE and PE reads + if old_alignment.is_qcfail: + QF += 1 + if addtag: old_alignment.set_tag(tag="QF", value=0) + OUT_FILE.write(old_alignment) + continue + # new_alignment.qqual = old_alignment.qqual # quality string of read, exclude soft clipped part + # new_alignment.qlen = old_alignment.qlen # length of the "aligned part of read" + # new_alignment. aligned_pairs = old_alignment. aligned_pairs #read coordinates <=> ref coordinates + # new_alignment_aend = old_alignment.aend # reference End position of the aligned part (of read) + + #new_alignment.qname = old_alignment.qname # 1st column. read name. + #new_alignment.flag = old_alignment.flag # 2nd column. subject to change. flag value + #new_alignment.tid = old_alignment.tid # 3rd column. samfile.getrname(tid) == chrom name + #new_alignment.pos = old_alignment.pos # 4th column. reference Start position of the aligned part (of read) [0-based] + #new_alignment.mapq = old_alignment.mapq # 5th column. mapping quality + #new_alignment.cigar= old_alignment.cigar # 6th column. subject to change. + #new_alignment.rnext = old_alignment.rnext # 7th column. tid of the reference (mate read mapped to) + #new_alignment.pnext = old_alignment.pnext # 8th column. position of the reference (0 based, mate read mapped to) + #new_alignment.tlen = old_alignment.tlen # 9th column. insert size + #new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases. + #new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases. + #new_alignment.tags = old_alignment.tags # 12 - columns + + new_alignment.qname = old_alignment.qname # 1st column. read name. + new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases. + new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases. + new_alignment.tags = old_alignment.tags # 12 - columns + + + # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes + # Thanks Wolfgang Resch identified this bug and provided solution. + + try: + rg, rgt = old_alignment.get_tag("RG", with_value_type=True) + except KeyError: + pass + else: + new_alignment.set_tag("RG", str(rg), rgt) + + if old_alignment.is_paired: + new_alignment.flag = 0x0001 #pair-end + if old_alignment.is_read1: + new_alignment.flag = new_alignment.flag | 0x40 + elif old_alignment.is_read2: + new_alignment.flag = new_alignment.flag | 0x80 + + #================================== + # R1 is originally unmapped + #================================== + if old_alignment.is_unmapped: + #This line will set the unmapped read flag for the first read in pair (in case it is unmapped). + #Thanks Amir Keivan Mohtashami reporting this bug. + new_alignment.flag = new_alignment.flag | 0x4 + + #------------------------------------ + # both R1 and R2 unmapped + #------------------------------------ + if old_alignment.mate_is_unmapped: + NN += 1 + if addtag: old_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(old_alignment) + continue + else: # originally, read-1 unmapped, read-2 is mapped + try: + read2_chr = samfile.getrname(old_alignment.rnext) + read2_strand = '-' if old_alignment.mate_is_reverse else '+' + read2_start = old_alignment.pnext + read2_end = read2_start + 1 + read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand) + # [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )] + except: + read2_maps = None + + #------------------------------------ + # both R1 and R2 unmapped + #------------------------------------ + if read2_maps is None: + NN += 1 + if addtag: old_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(old_alignment) + continue + + #------------------------------------ + # R1 unmapped, R2 unique + #------------------------------------ + elif len(read2_maps) == 2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's + # 4 + new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NU += 1 + if addtag: new_alignment.set_tag(tag="NU", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 unmapped, R2 multiple + #------------------------------------ + else: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] + # 4 + new_alignment.pos = read2_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NM += 1 + if addtag: new_alignment.set_tag(tag="NM", value=0) + OUT_FILE.write(new_alignment) + continue + + #================================== + # R1 is originally mapped + #================================== + else: + try: + read1_chr = samfile.getrname(old_alignment.tid) + read1_strand = '-' if old_alignment.is_reverse else '+' + read1_start = old_alignment.pos + read1_end = old_alignment.aend + read1_maps = map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_strand) + except: + read1_maps = None + + if not old_alignment.mate_is_unmapped: + try: + read2_chr = samfile.getrname(old_alignment.rnext) + read2_strand = '-' if old_alignment.mate_is_reverse else '+' + read2_start = old_alignment.pnext + read2_end = read2_start + 1 + read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand) + # [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )] + except: + read2_maps = None + + + #------------------------------------ + # R1 unmapped (failed to liftover) + #------------------------------------ + if read1_maps is None: + # 2 update flag (0x4: segment unmapped) + new_alignment.flag = new_alignment.flag | 0x4 + # 3 + new_alignment.tid = -1 + # 4 + new_alignment.pos = 0 + # 5 + new_alignment.mapq = 255 + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + + # (1) read2 is unmapped before conversion + if old_alignment.mate_is_unmapped: + NN += 1 + if addtag: new_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) read2 is unmapped after conversion + elif read2_maps is None: + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + + NN += 1 + if addtag: new_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) read2 is unique mapped + elif len(read2_maps) == 2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's + # 4 + new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NU += 1 + if addtag: new_alignment.set_tag(tag="NU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (4) read2 is multiple mapped + else: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] + # 4 + new_alignment.pos = read2_maps[1][1] + # 5 + new_alignment.mapq = 255 # mapq not available + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NM += 1 + if addtag:new_alignment.set_tag(tag="NM", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 uniquely mapped + #------------------------------------ + elif len(read1_maps) == 2: + + if read1_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + + if read1_maps[0][3] != read1_maps[1][3]: # opposite strand + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + elif read1_maps[0][3] == read1_maps[1][3]: # same strand + # 6 + new_alignment.cigar = old_alignment.cigar + + # 3 + new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom + # 4 + new_alignment.pos = read1_maps[1][1] #start + # 5 + new_alignment.mapq = old_alignment.mapq + + # (1) R2 unmapped before or after conversion + if (old_alignment.mate_is_unmapped) or (read2_maps is None): + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + # 7 + new_alignment.rnext = name_to_id[read1_maps[1][0]] + # 8 + new_alignment.pnext = read1_maps[1][1] + # 9 + new_alignment.tlen = 0 + + UN += 1 + if addtag: new_alignment.set_tag(tag="UN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) R2 is unique mapped + elif len(read2_maps)==2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + # 2 + if (read2_maps[1][3] != read1_maps[1][3]) and (new_alignment.tlen <= IS_size + fold * IS_std) and (new_alignment.tlen >= IS_size - fold * IS_std): + new_alignment.flag = new_alignment.flag | 0x2 + + UU += 1 + if addtag: new_alignment.set_tag(tag="UU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) R2 is multiple mapped + else: + # 2 (strand) + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 (secondary alignment) + new_alignment.flag = new_alignment.flag | 0x100 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + UM += 1 + if addtag: new_alignment.set_tag(tag="UM", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 multiple mapped + #----------------------------------- + elif len(read1_maps) > 2 and len(read1_maps) % 2 ==0: + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 2 + if read1_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + + if read1_maps[0][3] != read1_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + elif read1_maps[0][3] == read1_maps[1][3]: + new_alignment.cigar = old_alignment.cigar + # 3 + new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom + # 4 + new_alignment.pos = read1_maps[1][1] #start + # 5 + new_alignment.mapq = 255 + + + # (1) R2 is unmapped + if (old_alignment.mate_is_unmapped) or (read2_maps is None): + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + # 7 + new_alignment.rnext = name_to_id[read1_maps[1][0]] + # 8 + new_alignment.pnext = read1_maps[1][1] + # 9 + new_alignment.tlen = 0 + + MN += 1 + if addtag: new_alignment.set_tag(tag="MN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) read2 is unique mapped + elif len(read2_maps)==2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + MU += 1 + if addtag: new_alignment.set_tag(tag="MU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) R2 is multiple mapped + else: + # 2 (strand) + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 (secondary alignment) + new_alignment.flag = new_alignment.flag | 0x100 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + MM += 1 + if addtag: new_alignment.set_tag(tag="MM", value=0) + OUT_FILE.write(new_alignment) + continue + else: + #old_alignment.tid = name_to_id[samfile.getrname(old_alignment.tid)] + OUT_FILE_UNMAP.write(old_alignment) + failed += 1 + + # Singel end sequencing + else: + # (1) originally unmapped + if old_alignment.is_unmapped: + SN += 1 + if addtag: old_alignment.set_tag(tag="SN",value=0) + OUT_FILE.write(old_alignment) + continue + else: + new_alignment.flag = 0x0 + read_chr = samfile.getrname(old_alignment.tid) + read_strand = '-' if old_alignment.is_reverse else '+' + read_start = old_alignment.pos + read_end = old_alignment.aend + read_maps = map_coordinates(mapping, read_chr, read_start, read_end, read_strand) + + # (2) unmapped afte liftover + if read_maps is None: + # 1 + new_alignment.qname = old_alignment.qname + # 2 + new_alignment.flag = new_alignment.flag | 0x4 + # 3 + new_alignment.tid = -1 + # 4 + new_alignment.pos = 0 + # 5 + new_alignment.mapq = 255 + # 6 + new_alignment.cigar= old_alignment.cigar + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + # 12 + new_alignment.tags = old_alignment.tags + + SN += 1 + if addtag: new_alignment.set_tag(tag="SN",value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) unique mapped + if len(read_maps)==2: + # 1 + new_alignment.qname = old_alignment.qname + # 2 + if read_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + if read_maps[0][3] != read_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + else: + # 6 + new_alignment.cigar = old_alignment.cigar + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + + # 3 + new_alignment.tid = name_to_id[read_maps[1][0]] + # 4 + new_alignment.pos = read_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + SU += 1 + if addtag: new_alignment.set_tag(tag="SU",value=0) + OUT_FILE.write(new_alignment) + continue + + # (4) multiple mapped + if len(read_maps) > 2 and len(read_maps) % 2 ==0: + + # 1 + new_alignment.qname = old_alignment.qname + new_alignment.flag = new_alignment.flag | 0x100 + # 2 + if read_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + if read_maps[0][3] != read_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + else: + # 6 + new_alignment.cigar = old_alignment.cigar + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + + # 3 + new_alignment.tid = name_to_id[read_maps[1][0]] + # 4 + new_alignment.pos = read_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + SM += 1 + if addtag: new_alignment.set_tag(tag="SM",value=0) + OUT_FILE.write(new_alignment) + continue + except StopIteration: + printlog(["Done!"]) + OUT_FILE.close() + + if outfile_prefix is not None: + if bam_format: + try: + printlog (['Sort "%s" and save as "%s"' % (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')]) + pysam.sort("-o", outfile_prefix + '.sorted.bam', outfile_prefix + '.bam') + except: + printlog(["Warning: ","output BAM file was NOT sorted"]) + try: + printlog (['Index "%s" ...' % (outfile_prefix + '.sorted.bam')]) + pysam.index(outfile_prefix + '.sorted.bam',outfile_prefix + '.sorted.bam.bai') + except: + printlog(["Warning: ","output BAM file was NOT indexed."]) + + print("Total alignments:" + str(total_item-1)) + print("\tQC failed: " + str(QF)) + if max(NN,NU, NM, UN, UU, UM, MN, MU, MM) > 0: + + print("\tR1 unique, R2 unique (UU): " + str(UU)) + print("\tR1 unique, R2 unmapp (UN): " + str(UN)) + print("\tR1 unique, R2 multiple (UM): " + str(UM)) + + print("\tR1 multiple, R2 multiple (MM): " + str(MM)) + print("\tR1 multiple, R2 unique (MU): " + str(MU)) + print("\tR1 multiple, R2 unmapped (MN): " + str(MN)) + + print("\tR1 unmap, R2 unmap (NN): " + str(NN)) + print("\tR1 unmap, R2 unique (NU): " + str(NU)) + print("\tR1 unmap, R2 multiple (NM): " + str(NM)) + if max(SN,SU,SM) > 0: + print("\tUniquley mapped (SU): " + str(SU)) + print("\tMultiple mapped (SM): " + str(SM)) + print("\tUnmapped (SN): " + str(SN)) + + +def crossmap_wig_file(mapping, in_file, out_prefix, taget_chrom_size, in_format, binSize=100000): + ''' + Convert genome coordinates (in wiggle/bigwig format) between assemblies. + wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html + bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html + ''' + + OUT_FILE1 = open(out_prefix + '.bgr','w') # original bgr file + OUT_FILE2 = open(out_prefix + '.sorted.bgr','w') # sorted bgr file + OUT_FILE3 = pyBigWig.open(out_prefix + '.bw', "w") # bigwig file + + if in_format.upper() == "WIGGLE": + printlog (["Liftover wiggle file:", in_file, '==>', out_prefix + '.bgr']) + + for chr, start, end, strand, score in wiggleReader (in_file): + #print chr,start,end,score + # map_coordinates(mapping, q_chr, q_start, q_end, q_strand = '+', print_match=False): + maps = map_coordinates(mapping, chr, start, end, '+') + if maps is None: + continue + if len(maps) == 2: + print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1) + else: + continue + maps[:]=[] + OUT_FILE1.close() + + printlog (["Merging overlapped entries in bedGraph file ..."]) + for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'): + print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2) + OUT_FILE2.close() + + os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr + + # add header to bigwig file + printlog (["Writing header to \"%s\" ..." % (out_prefix + '.bw') ]) + target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())] + OUT_FILE3.addHeader(target_chroms_sorted) + + printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ]) + # add entries to bigwig file + for line in ireader.reader(out_prefix + '.sorted.bgr'): + r_chr,r_st,r_end,r_value = line.split() + OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)]) + + OUT_FILE3.close() + + elif in_format.upper() == "BIGWIG": + + printlog (["Liftover bigwig file:", in_file, '==>', out_prefix + '.bgr']) + for chr, start, end, score in bigwigReader(in_file, bin_size = binSize ): + maps = map_coordinates(mapping, chr, start, end, '+') + try: + if maps is None: continue + if len(maps) == 2: + print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1) + else: + continue + except: + continue + maps[:]=[] + OUT_FILE1.close() + + printlog (["Merging overlapped entries in bedGraph file ..."]) + for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'): + print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2) + OUT_FILE2.close() + os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr + + # add header to bigwig file + printlog (["Writing header to \"%s\" ..." % (out_prefix + '.bw') ]) + target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())] + OUT_FILE3.addHeader(target_chroms_sorted) + + # add entries to bigwig file + printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ]) + for line in ireader.reader(out_prefix + '.sorted.bgr'): + r_chr,r_st,r_end,r_value = line.split() + OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)]) + + OUT_FILE3.close() + else: + raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'") + + +def general_help(): + desc="""CrossMap is a program for convenient conversion of genome coordinates and genome \ +annotation files between assemblies (eg. lift from human hg18 to hg19 or vice versa). \ +It supports file in BAM, SAM, BED, Wiggle, BigWig, GFF, GTF and VCF format.""" + + print("Program: %s (v%s)" % ("CrossMap", __version__), file=sys.stderr) + print("\nDescription: \n%s" % '\n'.join(' '+i for i in wrap(desc,width=80)), file=sys.stderr) + print("\nUsage: CrossMap.py [options]\n", file=sys.stderr) + for k in sorted(commands): + print(' ' + k + '\t' + commands[k], file=sys.stderr) + print(file=sys.stderr) + +def bed_help(): + msg =[ + ('Usage:', "CrossMap.py bed input_chain_file input_bed_file [output_file]"), + ('Description:', "\"input_chain_file\" and \"input_bed_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. BED format file must have at least 3 columns (chrom, start, end) and no more than 12 columns. If no \"output_file\" is specified, output will be directed to the screen (console). BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1"), + ('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed test.hg19.bed # write output to \"test.hg19.bed\""), + ('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed # write output to screen"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def gff_help(): + msg =[ + ('Usage:', "CrossMap.py gff input_chain_file input_gff_file output_file"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Input file must be in GFF or GTF format. GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3 GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4"), + ('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf test.hg18.gtf # write output to \"test.hg18.gff\""), + ('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf # write output to screen"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def wig_help(): + msg =[ + ('Usage:', "CrossMap.py wig input_chain_file input_wig_file output_prefix"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Both \"variableStep\" and \"fixedStep\" wiggle lines are supported. Wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html"), + ('Example:', "CrossMapy.py wig hg18ToHg19.over.chain.gz test.hg18.wig test.hg19"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) +def bigwig_help(): + msg =[ + ('Usage:', "CrossMap.py bigwig input_chain_file input__bigwig_file output_prefix"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html"), + ('Example:', "CrossMapy.py bigwig hg18ToHg19.over.chain.gz test.hg18.bw test.hg19"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def bam_help(): + usage="CrossMap.py bam -a input_chain_file input_bam_file output_file [options] " + import optparse + parser.print_help() + +def vcf_help(): + msg =[ + ("usage:","CrossMap.py vcf input_chain_file input_VCF_file ref_genome_file output_file"), + ("Description:", "\"input_chain_file\" and \"input_VCF_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. \"ref_genome_file\" is genome sequence file of 'target assembly' in FASTA format."), + ("Example:", " CrossMap.py vcf hg19ToHg18.over.chain.gz test.hg19.vcf hg18.fa test.hg18.vcf"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + + +if __name__=='__main__': + + # test read_chain_file() + #(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(sys.argv[1], print_table=True) + + #q_chr, q_start, q_end, q_strand = '+', print_match=False): + #map_coordinates(mapTree, 'chr1', 45953452, 45953456, '+', print_match=True) + + + #a= sam_header.bam_header_generator(chrom_size = targetChromSizes, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=['a','b','c']) + #crossmap_bed_file(mapTree, 'test.hg18.bed3',outfile='aa') + #crossmap_bed_file(mapTree, 'test.bed') + #crossmap_gff_file(mapTree,'test.hg18.gtf',outfile = 'aa') + + + #crossmap_bam_file(mapTree, targetChromSizes, sys.argv[1], 'eg.abnormal.sam','out') + #printlog (['Sort output bam file ...']) + #pysam.sort('-m','1000000000','out.bam','out.sorted') + #printlog (['Index bam file ...']) + #pysam.index('out.sorted.bam','out.sorted.bam.bai') + + #crossmap_wig_file(mapTree, 'GSM686950_reverse.hg18.bw', 'GSM686950_reverse.hg19', sourceChromSizes, targetChromSizes, in_format = 'bigwig') + + #parser = optparse.OptionParser() + #parser.add_option("-i","--input-sam",action="store",type="string",dest="input_sam_file", help="BAM/SAM format file.") + #parser.add_option("-o","--out-file",action="store",type="string",dest="output_sam_file", help="Prefix of output files. Foramt is determined from input: SAM <=> SAM, BAM <=> BAM") + #parser.add_option("-m","--insert-mean",action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). Used to tell wheather a mapped pair is \"proper pair\" or not. [default=%default]") + #parser.add_option("-s","--insert-stdev",action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" ) + #parser.add_option("-t","--times-of-stdev",action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. [default=%default]") + #(options,args)=parser.parse_args() + + #print options + #print args + + + commands = { + 'bed':'convert genome cooridnate or annotation file in BED, bedGraph or other BED-like format.', + 'bam':'convert alignment file in BAM or SAM format.', + 'gff':'convert genome cooridnate or annotation file in GFF or GTF format.', + 'wig':'convert genome coordinate file in Wiggle, or bedGraph format.', + 'bigwig':'convert genome coordinate file in BigWig format.', + 'vcf':'convert genome coordinate file in VCF format.' + } + kwds = list(commands.keys()) + + + if len(sys.argv) == 1: + general_help() + sys.exit(0) + elif len(sys.argv) >=2: + # deal with bed input + if sys.argv[1].lower() == 'bed': + if len(sys.argv) == 4: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = None + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file, print_table = False) + crossmap_bed_file(mapTree, in_file, out_file) + + elif len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_bed_file(mapTree, in_file, out_file) + else: + bed_help() + sys.exit(0) + elif sys.argv[1].lower() == 'gff': + if len(sys.argv) == 4: + chain_file = sys.argv[2] + in_file = sys.argv[3] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_gff_file(mapTree, in_file, None) + elif len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_gff_file(mapTree, in_file, out_file) + else: + gff_help() + sys.exit(0) + elif sys.argv[1].lower() == 'wig': + if len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'wiggle') + else: + wig_help() + sys.exit(0) + elif sys.argv[1].lower() == 'bigwig': + if len(sys.argv) == 5: + chain_file = sys.argv[2] + + in_file = sys.argv[3] + try: + bw = pyBigWig.open(in_file) + except: + print ("\nPlease check if \"%s\" is in bigWig format!\n" % in_file, file=sys.stderr) + sys.exit(0) + + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'bigwig') + else: + bigwig_help() + sys.exit(0) + elif sys.argv[1].lower() == 'bam': + #def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3): + usage="CrossMap.py bam input_chain_file input_bam_file output_file [options]\nNote: If output_file == STDOUT or -, CrossMap will write BAM file to the screen" + parser = optparse.OptionParser(usage, add_help_option=False) + parser.add_option("-m", "--mean", action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). [default=%default]") + parser.add_option("-s", "--stdev", action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" ) + parser.add_option("-t", "--times", action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. [default=%default]") + parser.add_option("-a","--append-tags",action="store_true",dest="add_tags",help="Add tag to each alignment.") + (options,args)=parser.parse_args() + + if len(args) >= 3: + print("Insert size = %f" % (options.insert_size), file=sys.stderr) + print("Insert size stdev = %f" % (options.insert_size_stdev), file=sys.stderr) + print("Number of stdev from the mean = %f" % (options.insert_size_fold), file=sys.stderr) + if options.add_tags: + print("Add tags to each alignment = %s" % ( options.add_tags), file=sys.stderr) + else: + print("Add tags to each alignment = %s" % ( False), file=sys.stderr) + chain_file = args[1] + in_file = args[2] + out_file = args[3] if len(args) >= 4 else None + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + + if out_file in ["STDOUT","-"]: + out_file = None + if options.add_tags: + crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=True) + else: + crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=False) + else: + parser.print_help() + elif sys.argv[1].lower() == 'vcf': + if len(sys.argv) == 6: + chain_file = sys.argv[2] + in_file = sys.argv[3] + genome_file = sys.argv[4] + out_file = sys.argv[5] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_vcf_file(mapping = mapTree, infile= in_file, outfile = out_file, liftoverfile = sys.argv[2], refgenome = genome_file) + else: + vcf_help() + sys.exit(0) + else: + general_help() + sys.exit(0) + + + + + diff --git a/bin/Modules/CrossMapper.py b/bin/Modules/CrossMapper.py new file mode 100644 index 0000000..74ae97e --- /dev/null +++ b/bin/Modules/CrossMapper.py @@ -0,0 +1,40 @@ +import urllib.request +import os +from Modules import CrossMap + + +class CrossMapper: + def __init__(self, org, wd, out, is_dir): + self.org = org + if is_dir: + self.infile = os.path.join( wd + "/temp/" + org + ".gtf") + else: + self.infile = os.path.join(wd+"/data/temp/"+org+".gtf") + self.outfile = os.path.join(out+"/" + org + "_mapped.gtf") + self.chainfile = self.get_chain_file(org, wd, is_dir) + (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile) + CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile) + + def get_chain_file(self, org, wd, isdir): + + # Defines the Chain files for different conversions. + + if org == "hg19": + if isdir: + file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz") + else: + file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) + urllib.request.urlretrieve("http://hgdownload.soe.ucsc.edu/goldenPath/hg38/" + "liftOver/hg38ToHg19.over.chain.gz", + file_link) + return file_link + + elif org == "mm9": + if isdir: + file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz") + else: + file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) + urllib.request.urlretrieve("http://hgdownload.soe.ucsc.edu/goldenPath/mm10/" + "liftOver/mm10ToMm9.over.chain.gz", + file_link) + return file_link diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index e13aa6a..5425bd3 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -4,14 +4,19 @@ class ResultSaver: - def __init__(self, results, organism, tissue): + def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out): print("Save results to File !") self.path = "" - if tissue: - self.path = os.path.join("./"+organism+"_filtered.gtf") + if mapped: + if is_data_dir: + self.path = os.path.join(wd + "temp/" + organism + ".gtf") + else: + self.path = os.path.join( wd + "data/temp/" + organism + ".gtf" ) + elif tissue: + self.path = os.path.join(out+"/"+organism+"_filtered.gtf") else: - self.path = os.path.join("./"+organism+".gtf") + self.path = os.path.join(out+"/"+organism+".gtf") with open(self.path, "w") as file: write_it = csv.writer(file, delimiter='\t') diff --git a/bin/Modules/ucsc/bigBedToBed b/bin/Modules/ucsc/bigBedToBed old mode 100644 new mode 100755 diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py index 6f0e11c..131b853 100755 --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -1,23 +1,37 @@ -#!/usr/bin/env python3 - import argparse from Modules.Ensembl.Ensembl import Ensembl from Modules.ucsc.ucsc import UcscGtf from Modules.Uniquifier import UniqueFilter from Modules.SaveResults import ResultSaver +from Modules.CrossMapper import CrossMapper import os import json def check_for_local_folder(wd): - if not os.path.isdir(os.path.join(wd+"/EnsemblData")): + if not os.path.isdir(os.path.join(wd+"/data/EnsemblData")): os.mkdir(os.path.join(wd+"/data/EnsemblData")) if not os.path.isdir(os.path.join(wd+"/data/UCSCData")): os.mkdir(os.path.join(wd+"/data/UCSCData")) + if not os.path.isdir(os.path.join(wd+"/data/temp")): + os.mkdir(os.path.join(wd+"/data/temp")) + + +def check_for_data_dir(data_dir): + + if not os.path.isdir(os.path.join(data_dir+"/EnsemblData")): + os.mkdir(os.path.join(data_dir+"/EnsemblData")) + + if not os.path.isdir(os.path.join(data_dir+"/UCSCData")): + os.mkdir(os.path.join(data_dir + "/UCSCData")) + + if not os.path.isdir(os.path.join(data_dir+"/temp")): + os.mkdir(os.path.join(data_dir+"/temp")) + def check_filter(tissue_cmd, org, wd): path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json") @@ -36,9 +50,29 @@ def check_filter(tissue_cmd, org, wd): return False -def main_script(org, wd, data_dir, tissuetype=None): +def cross_map(data_dir, org): + pass + + +def check_organism(org): + if org == "hg38": + return "homo_sapiens", False + if org == "hg19": + print("Older assembly Version detected: hg19 -> Crossmapping result from hg38") + return "homo_sapiens", True + elif org == "mm10": + return "mus_musculus", False + elif org == "mm9": + print("Older assembly Version detected: mm9 -> Crossmapping result from mm10") + return "mus_musculus", True + + +def main_script(organism, wd, data_dir, out, tissuetype=None): + (org, x_mappable) = check_organism(organism) if not data_dir: check_for_local_folder(wd) + else: + check_for_data_dir(data_dir) if check_filter(tissuetype, org, wd): tissues = tissuetype print("Filter detected !") @@ -50,19 +84,30 @@ def main_script(org, wd, data_dir, tissuetype=None): ense = Ensembl(org, wd, data_dir) print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) - ResultSaver(unique_filter.get_results(), org, tissues) + if data_dir: + ResultSaver(unique_filter.get_results(), organism, data_dir, x_mappable, True, tissues, out) + if x_mappable: + CrossMapper(organism, data_dir, out, True) + else: + ResultSaver(unique_filter.get_results(), organism, wd, x_mappable, False, tissues, out) + if x_mappable: + CrossMapper(organism, wd, out, False) if __name__ == '__main__': parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) - parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) + parser.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str) parser.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str) + parser.add_argument('--out', help='Output directory: default: "."', action='store', default=".", type=str) args = vars(parser.parse_args()) if args["organism"]: - print("Working Dir: " + args["wd"]) - main_script(args["organism"], args["wd"], args["dir"], args["tissue"]) + if args["organism"] in ["hg19", "hg38", "mm9", "mm10"]: + print("Working Dir: " + args["wd"]) + main_script(args["organism"], args["wd"], args["dir"], args["out"], args["tissue"]) + else: + print("Invalid Organism: " + args["organism"] + " see -h for help") else: print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.") diff --git a/masterenv.yml b/masterenv.yml index a977fe1..e0708db 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -24,4 +24,5 @@ dependencies: - biopython - pybedtools - matplotlib - - seaborn \ No newline at end of file + - seaborn + - crossmap \ No newline at end of file