diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 9f1b02e..0000000 --- a/.gitignore +++ /dev/null @@ -1,171 +0,0 @@ -# Created by .ignore support plugin (hsz.mobi) -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -/.idea/ -/EnsemblData/release-94/ -/UCSCData/hg38.bed -/UCSCData/mm10.bed -/results/homo_sapiens_filtered.gtf diff --git a/README.md b/README.md index 4cf5660..1aa01e6 100644 --- a/README.md +++ b/README.md @@ -2,27 +2,109 @@ De novo motif discovery and evaluation based on footprints identified by TOBIAS +For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) + ## Dependencies * [conda](https://conda.io/docs/user-guide/install/linux.html) * [Nextflow](https://www.nextflow.io/) * [MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web) ## Installation -Start with installing all dependencies listed above. 
-Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
-The Nextflow-script needs a conda enviroment to run. To create this enviroment you need the yml-file from the repository.
+Start with installing all dependencies listed above. It is required to set the [environment paths for the MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar).
+This can be done with the following commands:
+```
+export PATH=[meme-suite installation path]/libexec/meme-[meme-suite version]:$PATH
+export PATH=[meme-suite installation path]/bin:$PATH
+```
+
+
+Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018).
+The Nextflow script needs a conda environment to run. Nextflow can create the needed environment from the given yaml-file.
+On some systems Nextflow exits the run with the following error:
+```
+Caused by:
+  Failed to create Conda environment
+  command: conda env create --prefix --file env.yml
+  status : 143
+  message:
+```
+If this error occurs, you have to create the environment before starting the pipeline.
+To create this environment you need the yml-file from the repository.
 Run the following commands to create the enviroment:
 ```console
 path=[Path to given masterenv.yml file]
 conda env create --name masterenv -f=$path
-source activate masterenv
 ```
+When the environment has been created, set the variable 'path_env' in the configuration file to its path.
+
+**Important Note:** For conda, the channel bioconda needs to be set as the highest priority! This is required because two different channels provide a package with the same name: the pipeline needs the jellyfish package from the channel bioconda and **NOT** the jellyfish package from the channel conda-forge!
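+One way to give bioconda the highest priority (assuming a default conda setup; `conda config --add` prepends a channel, so the channel added last ends up on top):
+```
+conda config --add channels defaults
+conda config --add channels conda-forge
+conda config --add channels bioconda
+```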
+
 ## Quick Start
 ```console
-nextflow run pipeline.nf --input [INPUT-file] --bed [INPUT-bed] --genome_fasta [path to file] --jaspar_db [path to motif database as meme-file] --config uropa.config
+nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
+```
+## Parameters
+For a detailed overview of all parameters follow this [link](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki/Configuration).
 ```
+Required arguments:
+    --bigwig                  Path to BigWig-file
+    --bed                     Path to BED-file
+    --genome_fasta            Path to genome in FASTA-format
+    --motif_db                Path to motif-database in MEME-format
+    --config                  Path to UROPA configuration file
+    --create_known_tfbs_path  Path to directory where output from tfbsscan (known motifs) is stored.
+                              The path can be set as tfbs_path in the next run. (Default: './')
+    --out                     Output directory (Default: './out/')
+
+Optional arguments:
+
+    --help [0|1]              Set to 1 to show this help message. (Default: 0)
+    --tfbs_path               Path to directory with output from tfbsscan. If given, tfbsscan will not be run.
+
+    Footprint extraction:
+    --window_length INT       Length of the sliding window. (Default: 200)
+    --step INT                Number of positions to slide the window forward. (Default: 100)
+    --percentage INT          Threshold in percent. (Default: 0)
+
+    Filter unknown motifs:
+    --min_size_fp INT         Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
+    --max_size_fp INT         Maximum sequence length threshold. Discards all sequences longer than this value.
+                              (Default: 100)
+
+    Clustering:
+    Sequence preparation/reduction:
+    --kmer INT                K-mer length. (Default: 10)
+    --aprox_motif_len INT     Approximate motif length. (Default: 10)
+    --motif_occurence FLOAT   Percentage of motifs over all sequences. Use 1 (default) to assume every sequence contains a motif.
+    --min_seq_length INT      Remove all sequences shorter than this value. (Default: 10)
+
+    Clustering:
+    --global INT              Global (=1) or local (=0) alignment. (Default: 0)
+    --identity FLOAT          Identity threshold. (Default: 0.8)
+    --sequence_coverage INT   Minimum number of aligned nucleotides on both sequences. (Default: 8)
+    --memory INT              Memory limit in MB. 0 for unlimited. (Default: 800)
+    --throw_away_seq INT      Remove all sequences of this length or shorter before clustering. (Default: 9)
+    --strand INT              Align +/+ and +/- (=1), or only +/+ (=0). (Default: 0)
+
+    Motif estimation:
+    --min_seq INT             Minimum number of sequences required in the FASTA-files given to GLAM2. (Default: 100)
+    --motif_min_key INT       Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
+    --motif_max_key INT       Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
+    --iteration INT           Number of iterations done by GLAM2. More iterations: better results, higher runtime. (Default: 10000)
+    --tomtom_treshold FLOAT   Threshold for the similarity score. (Default: 0.01)
+    --best_motif INT          Get the best X motifs per cluster. (Default: 3)
+
+    Motif clustering:
+    --cluster_motif Boolean   If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
+    --edge_weight INT         Minimum weight of edges in the motif cluster graph. (Default: 5)
+    --motif_similarity_thresh FLOAT   Threshold for the motif similarity score. (Default: 0.00001)
+
+    Creating GTF:
+    --organism [hg38 | hg19 | mm9 | mm10]   Input organism
+    --tissues List/String     List of one or more keywords for tissue-/category-activity; categories must be specified as in the JSON config
+All arguments can be set in the configuration files.
+```
+
+
+
+For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki)
diff --git a/bin/Modules/CrossMap.py b/bin/Modules/CrossMap.py
new file mode 100644
index 0000000..97cfe0b
--- /dev/null
+++ b/bin/Modules/CrossMap.py
@@ -0,0 +1,1759 @@
+#!/usr/bin/env python
+'''
+---------------------------------------------------------------------------------------
+CrossMap: lift over genomic coordinates between genome assemblies.
+Supports BED/BedGraph, GFF/GTF, BAM/SAM, BigWig/Wig, VCF format files.
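+
+Typical programmatic use of this module (a minimal usage sketch based on the functions
+defined below; the chain file and BED file names are placeholders, not files shipped
+with this pipeline):
+
+    mapping, target_chrom_sizes, source_chrom_sizes = read_chain_file("hg38ToHg19.over.chain.gz")
+    crossmap_bed_file(mapping, "input.hg38.bed", outfile="output.hg19.bed")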
+------------------------------------------------------------------------------------------ +''' + +import os,sys +import optparse +import collections +import subprocess +import string +from textwrap import wrap +from time import strftime + +import pyBigWig +import pysam +from bx.intervals import * +import numpy as np +import datetime +from cmmodule import ireader +from cmmodule import BED +from cmmodule import annoGene +from cmmodule import sam_header +from cmmodule import myutils +from cmmodule import bgrMerge + +__author__ = "Liguo Wang, Hao Zhao" +__contributor__="Liguo Wang, Hao Zhao" +__copyright__ = "Copyleft" +__credits__ = [] +__license__ = "GPLv2" +__version__="0.3.1" +__maintainer__ = "Liguo Wang" +__email__ = "wangliguo78@gmail.com" +__status__ = "Production" + +def printlog (mesg_lst): + ''' + print progress into stderr + ''' + if len(mesg_lst)==1: + msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg_lst[0] + else: + msg = "@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + ' '.join(mesg_lst) + print(msg, file=sys.stderr) + +def parse_header( line ): + return dict( [ field.split( '=' ) for field in line.split()[1:] ] ) + +def revcomp_DNA(dna): + ''' + reverse complement of input DNA sequence. + ''' + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A','N':'N','X':'X'} + seq = dna.upper() + return ''.join(complement[base] for base in reversed(seq)) + +def wiggleReader( f ): + ''' + Iterator yielding chrom, start, end, strand, value. + Values are zero-based, half-open. + Regions which lack scores are ignored. + ''' + current_chrom = None + current_pos = None + current_step = None + + # always for wiggle data + strand = '+' + + mode = "bed" + for line in ireader.reader(f): + if line.isspace() or line.startswith( "track" ) or line.startswith( "#" ) or line.startswith( "browser" ): + continue + elif line.startswith( "variableStep" ): + header = parse_header( line ) + current_chrom = header['chrom'] + current_pos = None + current_step = None + if 'span' in header: current_span = int( header['span'] ) + else: current_span = 1 + mode = "variableStep" + elif line.startswith( "fixedStep" ): + header = parse_header( line ) + current_chrom = header['chrom'] + current_pos = int( header['start'] ) - 1 + current_step = int( header['step'] ) + if 'span' in header: current_span = int( header['span'] ) + else: current_span = 1 + mode = "fixedStep" + elif mode == "bed": + fields = line.split() + if len( fields ) > 3: + if len( fields ) > 5: + yield fields[0], int( fields[1] ), int( fields[2] ), fields[5], float( fields[3] ) + else: + yield fields[0], int( fields[1] ), int( fields[2] ), strand, float( fields[3] ) + elif mode == "variableStep": + fields = line.split() + pos = int( fields[0] ) - 1 + yield current_chrom, pos, pos + current_span, strand, float( fields[1] ) + elif mode == "fixedStep": + yield current_chrom, current_pos, current_pos + current_span, strand, float( line.split()[0] ) + current_pos += current_step + else: + raise "Unexpected input line: %s" % line.strip() + +def bigwigReader(infile, bin_size = 2000): + ''' + infile: bigwig format file + chromsize: chrom_name: size. 
+ return: chrom, position (0-based), value + ''' + + bw = pyBigWig.open(infile) + chrom_sizes = bw.chroms() + #print (chrom_sizes) + + for chr_name, chr_size in list(chrom_sizes.items()): + for chrom, st, end in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = bin_size): + try: + a = bw.stats(chrom,st,end) + except: + continue + + if bw.stats(chrom,st,end)[0] is None: + continue + sig_list = bw.values(chrom,st,end) + sig_list = np.nan_to_num(sig_list) #change 'nan' into '0.' + low_bound = st + point_num = 1 + score = sig_list[0] + for value in (sig_list[1:]): + if value == score: + point_num += 1 + else: + yield(( chrom, low_bound, low_bound + point_num, score)) + score = value + low_bound = low_bound + point_num + point_num = 1 + + +def check_bed12(bedline): + ''' + check if bed12 format is correct or not + ''' + fields = bedline.strip().split() + if len(fields) !=12: + return False + if fields[5] not in ['+','-','.']: + return False + try: + chromStart = int(fields[1]) + chromEnd = int(fields[2]) + thickStart = int(fields[6]) + thickEnd = int(fields[7]) + blockCount = int(fields[9]) + blockSizes = [int(i) for i in fields[10].rstrip(',').split(',')] + blockStarts = [int(i) for i in fields[11].rstrip(',').split(',')] + except: + return False + if chromStart > chromEnd or thickStart > thickEnd: + return False + if thickStart < chromStart or thickEnd > chromEnd: + return False + if len(blockSizes) != blockCount: + return False + if len(blockStarts) != blockCount: + return False + if blockCount <1: + return False + for i in blockSizes: + if i < 0: return False + for i in blockStarts: + if i < 0: return False + return True + +def intersectBed(xxx_todo_changeme, xxx_todo_changeme1): + ''' + return intersection of two bed regions + ''' + (chr1, st1, end1) = xxx_todo_changeme + (chr2, st2, end2) = xxx_todo_changeme1 + if int(st1) > int(end1) or int(st2) > int(end2): + raise Exception ("Start cannot be larger than end") + if chr1 != chr2: + return None + if int(st1) > int(end2) or int(end1) < int(st2): + return None + return (chr1, max(st1, st2), min(end1,end2)) + +def read_chain_file (chain_file, print_table=False): + ''' + input chain_file could be either plain text, compressed file (".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"), + or a URL pointing to the chain file ("http://", "https://", "ftp://"). If url was used, chain file must be plain text + ''' + + printlog(["Read chain_file: ", chain_file]), + maps={} + target_chromSize={} + source_chromSize={} + if print_table: + blocks=[] + + for line in ireader.reader(chain_file): + # Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1 + if not line.strip(): + continue + line=line.strip() + if line.startswith(('#',' ')):continue + fields = line.split() + + if fields[0] == 'chain' and len(fields) in [12, 13]: + score = int(fields[1]) # Alignment score + source_name = fields[2] # E.g. chrY + source_size = int(fields[3]) # Full length of the chromosome + source_strand = fields[4] # Must be + + if source_strand != '+': + raise Exception("Source strand in an .over.chain file must be +. (%s)" % line) + source_start = int(fields[5]) # Start of source region + source_end = int(fields[6]) # End of source region + + target_name = fields[7] # E.g. 
chr5 + target_size = int(fields[8]) # Full length of the chromosome + target_strand = fields[9] # + or - + target_start = int(fields[10]) + target_end = int(fields[11]) + target_chromSize[target_name]= target_size + source_chromSize[source_name] = source_size + + if target_strand not in ['+', '-']: + raise Exception("Target strand must be - or +. (%s)" % line) + #if target_strand == '+': + # target_start = int(fields[10]) + # target_end = int(fields[11]) + #if target_strand == '-': + # target_start = target_size - target_end + # target_end = target_size - target_start + chain_id = None if len(fields) == 12 else fields[12] + if source_name not in maps: + maps[source_name] = Intersecter() + + sfrom, tfrom = source_start, target_start + + # Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to) + elif fields[0] != 'chain' and len(fields) == 3: + size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2]) + if print_table: + if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand)) + elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand)) + + if target_strand == '+': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand))) + elif target_strand == '-': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand))) + + sfrom += size + sgap + tfrom += size + tgap + + elif fields[0] != 'chain' and len(fields) == 1: + size = int(fields[0]) + if print_table: + if target_strand == '+': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, tfrom, tfrom+size, target_strand)) + elif target_strand == '-': blocks.append((source_name,sfrom, sfrom+size, source_strand, target_name, target_size - (tfrom+size), target_size - tfrom, target_strand)) + + if target_strand == '+': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,tfrom, tfrom+size,target_strand))) + elif target_strand == '-': + maps[source_name].add_interval( Interval(sfrom, sfrom+size,(target_name,target_size - (tfrom+size), target_size - tfrom, target_strand))) + else: + raise Exception("Invalid chain format. (%s)" % line) + if (sfrom + size) != source_end or (tfrom + size) != target_end: + raise Exception("Alignment blocks do not match specified block sizes. 
(%s)" % header) + + if print_table: + for i in blocks: + print('\t'.join([str(n) for n in i])) + + return (maps,target_chromSize, source_chromSize) + + +def map_coordinates(mapping, q_chr, q_start, q_end, q_strand='+', print_match=False): + ''' + Map coordinates from source assembly to target assembly + ''' + + matches=[] + complement ={'+':'-','-':'+'} + if q_chr not in mapping: + return None + else: + targets = mapping[q_chr].find(q_start, q_end) + if len(targets)==0: + return None + elif len(targets)==1: + s_start = targets[0].start + s_end = targets[0].end + t_chrom = targets[0].value[0] + t_start = targets[0].value[1] + t_end = targets[0].value[2] + t_strand = targets[0].value[3] + + (chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end)) + l_offset = abs(real_start - s_start) + #r_offset = real_end - s_end + size = abs(real_end - real_start) + + matches.append( (chr, real_start, real_end,q_strand)) + if t_strand == '+': + i_start = t_start + l_offset + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + elif t_strand == '-': + i_start = t_end - l_offset - size + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + else: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand) + + elif len(targets) > 1: + for t in targets: + s_start = t.start + s_end = t.end + t_chrom = t.value[0] + t_start = t.value[1] + t_end = t.value[2] + t_strand = t.value[3] + + (chr, real_start, real_end) = intersectBed((q_chr,q_start,q_end),(q_chr,s_start,s_end)) + + l_offset = abs(real_start - s_start) + #r_offset = abs(real_end - s_end) + size = abs(real_end - real_start) + + matches.append( (chr, real_start, real_end,q_strand) ) + if t_strand == '+': + i_start = t_start + l_offset + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + elif t_strand == '-': + i_start = t_end - l_offset - size + if q_strand == '+': + matches.append( (t_chrom, i_start, i_start + size, t_strand)) + else: + matches.append( (t_chrom, i_start, i_start + size, complement[t_strand])) + else: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % q_strand) + + if print_match: + print(matches) + # input: 'chr1',246974830,247024835 + # output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')] + # [('chr1', 246974830, 246974833), ('chr1', 248908207, 248908210)] + + return matches + +def crossmap_vcf_file(mapping, infile,outfile, liftoverfile, refgenome): + ''' + Convert genome coordinates in VCF format. + ''' + + #index refegenome file if it hasn't been done + if not os.path.exists(refgenome + '.fai'): + printlog(["Creating index for", refgenome]) + pysam.faidx(refgenome) + + refFasta = pysam.Fastafile(refgenome) + + FILE_OUT = open(outfile ,'w') + UNMAP = open(outfile + '.unmap','w') + + total = 0 + fail = 0 + withChr = False # check if the VCF data lines use 'chr1' or '1' + + for line in ireader.reader(infile): + if not line.strip(): + continue + line=line.strip() + + #deal with meta-information lines. 
+ + #meta-information lines needed in both mapped and unmapped files + if line.startswith('##fileformat'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##INFO'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##FILTER'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##FORMAT'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##ALT'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##SAMPLE'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + elif line.startswith('##PEDIGREE'): + print(line, file=FILE_OUT) + print(line, file=UNMAP) + + #meta-information lines needed in unmapped files + elif line.startswith('##assembly'): + print(line, file=UNMAP) + elif line.startswith('##contig'): + print(line, file=UNMAP) + if 'ID=chr' in line: + withChr = True + + #update contig information + elif line.startswith('#CHROM'): + printlog(["Updating contig field ... "]) + target_gsize = dict(list(zip(refFasta.references, refFasta.lengths))) + for chr_id in sorted(target_gsize): + if chr_id.startswith('chr'): + if withChr is True: + print("##contig=" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + print("##contig=" % (chr_id.replace('chr',''), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + if withChr is True: + print("##contig=" % ('chr' + chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + else: + print("##contig=" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT) + + print("##liftOverProgram=CrossMap(https://sourceforge.net/projects/crossmap/)", file=FILE_OUT) + print("##liftOverFile=" + liftoverfile, file=FILE_OUT) + print("##new_reference_genome=" + refgenome, file=FILE_OUT) + print("##liftOverTime=" + datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT) + print(line, file=FILE_OUT) + print(line, file=UNMAP) + + else: + if line.startswith('#'):continue + fields = str.split(line,maxsplit=7) + total += 1 + + if fields[0].startswith('chr'): + chrom = fields[0] + else: + chrom = 'chr' + fields[0] + + start = int(fields[1])-1 # 0 based + end = start + len(fields[3]) + a = map_coordinates(mapping, chrom, start, end,'+') + if a is None: + print(line, file=UNMAP) + fail += 1 + continue + + if len(a) == 2: + # update chrom + target_chr = str(a[1][0]) #target_chr is from chain file, could be 'chr1' or '1' + target_start = a[1][1] + target_end = a[1][2] + if withChr is False: + fields[0] = target_chr.replace('chr','') + else: + fields[0] = target_chr + + # update start coordinate + fields[1] = target_start + 1 + + if refFasta.references[0].startswith('chr'): + if target_chr.startswith('chr'): + fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper() + else: + fields[3] = refFasta.fetch('chr'+target_chr,target_start,target_end).upper() + else: + if target_chr.startswith('chr'): + fields[3] = refFasta.fetch(target_chr.replace('chr',''),target_start,target_end).upper() + else: + fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper() + + + + if fields[3] != fields[4]: + print('\t'.join(map(str, fields)), file=FILE_OUT) + else: + print(line, file=UNMAP) + fail += 1 + else: + print(line, file=UNMAP) + fail += 1 + continue + FILE_OUT.close() + UNMAP.close() + printlog (["Total entries:", str(total)]) + printlog (["Failed to map:", str(fail)]) + + +def crossmap_bed_file(mapping, inbed,outfile=None): + 
''' + Convert genome coordinates (in bed format) between assemblies. + BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1 + ''' + + # check if 'outfile' was set. If not set, print to screen, if set, print to file + if outfile is not None: + FILE_OUT = open(outfile,'w') + UNMAP = open(outfile + '.unmap','w') + else: + pass + + for line in ireader.reader(inbed): + if line.startswith('#'):continue + if line.startswith('track'):continue + if line.startswith('browser'):continue + if not line.strip():continue + line=line.strip() + fields=line.split() + strand = '+' + + # filter out line less than 3 columns + if len(fields)<3: + print("Less than 3 fields. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + try: + int(fields[1]) + except: + print("Start corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + try: + int(fields[2]) + except: + print("End corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + if int(fields[1]) > int(fields[2]): + print("\"Start\" is larger than \"End\" corrdinate is not an integer. skip " + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + + # deal with bed less than 12 columns + if len(fields)<12: + + # try to reset strand + try: + for f in fields: + if f in ['+','-']: + strand = f + except: + pass + + a = map_coordinates(mapping, fields[0], int(fields[1]),int(fields[2]),strand) + # example of a: [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')] + + try: + if len(a) % 2 != 0: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + continue + if len(a) == 2: + + #reset fields + fields[0] = a[1][0] + fields[1] = a[1][1] + fields[2] = a[1][2] + for i in range(0,len(fields)): #update the strand information + if fields[i] in ['+','-']: + fields[i] = a[1][3] + + if outfile is None: + print(line + '\t->\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + if len(a) >2 : + count=0 + for j in range(1,len(a),2): + count += 1 + fields[0] = a[j][0] + fields[1] = a[j][1] + fields[2] = a[j][2] + for i in range(0,len(fields)): #update the strand information + if fields[i] in ['+','-']: + fields[i] = a[j][3] + + if outfile is None: + print(line + '\t'+ '(split.' + str(count) + ':' + ':'.join([str(i) for i in a[j-1]]) + ')\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + except: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + continue + + # deal with bed12 and bed12+8 (genePred format) + if len(fields)==12 or len(fields)==20: + strand = fields[5] + if strand not in ['+','-']: + raise Exception("Unknown strand: %s. Can only be '+' or '-'." % strand) + fail_flag=False + exons_old_pos = annoGene.getExonFromLine(line) #[[chr,st,end],[chr,st,end],...] + #print exons_old_pos + exons_new_pos = [] + for e_chr, e_start, e_end in exons_old_pos: + a = map_coordinates(mapping, e_chr, e_start, e_end, strand) + if a is None: + fail_flag =True + break + + if len(a) == 2: + exons_new_pos.append(a[1]) + # a has two elements, first is query, 2nd is target. 
# [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')] + else: + fail_flag =True + break + + if not fail_flag: + # check if all exons were mapped to the same chromosome the same strand + chr_id = set() + exon_strand = set() + + for e_chr, e_start, e_end, e_strand in exons_new_pos: + chr_id.add(e_chr) + exon_strand.add(e_strand) + if len(chr_id) != 1 or len(exon_strand) != 1: + fail_flag = True + + if not fail_flag: + # build new bed + cds_start_offset = int(fields[6]) - int(fields[1]) + cds_end_offset = int(fields[2]) - int(fields[7]) + new_chrom = exons_new_pos[0][0] + new_chrom_st = exons_new_pos[0][1] + new_chrom_end = exons_new_pos[-1][2] + new_name = fields[3] + new_score = fields[4] + new_strand = exons_new_pos[0][3] + new_thickStart = new_chrom_st + cds_start_offset + new_thickEnd = new_chrom_end - cds_end_offset + new_ittemRgb = fields[8] + new_blockCount = len(exons_new_pos) + new_blockSizes = ','.join([str(o - n) for m,n,o,p in exons_new_pos]) + new_blockStarts = ','.join([str(n - new_chrom_st) for m,n,o,p in exons_new_pos]) + + new_bedline = '\t'.join(str(i) for i in (new_chrom,new_chrom_st,new_chrom_end,new_name,new_score,new_strand,new_thickStart,new_thickEnd,new_ittemRgb,new_blockCount,new_blockSizes,new_blockStarts)) + if check_bed12(new_bedline) is False: + fail_flag = True + else: + if outfile is None: + print(line + '\t->\t' + new_bedline) + else: + print(new_bedline, file=FILE_OUT) + + if fail_flag: + if outfile is None: + print(line + '\tFail') + else: + print(line, file=UNMAP) + +def crossmap_gff_file(mapping, ingff,outfile=None): + ''' + Convert genome coordinates (in GFF/GTF format) between assemblies. + GFF (General Feature Format) lines have nine required fields that must be Tab-separated: + + 1. seqname - The name of the sequence. Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types + are "CDS", "start_codon", "stop_codon", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 + for this annotation data set, the score value will determine the level of gray in + which this feature is displayed (higher numbers = darker gray). If there is no score + value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care). + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that + represents the reading frame of the first base. If the feature is not a coding exon, + the value should be '.'. + 9. group - All lines with the same group are linked together into a single item. + + GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3 + + GTF (Gene Transfer Format) is a refinement to GFF that tightens the specification. The + first eight GTF fields are the same as GFF. The group field has been expanded into a + list of attributes. Each attribute consists of a type/value pair. Attributes must end + in a semi-colon, and be separated from any following attribute by exactly one space. + + GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4 + + We do NOT check if features (exon, CDS, etc) belonging to the same gene originally were + converted into the same chromosome/strand. 
+ ''' + + if outfile is not None: + FILE_OUT = open(outfile,'w') + UNMAP = open(outfile + '.unmap', 'w') + + for line in ireader.reader(ingff): + if line.startswith('#'):continue + if line.startswith('track'):continue + if line.startswith('browser'):continue + if line.startswith('visibility'):continue + if not line.strip():continue + + line=line.strip() + fields=line.split('\t') + # check GFF file + #if len(fields) != 9: + # print >>sys.stderr, 'GFF file must has 9 columns. Skip ' + line + # continue + try: + start = int(fields[3]) - 1 #0-based + end = int(fields[4])/1 + feature_size = end - start + except: + print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + if fields[6] not in ['+','-','.']: + print('Cannot recognize \"strand\". Skip ' + line, file=sys.stderr) + if outfile: + print(line, file=UNMAP) + continue + + strand = '-' if fields[6] == '-' else '+' + + chrom = fields[0] + a = map_coordinates(mapping, chrom,start,end,strand) + if a is None: + if outfile is None: + print(line + '\tfail (no match to target assembly)') + else: + print(line, file=UNMAP) + continue + if len(a) !=2: + if outfile is None: + print(line + '\tfail (multpile match to target assembly)') + else: + print(line, file=UNMAP) + else: + if (int(a[1][2]) - int(a[1][1])) != feature_size: # check if it is exact match + if outfile is None: + print(line + '\tfail (not exact match)') + else: + print(line, file=UNMAP) + fields[0] = a[1][0] # chrom + fields[3] = a[1][1] + 1 # start, 1-based + fields[4] = a[1][2] + fields[6] = a[1][3] + + if outfile is None: + print(line + '\t->\t' + '\t'.join([str(i) for i in fields])) + else: + print('\t'.join([str(i) for i in fields]), file=FILE_OUT) + +def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3, addtag = True): + ''' + Convert genome coordinates (in BAM/SAM format) between assemblies. + BAM/SAM format: http://samtools.sourceforge.net/ + chrom_size is target chromosome size + + if addtag is set to True, will add tags to each alignmnet: + Q = QC (QC failed) + N = unmapped (originally unmapped or originally mapped but failed to liftover to new assembly) + M = multiple mapped (alignment can be liftover to multiple places) + U = unique mapped (alignment can be liftover to only 1 place) + + tags for pair-end sequencing include: + + QF: QC failed + + NN: both read1 and read2 unmapped + NU: read1 unmapped, read2 unique mapped + NM: read1 unmapped, multiple mapped + + UN: read1 uniquely mapped, read2 unmap + UU: both read1 and read2 uniquely mapped + UM: read1 uniquely mapped, read2 multiple mapped + + MN: read1 multiple mapped, read2 unmapped + MU: read1 multiple mapped, read2 unique mapped + MM: both read1 and read2 multiple mapped + + tags for single-end sequencing include: + + QF: QC failed + SN: unmaped + SM: multiple mapped + SU: uniquely mapped + + ''' + + # determine the input file format (BAM or SAM) + try: + samfile = pysam.Samfile(infile,'rb') + if len(samfile.header) == 0: + print("BAM file has no header section. Exit!", file=sys.stderr) + sys.exit(1) + bam_format = True + except: + samfile = pysam.Samfile(infile,'r') + if len(samfile.header) == 0: + print("SAM file has no header section. 
Exit!", file=sys.stderr) + sys.exit(1) + bam_format = False + + # add comments into BAM/SAM header section + if bam_format: + comments=['Liftover from original BAM file: ' + infile] + else: + comments=['Liftover from original SAM file: ' + infile] + comments.append('Liftover is based on the chain file: ' + chainfile) + + sam_ori_header = samfile.header + (new_header, name_to_id) = sam_header.bam_header_generator(orig_header = sam_ori_header, chrom_size = chrom_size, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=comments) + + # write to file + if outfile_prefix is not None: + if bam_format: + OUT_FILE = pysam.Samfile( outfile_prefix + '.bam', "wb", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.bam', "wb", template=samfile ) + printlog (["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam']) + else: + OUT_FILE = pysam.Samfile( outfile_prefix + '.sam', "wh", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( outfile_prefix + '.unmap.sam', "wh", template=samfile ) + printlog (["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam']) + # write to screen + else: + if bam_format: + OUT_FILE = pysam.Samfile( '-', "wb", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.bam','') + '.unmap.bam', "wb", template=samfile ) + printlog (["Liftover BAM file:", infile]) + else: + OUT_FILE = pysam.Samfile( '-', "w", header = new_header ) + #OUT_FILE_UNMAP = pysam.Samfile( infile.replace('.sam','') + '.unmap.sam', "wh", template=samfile ) + printlog (["Liftover SAM file:", infile]) + + QF = 0 + NN = 0 + NU = 0 + NM = 0 + UN = 0 + UU = 0 + UM = 0 + MN = 0 + MU = 0 + MM = 0 + SN = 0 + SM = 0 + SU = 0 + total_item = 0 + try: + while(1): + total_item += 1 + new_alignment = pysam.AlignedRead() # create AlignedRead object + old_alignment = next(samfile) + + # qcfailed reads will be written to OUT_FILE_UNMAP for both SE and PE reads + if old_alignment.is_qcfail: + QF += 1 + if addtag: old_alignment.set_tag(tag="QF", value=0) + OUT_FILE.write(old_alignment) + continue + # new_alignment.qqual = old_alignment.qqual # quality string of read, exclude soft clipped part + # new_alignment.qlen = old_alignment.qlen # length of the "aligned part of read" + # new_alignment. aligned_pairs = old_alignment. aligned_pairs #read coordinates <=> ref coordinates + # new_alignment_aend = old_alignment.aend # reference End position of the aligned part (of read) + + #new_alignment.qname = old_alignment.qname # 1st column. read name. + #new_alignment.flag = old_alignment.flag # 2nd column. subject to change. flag value + #new_alignment.tid = old_alignment.tid # 3rd column. samfile.getrname(tid) == chrom name + #new_alignment.pos = old_alignment.pos # 4th column. reference Start position of the aligned part (of read) [0-based] + #new_alignment.mapq = old_alignment.mapq # 5th column. mapping quality + #new_alignment.cigar= old_alignment.cigar # 6th column. subject to change. + #new_alignment.rnext = old_alignment.rnext # 7th column. tid of the reference (mate read mapped to) + #new_alignment.pnext = old_alignment.pnext # 8th column. position of the reference (0 based, mate read mapped to) + #new_alignment.tlen = old_alignment.tlen # 9th column. insert size + #new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases. + #new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases. 
+ #new_alignment.tags = old_alignment.tags # 12 - columns + + new_alignment.qname = old_alignment.qname # 1st column. read name. + new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases. + new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases. + new_alignment.tags = old_alignment.tags # 12 - columns + + + # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes + # Thanks Wolfgang Resch identified this bug and provided solution. + + try: + rg, rgt = old_alignment.get_tag("RG", with_value_type=True) + except KeyError: + pass + else: + new_alignment.set_tag("RG", str(rg), rgt) + + if old_alignment.is_paired: + new_alignment.flag = 0x0001 #pair-end + if old_alignment.is_read1: + new_alignment.flag = new_alignment.flag | 0x40 + elif old_alignment.is_read2: + new_alignment.flag = new_alignment.flag | 0x80 + + #================================== + # R1 is originally unmapped + #================================== + if old_alignment.is_unmapped: + #This line will set the unmapped read flag for the first read in pair (in case it is unmapped). + #Thanks Amir Keivan Mohtashami reporting this bug. + new_alignment.flag = new_alignment.flag | 0x4 + + #------------------------------------ + # both R1 and R2 unmapped + #------------------------------------ + if old_alignment.mate_is_unmapped: + NN += 1 + if addtag: old_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(old_alignment) + continue + else: # originally, read-1 unmapped, read-2 is mapped + try: + read2_chr = samfile.getrname(old_alignment.rnext) + read2_strand = '-' if old_alignment.mate_is_reverse else '+' + read2_start = old_alignment.pnext + read2_end = read2_start + 1 + read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand) + # [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )] + except: + read2_maps = None + + #------------------------------------ + # both R1 and R2 unmapped + #------------------------------------ + if read2_maps is None: + NN += 1 + if addtag: old_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(old_alignment) + continue + + #------------------------------------ + # R1 unmapped, R2 unique + #------------------------------------ + elif len(read2_maps) == 2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's + # 4 + new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NU += 1 + if addtag: new_alignment.set_tag(tag="NU", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 unmapped, R2 multiple + #------------------------------------ + else: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] + # 4 + new_alignment.pos = read2_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] 
#start + # 9 + new_alignment.tlen = 0 + + NM += 1 + if addtag: new_alignment.set_tag(tag="NM", value=0) + OUT_FILE.write(new_alignment) + continue + + #================================== + # R1 is originally mapped + #================================== + else: + try: + read1_chr = samfile.getrname(old_alignment.tid) + read1_strand = '-' if old_alignment.is_reverse else '+' + read1_start = old_alignment.pos + read1_end = old_alignment.aend + read1_maps = map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_strand) + except: + read1_maps = None + + if not old_alignment.mate_is_unmapped: + try: + read2_chr = samfile.getrname(old_alignment.rnext) + read2_strand = '-' if old_alignment.mate_is_reverse else '+' + read2_start = old_alignment.pnext + read2_end = read2_start + 1 + read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand) + # [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' )] + except: + read2_maps = None + + + #------------------------------------ + # R1 unmapped (failed to liftover) + #------------------------------------ + if read1_maps is None: + # 2 update flag (0x4: segment unmapped) + new_alignment.flag = new_alignment.flag | 0x4 + # 3 + new_alignment.tid = -1 + # 4 + new_alignment.pos = 0 + # 5 + new_alignment.mapq = 255 + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + + # (1) read2 is unmapped before conversion + if old_alignment.mate_is_unmapped: + NN += 1 + if addtag: new_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) read2 is unmapped after conversion + elif read2_maps is None: + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + + NN += 1 + if addtag: new_alignment.set_tag(tag="NN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) read2 is unique mapped + elif len(read2_maps) == 2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] #recommend to set the RNAME of unmapped read to its mate's + # 4 + new_alignment.pos = read2_maps[1][1] #recommend to set the POS of unmapped read to its mate's + # 5 + new_alignment.mapq = old_alignment.mapq + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NU += 1 + if addtag: new_alignment.set_tag(tag="NU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (4) read2 is multiple mapped + else: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 3 + new_alignment.tid = name_to_id[read2_maps[1][0]] + # 4 + new_alignment.pos = read2_maps[1][1] + # 5 + new_alignment.mapq = 255 # mapq not available + # 6 + new_alignment.cigar = old_alignment.cigar + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = 0 + + NM += 1 + if addtag:new_alignment.set_tag(tag="NM", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 uniquely mapped + #------------------------------------ + elif len(read1_maps) == 2: + + if read1_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + + if read1_maps[0][3] != read1_maps[1][3]: # opposite strand + # 6 + 
new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + elif read1_maps[0][3] == read1_maps[1][3]: # same strand + # 6 + new_alignment.cigar = old_alignment.cigar + + # 3 + new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom + # 4 + new_alignment.pos = read1_maps[1][1] #start + # 5 + new_alignment.mapq = old_alignment.mapq + + # (1) R2 unmapped before or after conversion + if (old_alignment.mate_is_unmapped) or (read2_maps is None): + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + # 7 + new_alignment.rnext = name_to_id[read1_maps[1][0]] + # 8 + new_alignment.pnext = read1_maps[1][1] + # 9 + new_alignment.tlen = 0 + + UN += 1 + if addtag: new_alignment.set_tag(tag="UN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) R2 is unique mapped + elif len(read2_maps)==2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + # 2 + if (read2_maps[1][3] != read1_maps[1][3]) and (new_alignment.tlen <= IS_size + fold * IS_std) and (new_alignment.tlen >= IS_size - fold * IS_std): + new_alignment.flag = new_alignment.flag | 0x2 + + UU += 1 + if addtag: new_alignment.set_tag(tag="UU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) R2 is multiple mapped + else: + # 2 (strand) + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 (secondary alignment) + new_alignment.flag = new_alignment.flag | 0x100 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + UM += 1 + if addtag: new_alignment.set_tag(tag="UM", value=0) + OUT_FILE.write(new_alignment) + continue + + #------------------------------------ + # R1 multiple mapped + #----------------------------------- + elif len(read1_maps) > 2 and len(read1_maps) % 2 ==0: + # 2 + new_alignment.flag = new_alignment.flag | 0x100 + # 2 + if read1_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + + if read1_maps[0][3] != read1_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + elif read1_maps[0][3] == read1_maps[1][3]: + new_alignment.cigar = old_alignment.cigar + # 3 + new_alignment.tid = name_to_id[read1_maps[1][0]] #chrom + # 4 + new_alignment.pos = read1_maps[1][1] #start + # 5 + new_alignment.mapq = 255 + + + # (1) R2 is unmapped + if (old_alignment.mate_is_unmapped) or (read2_maps is None): + # 2 + new_alignment.flag = new_alignment.flag | 0x8 + # 7 + new_alignment.rnext = name_to_id[read1_maps[1][0]] + # 8 + new_alignment.pnext = read1_maps[1][1] + # 9 + new_alignment.tlen = 0 + + MN += 1 + if addtag: new_alignment.set_tag(tag="MN", value=0) + OUT_FILE.write(new_alignment) + continue + + # (2) read2 is unique mapped + elif len(read2_maps)==2: + # 2 + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] 
#chrom + # 8 + new_alignment.pnext = read2_maps[1][1] + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + MU += 1 + if addtag: new_alignment.set_tag(tag="MU", value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) R2 is multiple mapped + else: + # 2 (strand) + if read2_maps[1][3] == '-': new_alignment.flag = new_alignment.flag | 0x20 + # 2 (secondary alignment) + new_alignment.flag = new_alignment.flag | 0x100 + # 7 + new_alignment.rnext = name_to_id[read2_maps[1][0]] #chrom + # 8 + new_alignment.pnext = read2_maps[1][1] #start + # 9 + new_alignment.tlen = abs(new_alignment.pos - new_alignment.pnext) -1 - old_alignment.alen + + MM += 1 + if addtag: new_alignment.set_tag(tag="MM", value=0) + OUT_FILE.write(new_alignment) + continue + else: + #old_alignment.tid = name_to_id[samfile.getrname(old_alignment.tid)] + OUT_FILE_UNMAP.write(old_alignment) + failed += 1 + + # Singel end sequencing + else: + # (1) originally unmapped + if old_alignment.is_unmapped: + SN += 1 + if addtag: old_alignment.set_tag(tag="SN",value=0) + OUT_FILE.write(old_alignment) + continue + else: + new_alignment.flag = 0x0 + read_chr = samfile.getrname(old_alignment.tid) + read_strand = '-' if old_alignment.is_reverse else '+' + read_start = old_alignment.pos + read_end = old_alignment.aend + read_maps = map_coordinates(mapping, read_chr, read_start, read_end, read_strand) + + # (2) unmapped afte liftover + if read_maps is None: + # 1 + new_alignment.qname = old_alignment.qname + # 2 + new_alignment.flag = new_alignment.flag | 0x4 + # 3 + new_alignment.tid = -1 + # 4 + new_alignment.pos = 0 + # 5 + new_alignment.mapq = 255 + # 6 + new_alignment.cigar= old_alignment.cigar + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + # 12 + new_alignment.tags = old_alignment.tags + + SN += 1 + if addtag: new_alignment.set_tag(tag="SN",value=0) + OUT_FILE.write(new_alignment) + continue + + # (3) unique mapped + if len(read_maps)==2: + # 1 + new_alignment.qname = old_alignment.qname + # 2 + if read_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + if read_maps[0][3] != read_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + else: + # 6 + new_alignment.cigar = old_alignment.cigar + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + + # 3 + new_alignment.tid = name_to_id[read_maps[1][0]] + # 4 + new_alignment.pos = read_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + SU += 1 + if addtag: new_alignment.set_tag(tag="SU",value=0) + OUT_FILE.write(new_alignment) + continue + + # (4) multiple mapped + if len(read_maps) > 2 and len(read_maps) % 2 ==0: + + # 1 + new_alignment.qname = old_alignment.qname + new_alignment.flag = new_alignment.flag | 0x100 + # 2 + if read_maps[1][3] == '-': + new_alignment.flag = new_alignment.flag | 0x10 + if read_maps[0][3] != read_maps[1][3]: + # 6 + new_alignment.cigar = old_alignment.cigar[::-1] #reverse cigar tuple + # 10 + new_alignment.seq = revcomp_DNA(old_alignment.seq) #reverse complement read sequence + # 11 + 
new_alignment.qual = old_alignment.qual[::-1] #reverse quality string + else: + # 6 + new_alignment.cigar = old_alignment.cigar + # 10 + new_alignment.seq = old_alignment.seq + # 11 + new_alignment.qual = old_alignment.qual + + # 3 + new_alignment.tid = name_to_id[read_maps[1][0]] + # 4 + new_alignment.pos = read_maps[1][1] + # 5 + new_alignment.mapq = old_alignment.mapq + # 7 + new_alignment.rnext = -1 + # 8 + new_alignment.pnext = 0 + # 9 + new_alignment.tlen = 0 + + SM += 1 + if addtag: new_alignment.set_tag(tag="SM",value=0) + OUT_FILE.write(new_alignment) + continue + except StopIteration: + printlog(["Done!"]) + OUT_FILE.close() + + if outfile_prefix is not None: + if bam_format: + try: + printlog (['Sort "%s" and save as "%s"' % (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')]) + pysam.sort("-o", outfile_prefix + '.sorted.bam', outfile_prefix + '.bam') + except: + printlog(["Warning: ","output BAM file was NOT sorted"]) + try: + printlog (['Index "%s" ...' % (outfile_prefix + '.sorted.bam')]) + pysam.index(outfile_prefix + '.sorted.bam',outfile_prefix + '.sorted.bam.bai') + except: + printlog(["Warning: ","output BAM file was NOT indexed."]) + + print("Total alignments:" + str(total_item-1)) + print("\tQC failed: " + str(QF)) + if max(NN,NU, NM, UN, UU, UM, MN, MU, MM) > 0: + + print("\tR1 unique, R2 unique (UU): " + str(UU)) + print("\tR1 unique, R2 unmapp (UN): " + str(UN)) + print("\tR1 unique, R2 multiple (UM): " + str(UM)) + + print("\tR1 multiple, R2 multiple (MM): " + str(MM)) + print("\tR1 multiple, R2 unique (MU): " + str(MU)) + print("\tR1 multiple, R2 unmapped (MN): " + str(MN)) + + print("\tR1 unmap, R2 unmap (NN): " + str(NN)) + print("\tR1 unmap, R2 unique (NU): " + str(NU)) + print("\tR1 unmap, R2 multiple (NM): " + str(NM)) + if max(SN,SU,SM) > 0: + print("\tUniquley mapped (SU): " + str(SU)) + print("\tMultiple mapped (SM): " + str(SM)) + print("\tUnmapped (SN): " + str(SN)) + + +def crossmap_wig_file(mapping, in_file, out_prefix, taget_chrom_size, in_format, binSize=100000): + ''' + Convert genome coordinates (in wiggle/bigwig format) between assemblies. + wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html + bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html + ''' + + OUT_FILE1 = open(out_prefix + '.bgr','w') # original bgr file + OUT_FILE2 = open(out_prefix + '.sorted.bgr','w') # sorted bgr file + OUT_FILE3 = pyBigWig.open(out_prefix + '.bw', "w") # bigwig file + + if in_format.upper() == "WIGGLE": + printlog (["Liftover wiggle file:", in_file, '==>', out_prefix + '.bgr']) + + for chr, start, end, strand, score in wiggleReader (in_file): + #print chr,start,end,score + # map_coordinates(mapping, q_chr, q_start, q_end, q_strand = '+', print_match=False): + maps = map_coordinates(mapping, chr, start, end, '+') + if maps is None: + continue + if len(maps) == 2: + print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1) + else: + continue + maps[:]=[] + OUT_FILE1.close() + + printlog (["Merging overlapped entries in bedGraph file ..."]) + for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'): + print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2) + OUT_FILE2.close() + + os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr + + # add header to bigwig file + printlog (["Writing header to \"%s\" ..." 
% (out_prefix + '.bw') ]) + target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())] + OUT_FILE3.addHeader(target_chroms_sorted) + + printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ]) + # add entries to bigwig file + for line in ireader.reader(out_prefix + '.sorted.bgr'): + r_chr,r_st,r_end,r_value = line.split() + OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)]) + + OUT_FILE3.close() + + elif in_format.upper() == "BIGWIG": + + printlog (["Liftover bigwig file:", in_file, '==>', out_prefix + '.bgr']) + for chr, start, end, score in bigwigReader(in_file, bin_size = binSize ): + maps = map_coordinates(mapping, chr, start, end, '+') + try: + if maps is None: continue + if len(maps) == 2: + print('\t'.join([str(i) for i in [maps[1][0],maps[1][1],maps[1][2], score]]), file=OUT_FILE1) + else: + continue + except: + continue + maps[:]=[] + OUT_FILE1.close() + + printlog (["Merging overlapped entries in bedGraph file ..."]) + for (chr, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'): + print('\t'.join([str(i) for i in (chr, start, end, score )]), file=OUT_FILE2) + OUT_FILE2.close() + os.remove(out_prefix + '.bgr') #remove .bgr, keep .sorted.bgr + + # add header to bigwig file + printlog (["Writing header to \"%s\" ..." % (out_prefix + '.bw') ]) + target_chroms_sorted = [(k,taget_chrom_size[k]) for k in sorted(taget_chrom_size.keys())] + OUT_FILE3.addHeader(target_chroms_sorted) + + # add entries to bigwig file + printlog (["Writing entries to \"%s\" ..." % (out_prefix + '.bw') ]) + for line in ireader.reader(out_prefix + '.sorted.bgr'): + r_chr,r_st,r_end,r_value = line.split() + OUT_FILE3.addEntries([r_chr], [int(r_st)], ends=[int(r_end)], values=[float(r_value)]) + + OUT_FILE3.close() + else: + raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'") + + +def general_help(): + desc="""CrossMap is a program for convenient conversion of genome coordinates and genome \ +annotation files between assemblies (eg. lift from human hg18 to hg19 or vice versa). \ +It supports file in BAM, SAM, BED, Wiggle, BigWig, GFF, GTF and VCF format.""" + + print("Program: %s (v%s)" % ("CrossMap", __version__), file=sys.stderr) + print("\nDescription: \n%s" % '\n'.join(' '+i for i in wrap(desc,width=80)), file=sys.stderr) + print("\nUsage: CrossMap.py [options]\n", file=sys.stderr) + for k in sorted(commands): + print(' ' + k + '\t' + commands[k], file=sys.stderr) + print(file=sys.stderr) + +def bed_help(): + msg =[ + ('Usage:', "CrossMap.py bed input_chain_file input_bed_file [output_file]"), + ('Description:', "\"input_chain_file\" and \"input_bed_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. BED format file must have at least 3 columns (chrom, start, end) and no more than 12 columns. If no \"output_file\" is specified, output will be directed to the screen (console). 
BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1"), + ('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed test.hg19.bed # write output to \"test.hg19.bed\""), + ('Example:', "CrossMapy.py bed hg18ToHg19.over.chain.gz test.hg18.bed # write output to screen"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def gff_help(): + msg =[ + ('Usage:', "CrossMap.py gff input_chain_file input_gff_file output_file"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Input file must be in GFF or GTF format. GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3 GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4"), + ('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf test.hg18.gtf # write output to \"test.hg18.gff\""), + ('Example:', "CrossMap.py gff hg19ToHg18.over.chain.gz test.hg19.gtf # write output to screen"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def wig_help(): + msg =[ + ('Usage:', "CrossMap.py wig input_chain_file input_wig_file output_prefix"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Both \"variableStep\" and \"fixedStep\" wiggle lines are supported. Wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html"), + ('Example:', "CrossMapy.py wig hg18ToHg19.over.chain.gz test.hg18.wig test.hg19"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) +def bigwig_help(): + msg =[ + ('Usage:', "CrossMap.py bigwig input_chain_file input__bigwig_file output_prefix"), + ('Description:', "\"input_chain_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. Bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html"), + ('Example:', "CrossMapy.py bigwig hg18ToHg19.over.chain.gz test.hg18.bw test.hg19"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + +def bam_help(): + usage="CrossMap.py bam -a input_chain_file input_bam_file output_file [options] " + import optparse + parser.print_help() + +def vcf_help(): + msg =[ + ("usage:","CrossMap.py vcf input_chain_file input_VCF_file ref_genome_file output_file"), + ("Description:", "\"input_chain_file\" and \"input_VCF_file\" can be regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to remote file. 
\"ref_genome_file\" is genome sequence file of 'target assembly' in FASTA format."), + ("Example:", " CrossMap.py vcf hg19ToHg18.over.chain.gz test.hg19.vcf hg18.fa test.hg18.vcf"), + ] + for i,j in msg: + print('\n' + i + '\n' + '\n'.join([' ' + k for k in wrap(j,width=80)]), file=sys.stderr) + + +if __name__=='__main__': + + # test read_chain_file() + #(mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(sys.argv[1], print_table=True) + + #q_chr, q_start, q_end, q_strand = '+', print_match=False): + #map_coordinates(mapTree, 'chr1', 45953452, 45953456, '+', print_match=True) + + + #a= sam_header.bam_header_generator(chrom_size = targetChromSizes, prog_name="CrossMap",prog_ver = __version__, format_ver=1.0,sort_type = 'coordinate',co=['a','b','c']) + #crossmap_bed_file(mapTree, 'test.hg18.bed3',outfile='aa') + #crossmap_bed_file(mapTree, 'test.bed') + #crossmap_gff_file(mapTree,'test.hg18.gtf',outfile = 'aa') + + + #crossmap_bam_file(mapTree, targetChromSizes, sys.argv[1], 'eg.abnormal.sam','out') + #printlog (['Sort output bam file ...']) + #pysam.sort('-m','1000000000','out.bam','out.sorted') + #printlog (['Index bam file ...']) + #pysam.index('out.sorted.bam','out.sorted.bam.bai') + + #crossmap_wig_file(mapTree, 'GSM686950_reverse.hg18.bw', 'GSM686950_reverse.hg19', sourceChromSizes, targetChromSizes, in_format = 'bigwig') + + #parser = optparse.OptionParser() + #parser.add_option("-i","--input-sam",action="store",type="string",dest="input_sam_file", help="BAM/SAM format file.") + #parser.add_option("-o","--out-file",action="store",type="string",dest="output_sam_file", help="Prefix of output files. Foramt is determined from input: SAM <=> SAM, BAM <=> BAM") + #parser.add_option("-m","--insert-mean",action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). Used to tell wheather a mapped pair is \"proper pair\" or not. [default=%default]") + #parser.add_option("-s","--insert-stdev",action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" ) + #parser.add_option("-t","--times-of-stdev",action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. [default=%default]") + #(options,args)=parser.parse_args() + + #print options + #print args + + + commands = { + 'bed':'convert genome cooridnate or annotation file in BED, bedGraph or other BED-like format.', + 'bam':'convert alignment file in BAM or SAM format.', + 'gff':'convert genome cooridnate or annotation file in GFF or GTF format.', + 'wig':'convert genome coordinate file in Wiggle, or bedGraph format.', + 'bigwig':'convert genome coordinate file in BigWig format.', + 'vcf':'convert genome coordinate file in VCF format.' 
+ } + kwds = list(commands.keys()) + + + if len(sys.argv) == 1: + general_help() + sys.exit(0) + elif len(sys.argv) >=2: + # deal with bed input + if sys.argv[1].lower() == 'bed': + if len(sys.argv) == 4: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = None + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file, print_table = False) + crossmap_bed_file(mapTree, in_file, out_file) + + elif len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_bed_file(mapTree, in_file, out_file) + else: + bed_help() + sys.exit(0) + elif sys.argv[1].lower() == 'gff': + if len(sys.argv) == 4: + chain_file = sys.argv[2] + in_file = sys.argv[3] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_gff_file(mapTree, in_file, None) + elif len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_gff_file(mapTree, in_file, out_file) + else: + gff_help() + sys.exit(0) + elif sys.argv[1].lower() == 'wig': + if len(sys.argv) == 5: + chain_file = sys.argv[2] + in_file = sys.argv[3] + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'wiggle') + else: + wig_help() + sys.exit(0) + elif sys.argv[1].lower() == 'bigwig': + if len(sys.argv) == 5: + chain_file = sys.argv[2] + + in_file = sys.argv[3] + try: + bw = pyBigWig.open(in_file) + except: + print ("\nPlease check if \"%s\" is in bigWig format!\n" % in_file, file=sys.stderr) + sys.exit(0) + + out_file = sys.argv[4] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_wig_file(mapTree, in_file, out_file, targetChromSizes, in_format = 'bigwig') + else: + bigwig_help() + sys.exit(0) + elif sys.argv[1].lower() == 'bam': + #def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30, fold=3): + usage="CrossMap.py bam input_chain_file input_bam_file output_file [options]\nNote: If output_file == STDOUT or -, CrossMap will write BAM file to the screen" + parser = optparse.OptionParser(usage, add_help_option=False) + parser.add_option("-m", "--mean", action="store",type="float",dest="insert_size", default=200.0, help="Average insert size of pair-end sequencing (bp). [default=%default]") + parser.add_option("-s", "--stdev", action="store",type="float",dest="insert_size_stdev", default=30.0, help="Stanadard deviation of insert size. [default=%default]" ) + parser.add_option("-t", "--times", action="store",type="float",dest="insert_size_fold", default=3.0, help="A mapped pair is considered as \"proper pair\" if both ends mapped to different strand and the distance between them is less then '-t' * stdev from the mean. 
[default=%default]") + parser.add_option("-a","--append-tags",action="store_true",dest="add_tags",help="Add tag to each alignment.") + (options,args)=parser.parse_args() + + if len(args) >= 3: + print("Insert size = %f" % (options.insert_size), file=sys.stderr) + print("Insert size stdev = %f" % (options.insert_size_stdev), file=sys.stderr) + print("Number of stdev from the mean = %f" % (options.insert_size_fold), file=sys.stderr) + if options.add_tags: + print("Add tags to each alignment = %s" % ( options.add_tags), file=sys.stderr) + else: + print("Add tags to each alignment = %s" % ( False), file=sys.stderr) + chain_file = args[1] + in_file = args[2] + out_file = args[3] if len(args) >= 4 else None + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + + if out_file in ["STDOUT","-"]: + out_file = None + if options.add_tags: + crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=True) + else: + crossmap_bam_file(mapping = mapTree, chainfile = chain_file, infile = in_file, outfile_prefix = out_file, chrom_size = targetChromSizes, IS_size=options.insert_size, IS_std=options.insert_size_stdev, fold=options.insert_size_fold,addtag=False) + else: + parser.print_help() + elif sys.argv[1].lower() == 'vcf': + if len(sys.argv) == 6: + chain_file = sys.argv[2] + in_file = sys.argv[3] + genome_file = sys.argv[4] + out_file = sys.argv[5] + (mapTree,targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) + crossmap_vcf_file(mapping = mapTree, infile= in_file, outfile = out_file, liftoverfile = sys.argv[2], refgenome = genome_file) + else: + vcf_help() + sys.exit(0) + else: + general_help() + sys.exit(0) + + + + + diff --git a/bin/Modules/CrossMapper.py b/bin/Modules/CrossMapper.py new file mode 100644 index 0000000..d401c79 --- /dev/null +++ b/bin/Modules/CrossMapper.py @@ -0,0 +1,49 @@ +import urllib.request +import os +from Modules import CrossMap + + +class CrossMapper: + + """ + + Class to download chain_files for chrossmapping hg38 or mm10 to older assembly versions. + Utilizes CrossMap.py. see wiki for more information. + + """ + + def __init__(self, org, wd, out, is_dir): + self.org = org + if is_dir: + self.infile = os.path.join( wd + "/temp/" + org + ".gtf") + else: + self.infile = os.path.join(wd+"/data/temp/"+org+".gtf") + self.outfile = os.path.join(out+"/" + org + "_mapped.gtf") + self.chainfile = self.get_chain_file(org, wd, is_dir) + # Execute Crossmapper for gff/gtf files + (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile) + CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile) + + def get_chain_file(self, org, wd, isdir): + + # Defines the Chain files for different conversions. 
+ + if org == "hg19": + if isdir: + file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz") + else: + file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) + urllib.request.urlretrieve("http://hgdownload.soe.ucsc.edu/goldenPath/hg38/" + "liftOver/hg38ToHg19.over.chain.gz", + file_link) + return file_link + + elif org == "mm9": + if isdir: + file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz") + else: + file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) + urllib.request.urlretrieve("http://hgdownload.soe.ucsc.edu/goldenPath/mm10/" + "liftOver/mm10ToMm9.over.chain.gz", + file_link) + return file_link diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/Modules/Ensembl/ActivityCategorizer.py index 8fee867..bd82a92 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/Modules/Ensembl/ActivityCategorizer.py @@ -4,7 +4,7 @@ class ActivityCategorizer: - def __init__(self, release, organism): + def __init__(self, release, organism, wd, data_dir): # List of all Folders with Activity Tables @@ -12,13 +12,13 @@ def __init__(self, release, organism): # Dictionary from celltypes_organism.json mit key = Kategorie und Value = [Ordner] - self.c_dict = self.read_config(organism) + self.c_dict = self.read_config(organism, wd) # Activity table from all files as dict self.activity = {} - self.get_activity_data(release, organism) + self.get_activity_data(release, organism, wd, data_dir) # Categorized Activity from Json-config print("Categorization: This may take a while") @@ -29,23 +29,26 @@ def __init__(self, release, organism): def get_categorization(self): return self.categorization - def read_config(self, organism): + def read_config(self, organism, wd): c_dict = {} - path_to_config = os.path.join("./config/celltypes_"+organism+".json") - with open(path_to_config) as input: - data = json.loads(input.read()) + path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json") + with open(path_to_config) as input_file: + data = json.loads(input_file.read()) for x in data: c_dict[x["type"]] = x["alias_ensembl"] self.folderlist.extend(x["alias_ensembl"]) return c_dict - def get_activity_data(self, release, organism): + def get_activity_data(self, release, organism, wd, data_dir): for folder in self.folderlist: # Generate path to binary File - file = os.path.join("./EnsemblData", release, organism, "activity", folder, "table.bin") + if data_dir: + file = os.path.join(data_dir + "/EnsemblData", release, organism, "activity", folder, "table.bin") + else: + file = os.path.join(wd + "/data/EnsemblData", release, organism, "activity", folder, "table.bin") with open(file, "rb") as tables: self.activity[folder] = bytearray(tables.read()) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/Modules/Ensembl/ActivityTable.py index 042421a..288bf7d 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/Modules/Ensembl/ActivityTable.py @@ -6,8 +6,8 @@ class ActivityTable: """ Class for checking activity_table and generating them. 
- ActivityTable = Byte Representation of Activity Status - corresponding to the generator Schema default: + activityTable = byte Representation of activity status + corresponding to the generator schema default: 0, "activity=ACTIVE", 1, "activity=POISED", 2, "activity=REPRESSED", @@ -15,8 +15,11 @@ class ActivityTable: 4, "activity=NA" """ - def __init__(self, organism, current_release): - self.link = os.path.join("./EnsemblData/", current_release, organism, "activity") + def __init__(self, organism, current_release, wd, data_dir): + if data_dir: + self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity") + else: + self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", @@ -25,6 +28,7 @@ def __init__(self, organism, current_release): "activity=NA"]) def check_and_generate_activity_table(self): + # checks if file already exists and generates new one if missing for subfolder in self.folders: folder_link = os.path.join(self.link, subfolder) sf_link = os.path.join(folder_link, "table.bin") @@ -34,6 +38,7 @@ def check_and_generate_activity_table(self): print("All ActivityTables found, proceeding") def generate_table(self, link): + # generates the table and saves it as table.bin file for root, dirs, files in os.walk(link): for file in files: if file.endswith(".gff.gz"): diff --git a/bin/Modules/Ensembl/ActivityTableGenerator.py b/bin/Modules/Ensembl/ActivityTableGenerator.py index 2c2cff6..f110236 100644 --- a/bin/Modules/Ensembl/ActivityTableGenerator.py +++ b/bin/Modules/Ensembl/ActivityTableGenerator.py @@ -3,6 +3,10 @@ class ATGenerator: + """ + Reads saved activity binary files (table.bin) + """ + def __init__(self, repre): self.representation = repre diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/Modules/Ensembl/Ensembl.py index df9cbc0..88adeb1 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/Modules/Ensembl/Ensembl.py @@ -6,15 +6,20 @@ class Ensembl: - def __init__(self, organism): + """ + Main class for handling Ensembl Regulatory data + Checks for local files and downloads if files are missing + """ + + def __init__(self, organism, wd, data_dir): print("Starting Ensembl") - self.updater = FTPRetriever(organism) + self.updater = FTPRetriever(organism, wd, data_dir) self.release = self.updater.get_release() - self.acttable = ActivityTable(organism, self.release) + self.acttable = ActivityTable(organism, self.release, wd, data_dir) self.acttable.check_and_generate_activity_table() - self.categorizer = ActivityCategorizer(self.release, organism) + self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir) print("Generating GTF") - self.gtf_generator = GTFGen(organism, self.release) + self.gtf_generator = GTFGen(organism, self.release, wd, data_dir) print("Ensembl Finished !") diff --git a/bin/Modules/Ensembl/FTPHandling/FTPEntry.py b/bin/Modules/Ensembl/FTPHandling/FTPEntry.py index 0a90080..0d7bdc3 100644 --- a/bin/Modules/Ensembl/FTPHandling/FTPEntry.py +++ b/bin/Modules/Ensembl/FTPHandling/FTPEntry.py @@ -1,11 +1,12 @@ import ftplib -# Class to determine if a Ftp-path is file or directory. - - class FTPEntry: + """ + Class to determine if a ftp-path is file or directory. 
+ """ + def __init__(self, filename, ftpobj, startingdir=None): self.filename = filename if startingdir is None: diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py index 598a6f5..6b45598 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -4,6 +4,10 @@ class FTPHandler: + """ + Class to browse through ftp folders and download entries to local file + """ + def __init__(self, url, wd): self.ftp = ftplib.FTP(url) self.ftp.login() @@ -15,8 +19,8 @@ def change_dir(self, wd): def get_all_entries(self): return self.ftp.nlst() - def get_all_entries_from_dir(self, dir): - self.change_dir(dir) + def get_all_entries_from_dir(self, dire): + self.change_dir(dire) return self.get_all_entries() def get_all_entries_as_FTPEntry(self): diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py index 874e776..d4c1ca6 100644 --- a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -9,12 +9,12 @@ class EnsemblRegulationFTPRetriever: And downloading newest version if necessary """ - def __init__(self, organism): + def __init__(self, organism, wd, data_dir): self.site_ftp = FTPHandler("ftp.ensembl.org", "pub") self.remoteversion = self.get_current_ftp_version() - self.localversion = self.get_current_local_version() - if self.check_version_difference(organism): - self.download_currentversion_version(self.remoteversion, organism) + self.localversion = self.get_current_local_version(wd, data_dir) + if self.check_version_difference(organism, wd, data_dir): + self.download_currentversion_version(self.remoteversion, organism, wd, data_dir) else: print("Newest Version installed, no update needed.") @@ -31,15 +31,25 @@ def get_current_ftp_version(self): print("Current release is "+c_release) return c_release - def check_organism(self, organism, release): - if organism in next(os.walk("./EnsemblData/"+release+"/"))[1]: - return False + def check_organism(self, organism, release, wd, data_dir): + if data_dir: + if organism in next(os.walk(os.path.join(data_dir+"/EnsemblData/"+release+"/")))[1]: + return False + else: + print("No Local Version for "+organism+" installed. Installing...") + return True else: - print("No Local Version for "+organism+" installed. Installing...") - return True + if organism in next(os.walk(os.path.join(wd+"/data/EnsemblData/"+release+"/")))[1]: + return False + else: + print("No Local Version for "+organism+" installed. Installing...") + return True - def get_current_local_version(self): - directories = next(os.walk("./EnsemblData/"))[1] + def get_current_local_version(self, wd, data_dir): + if data_dir: + directories = next(os.walk(os.path.join(data_dir + "/EnsemblData/")))[1] + else: + directories = next(os.walk(os.path.join(wd+"/data/EnsemblData/")))[1] for dir in directories: if "release" in dir: localversion = sorted(directories, reverse=True)[0] @@ -51,7 +61,7 @@ def get_current_local_version(self): print("No Version installed !") return None - def check_version_difference(self, organism): + def check_version_difference(self, organism, wd, data_dir): local_version = self.localversion remote_version = self.remoteversion @@ -64,16 +74,18 @@ def check_version_difference(self, organism): print("Outdated Version detected ! 
local: " + local_version + " remote: " + remote_version) return True else: - if self.check_organism(organism, local_version): + if self.check_organism(organism, local_version, wd, data_dir): return True else: return False - def download_currentversion_version(self, version, organism): + def download_currentversion_version(self, version, organism, wd, data_dir): # Download Base File - - targetfolder = os.path.join("./EnsemblData/", version, organism) + if data_dir: + targetfolder = os.path.join(data_dir + "/EnsemblData/", version, organism) + else: + targetfolder = os.path.join(wd+"/data/EnsemblData/", version, organism) os.makedirs(targetfolder) folder_url = "/pub/"+version+"/regulation/"+organism+"/" self.site_ftp.change_dir(folder_url) diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/Modules/Ensembl/GTFGen.py index a4de58a..ca57aa8 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/Modules/Ensembl/GTFGen.py @@ -5,14 +5,24 @@ class GTFGen: - def __init__(self, organism, release): - self.gff_lines = self.get_organism_as_gff(organism, release) + """ + + Class to generate Ensembl GTF-data with activity + """ + + def __init__(self, organism, release, wd, data_dir): + + self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir) self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} - def get_organism_as_gff(self, organism, release): + def get_organism_as_gff(self, organism, release, wd, data_dir): - directory = os.path.join("./EnsemblData/", release, organism) + # reads the original gff file for organism + if data_dir: + directory = os.path.join(data_dir + "/EnsemblData/", release, organism) + else: + directory = os.path.join(wd + "/data/EnsemblData/", release, organism) inputfile = "" for file in os.listdir(directory): if file.endswith("gff.gz"): @@ -23,6 +33,8 @@ def get_organism_as_gff(self, organism, release): def reformat_to_gff(self, activity, release): + # Reformats gff to gtf and appends activity-data for config specified celltype-categories + gtf_return = [] for index, line in enumerate(self.gff_lines): @@ -39,7 +51,7 @@ def reformat_to_gff(self, activity, release): # Add RegBuild_ + release templist.append("RegBuild_"+release) # Add Description from Description in last ; separated segment - templist.append(splitted_additional[4].split("=")[1]) + templist.append(splitted_additional[4].split("=")[1].lower()) # Add Start / End Data from original templist.extend(splitted[3:5]) # Add Score, Strand and Frame Data @@ -54,17 +66,25 @@ def reformat_to_gff(self, activity, release): return gtf_return @staticmethod - def generate_additional_information(id, activity): - return "; ".join([id, "activity="+", ".join(activity)]) + def generate_additional_information(gene_id, activity): + + if gene_id.startswith("ID=regulatory_region:"): + gene_id = 'ID "'+gene_id.split(':')[1]+'"' + + activity_string = 'activity "'+', '.join(activity)+'"' + + # helper method to concat activity information to string + return gene_id+'; '+activity_string def generate_activity_list(self, activity, index): + # generates activity list activity_list = [] for key, value in activity.items(): activity_list.append(key+">"+self.value_map[value[index]]) return activity_list def get_gtf(self, release, activity): - + # returns the resulting gtf-formatted-list return self.reformat_to_gff(activity, release) diff --git a/bin/Modules/Ensembl/checksums.py b/bin/Modules/Ensembl/checksums.py index 55209f9..8f8b92a 100644 --- a/bin/Modules/Ensembl/checksums.py +++ 
b/bin/Modules/Ensembl/checksums.py @@ -2,6 +2,10 @@ # Python implementation of linux sum (BSD 16-bit Checksum) commandline tool. +""" +Unused script with checksum implementations in Python +""" + def bsdchecksum(infile): with open(infile, 'rb') as f: file_bytes = f.read() @@ -17,11 +21,3 @@ def md5_checksum(infile): with open(infile, 'rb') as f: file_bytes = f.read() return hashlib.md5(file_bytes).hexdigest() - - -if __name__ == '__main__': - # print(bsdchecksum("/home/basti/Schreibtisch/tests/homo_sapiens.GRCh38.HMEC.Regulatory_Build.regulatory_activity.20161111.gff.gz")) - print(md5_checksum("/home/basti/Schreibtisch/tests/" - "mus_musculus.GRCm38.forebrain_embryonic_16_5_days." - "Regulatory_Build.regulatory_activity.20180516.gff.gz")) - diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py index 2dddda7..66c2d97 100644 --- a/bin/Modules/SaveResults.py +++ b/bin/Modules/SaveResults.py @@ -1,19 +1,29 @@ -import csv import os class ResultSaver: - def __init__(self, results, organism, tissue): + """ + + class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False. + The output is saved to the temp directory in the data folder if crossmapping is necessary. + + """ + + def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out): print("Save results to File !") self.path = "" - if tissue: - self.path = os.path.join("./results/"+organism+"_filtered.gtf") + if mapped: + if is_data_dir: + self.path = os.path.join(wd + "/temp/" + organism + ".gtf") + else: + self.path = os.path.join( wd + "/data/temp/" + organism + ".gtf" ) + elif tissue: + self.path = os.path.join(out+"/"+organism+"_filtered.gtf") else: - self.path = os.path.join("./results/"+organism+".gtf") + self.path = os.path.join(out+"/"+organism+".gtf") - with open(self.path, "w") as file: - write_it = csv.writer(file, delimiter='\t') + with open(self.path, "w") as output_file: for line in results: - write_it.writerow(line) + output_file.write("\t".join(line)+"\n") diff --git a/bin/Modules/Uniquifier.py b/bin/Modules/Uniquifier.py index 7d4925d..16d5538 100644 --- a/bin/Modules/Uniquifier.py +++ b/bin/Modules/Uniquifier.py @@ -1,5 +1,11 @@ class UniqueFilter: + """ + + Class to get unique GTF-results, filtered by specified cell-/tissuetypes + + """ + def __init__(self, ense, ucsc, org_filter=None): self.results = self.get_filtered_results(org_filter, ense, ucsc) @@ -7,6 +13,9 @@ def get_results(self): return self.results def get_filtered_results(self, org_filter, ense, ucsc): + + # Apply Filter + unfiltered_results = self.concat_without_duplicates(ense, ucsc) if org_filter: filterstrings = [x+">ACTIVE" for x in org_filter] @@ -21,6 +30,9 @@ def get_filtered_results(self, org_filter, ense, ucsc): @staticmethod def concat_without_duplicates(ense, ucsc): + + # Concat UCSC and Ensembl data without duplicates + results = ense+ucsc for ens in ense: for uc in ucsc: diff --git a/bin/Modules/ucsc/bigBedToBed b/bin/Modules/ucsc/bigBedToBed old mode 100644 new mode 100755 diff --git a/bin/Modules/ucsc/ucsc.py b/bin/Modules/ucsc/ucsc.py index 4fbad78..a554a4d 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/Modules/ucsc/ucsc.py @@ -7,21 +7,33 @@ class UcscGtf: - def __init__(self, org): + """ + Class to gather ucsc refSeq-FuncElem data. 
+ + """ + + def __init__(self, org, wd, data_dir): self.organism_id = self.get_organism_id(org) self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" - self.output = "./UCSCData/"+self.organism_id+".bed" + if data_dir: + self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed") + else: + self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed") + self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") + print("Path to Bin: " + self.path_to_bin) self.generate_gff_file() - self.ucsc_categories = self.get_activity_categories(org) + self.ucsc_categories = self.get_activity_categories(org, wd) self.gtf_lines = self.read_gff_to_gtf() print("UCSC finished !") def generate_gff_file(self): - callstring = ["./Modules/ucsc/bigBedToBed", self.link, self.output] + # Call bigBedToBed binary to get a Bed file in the UCSCData folder + callstring = [self.path_to_bin, self.link, self.output] subprocess.call(callstring) def read_gff_to_gtf(self): + # Reads Bed File and return a gtf-formatted list of elements. gtf_lines = [] with open(self.output, 'r') as csvfile: tsvreader = csv.reader(csvfile, delimiter='\t') @@ -29,23 +41,31 @@ def read_gff_to_gtf(self): sequence = [] sequence.append(row[0]) sequence.append("UCSC") - sequence.append(row[3]) + sequence.append(row[3].lower()) sequence.append(row[1]) sequence.append(row[2]) sequence.append(".") sequence.append(row[5]) sequence.append(".") - sequence.append("; ".join([self.find_ID("".join(row[11:])), ", ".join(self.get_activity(".".join(row[11:])))])) + sequence.append('; '.join([self.find_ID(''.join(row[11:])), 'activity \"'+", ".join(self.get_activity(''.join(row[11:]))) + '"'])) gtf_lines.append(sequence) return gtf_lines def find_ID(self, line): + # Find RefSeq ID in Line pattern = re.compile(r'ID:[0-9]{,9}|$') + ref_id = re.search(pattern, line).group() + splitted = ref_id.split(":") + if len(splitted) == 2: + returnstring = str(splitted[0])+' "'+str(splitted[1])+'"' + else: + returnstring = 'ID '+'"NA"' - return re.search( pattern, line).group() + return returnstring def get_activity(self, line): + # Find activity categories in bed file key_status = [] for key, value in self.ucsc_categories.items(): if value: @@ -59,14 +79,16 @@ def get_activity(self, line): @staticmethod def get_organism_id(org): + # convert intern name e.g. "homo_sapiens" to ucsc name "hg38". 
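        # Only the two assemblies used by this pipeline are mapped; any other value
        # falls through and the method implicitly returns None. An equivalent
        # one-line form (illustrative only):
        #   return {"homo_sapiens": "hg38", "mus_musculus": "mm10"}.get(org)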
if org == "homo_sapiens": return "hg38" elif org == "mus_musculus": return "mm10" @staticmethod - def get_activity_categories(organism): - path_to_config = os.path.join("./config/celltypes_" + organism + ".json") + def get_activity_categories(organism, wd): + # Method to get ucsc-celltype categories from JSON config + path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: data = json.loads(input_file.read()) @@ -77,14 +99,3 @@ def get_activity_categories(organism): def get_gtf(self): return self.gtf_lines - - def test_save_to_file(self): - - with open("./results/test.gtf", "w") as file: - write_it = csv.writer(file, delimiter='\t') - for line in self.gtf_lines: - write_it.writerow(line) - - -#u = UcscGtf("homo_sapiens") -#u.test_save_to_file() \ No newline at end of file diff --git a/bin/RegGTFExtractor.py b/bin/RegGTFExtractor.py old mode 100644 new mode 100755 index 294e539..355d4da --- a/bin/RegGTFExtractor.py +++ b/bin/RegGTFExtractor.py @@ -1,41 +1,147 @@ -#!/usr/bin/env python3 +""" + +RegGTFExtractor.py extracts regulatory-data from Ensembl and UCSC databases +and converts output to GTF-formatted file. + +@author: Sebastian Beyvers +@contact: sebastian.beyvers@med.uni-giessen.de + +""" import argparse from Modules.Ensembl.Ensembl import Ensembl from Modules.ucsc.ucsc import UcscGtf from Modules.Uniquifier import UniqueFilter from Modules.SaveResults import ResultSaver +from Modules.CrossMapper import CrossMapper import os +import json + + +def check_for_local_folder(wd): + + # Check if local folder exists and create if missing when no data_dir is specified + + if not os.path.isdir(os.path.join(wd+"/data/")): + + os.mkdir(os.path.join(wd+"/data/")) + + if not os.path.isdir(os.path.join(wd+"/data/EnsemblData")): + + os.mkdir(os.path.join(wd+"/data/EnsemblData")) + + if not os.path.isdir(os.path.join(wd+"/data/UCSCData")): + os.mkdir(os.path.join(wd+"/data/UCSCData")) + + if not os.path.isdir(os.path.join(wd+"/data/temp")): + os.mkdir(os.path.join(wd+"/data/temp")) + + +def check_for_data_dir(data_dir): + + # Check if local folder exists and create if missing when data_dir as parameter is specified + + if not os.path.isdir(os.path.join(data_dir)): + os.mkdir(os.path.join(data_dir)) + + if not os.path.isdir(os.path.join(data_dir+"/EnsemblData")): + os.mkdir(os.path.join(data_dir+"/EnsemblData")) + if not os.path.isdir(os.path.join(data_dir+"/UCSCData")): + os.mkdir(os.path.join(data_dir + "/UCSCData")) -def check_for_local_folder(): + if not os.path.isdir(os.path.join(data_dir+"/temp")): + os.mkdir(os.path.join(data_dir+"/temp")) - if not os.path.isdir("./EnsemblData"): - os.mkdir("./EnsemblData") +def check_filter(tissue_cmd, org, wd): - if not os.path.isdir( "./UCSCData" ): - os.mkdir( "./UCSCData" ) + # Checks if filter-celltype is in Json types for organism + path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json") + tissues_config = [] + if not tissue_cmd: + return False + with open(path_to_config) as input_file: + data = json.loads(input_file.read()) + for x in data: + tissues_config.append(x["type"]) -def main_script(org, tissuetype=None): + if any(tissue in tissues_config for tissue in tissue_cmd): + return True - check_for_local_folder() - ucsc = UcscGtf(org) - ense = Ensembl(org) + else: + return False + + +def check_organism(org): + + # Checks the organism input and decides if chrossmapping is necessary + + if org == "hg38": + return "homo_sapiens", False + if org == "hg19": + 
print("Older assembly Version detected: hg19 -> Crossmapping result from hg38") + return "homo_sapiens", True + elif org == "mm10": + return "mus_musculus", False + elif org == "mm9": + print("Older assembly Version detected: mm9 -> Crossmapping result from mm10") + return "mus_musculus", True + + +def main_script(organism, wd, data_dir, out, tissuetype=None): + + # Main function + + (org, x_mappable) = check_organism(organism) + if not data_dir: + check_for_local_folder(wd) + else: + check_for_data_dir(data_dir) + if check_filter(tissuetype, org, wd): + tissues = tissuetype + print("Filter detected !") + else: + tissues = None + print("Filter not detected !") + + # Get UCSC Data + ucsc = UcscGtf(org, wd, data_dir) + # Gen Ensembl Data + ense = Ensembl(org, wd, data_dir) print("Getting Unique Results") - unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissuetype) - ResultSaver(unique_filter.get_results(), org, tissuetype) + unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) + if data_dir: + ResultSaver(unique_filter.get_results(), organism, data_dir, x_mappable, True, tissues, out) + if x_mappable: + CrossMapper(organism, data_dir, out, True) + else: + ResultSaver(unique_filter.get_results(), organism, wd, x_mappable, False, tissues, out) + if x_mappable: + CrossMapper(organism, wd, out, False) if __name__ == '__main__': + + # Argumentparser + parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) - parser.add_argument('organism', help='Source organism [ homo_sapiens or mus_musculus ]', action='store', nargs='?', type=str) + parser.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) + parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str) + parser.add_argument('--dir', help='Data directory. default: "working_directory"', action='store', default="", type=str) + parser.add_argument('--out', help='Output directory: default: "."', action='store', default=".", type=str) args = vars(parser.parse_args()) + + # Check if organism exists + if args["organism"]: - #print(args["tissue"]) - main_script(args["organism"], args["tissue"]) + if args["organism"] in ["hg19", "hg38", "mm9", "mm10"]: + print("Working Dir: " + args["wd"]) + main_script(args["organism"], args["wd"], args["dir"], args["out"], args["tissue"]) + else: + print("Invalid Organism: " + args["organism"] + " see -h for help") else: - print("No Arguments found -> See ./RegGTFExtractor.py -h for help.") + print("No Arguments found -> See python3 ./RegGTFExtractor.py -h for help.") diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R index 09e3f4f..84910f3 100644 --- a/bin/bed_to_fasta.R +++ b/bin/bed_to_fasta.R @@ -1,28 +1,28 @@ -#!/usr/bin/env Rscript - -# Splitting BED-files depending on their cluster. -# The Sequences of each cluster are writen as an FASTA-file. -# @parameter bedInput BED-file with sequences and cluster-id as column"TEs -# @parameter prefix prefix for filenames -# @parameter min_seq min. 
number of sequences per cluster - -args = commandArgs(trailingOnly = TRUE) - -bedInput <- args[1] -prefix <- args[2] -min_seq <- args[3] - -bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") - -clusters <- split(bed, bed$V8, sorted = TRUE, flatten = FALSE) # <---- Spalte mit Cluster -discard <- lapply(1:length(clusters), function(i){ - clust <- as.data.frame(clusters[i]) - print(nrow(clust)) - if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[7]]) # <---- Splate mit Sequenz - outfile <- paste0(prefix,"_cluster_",i,".FASTA") - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Spalte mit Name - } else { - print(paste0("Cluster: ",i," is to small")) - } -}) +#!/usr/bin/env Rscript + +# Splitting BED-files depending on their cluster. +# The Sequences of each cluster are writen as an FASTA-file. +# @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 +# @parameter prefix prefix for filenames +# @parameter min_seq min. number of sequences per cluster + +args = commandArgs(trailingOnly = TRUE) + +bedInput <- args[1] +prefix <- args[2] +min_seq <- args[3] + +bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") + +clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column +discard <- lapply(1:length(clusters), function(i){ + clust <- as.data.frame(clusters[i]) + print(nrow(clust)) + if (nrow(clust) >= as.numeric(min_seq) ) { + sequences <- as.list(clust[[10]]) # <---- sequenze column + outfile <- paste0(prefix,"_cluster_",i,".FASTA") + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column + } else { + print(paste0("Cluster: ",i," is to small")) + } +}) diff --git a/bin/call_peaks.yaml b/bin/call_peaks.yaml deleted file mode 100644 index 4d3eabd..0000000 --- a/bin/call_peaks.yaml +++ /dev/null @@ -1,9 +0,0 @@ -name: call_peaks - -channels: - - bioconda - - conda-forge - -dependencies: - - numpy - - pybigwig \ No newline at end of file diff --git a/bin/cdhit_wrapper.R b/bin/cdhit_wrapper.R new file mode 100644 index 0000000..e5ed3e4 --- /dev/null +++ b/bin/cdhit_wrapper.R @@ -0,0 +1,158 @@ +#! /bin/Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), + make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), + make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-b", "--band_width"), default = 20, help = "Alignment band width. 
Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-M", "--memory"), default = 800, help = "Memory limit in MB. 0 for unlimited. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-T", "--threads"), default = 1, help = "Number of threads. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), + # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 + make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"), + make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), + make_option(opt_str = c("-f", "--alignment_coverage_shorter_p"), default = 0.0, help = "Alignment must cover x percent of shorter sequence. Default = %default (CD-HIT parameter: -aS)", metavar = "double"), + make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), + make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), + make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), + make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. 
Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.") + +opt <- parse_args(opt_parser) + +#' cd-hit-est wrapper +#' Receives bed with sequences in last column and writes bed with added cluster numbers. +#' +#' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). +#' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) +#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter) +#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed +#' @param clean Clean up after run. Default = TRUE +#' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) +#' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) +#' @param band_width "Alignment band width. Default = 20 (CD-HIT parameter)" +#' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) +#' @param word_length Default = 3 (CD-HIT parameter) +#' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. Default = 0 (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) +#' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param alignment_coverage_shorter_p Alignment must cover x percent of shorter sequence. Default = 0 (CD-HIT parameter) +#' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) +#' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) +#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) +#' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) +#' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) +#' @param match Matching score. 
Default = 2 (CD-HIT parameter) +#' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) +#' @param gap Gap score. Default = -6 (CD-HIT parameter) +#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) +#' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) +#' +#' TODO check whether cdhit is installed +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { + if (missing(input)) { + stop("Input parameter missing!") + } + + message("Loading bed.") + # load bed if neccessary + if (!data.table::is.data.table(input)) { + bed_table <- data.table::fread(input = input, header = FALSE) + } else { + bed_table <- input + } + + # check for duplicated names + if (anyDuplicated(bed_table[, "V4"])) { + warning("Found duplicated names. Making names unique.") + bed_table[, V4 := make.unique(V4)] + } + + message("Convert to fasta.") + ### convert bed to fasta + # 4th column = name + # last column = sequence + fasta_file <- "converted_bed.fasta" + seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + + message("Clustering.") + ### cd-hit-est + cdhit_output <- "cdhit_output" + cdhit_call <- paste("cd-hit-est -i", fasta_file, + "-o", cdhit_output, + "-c", identity, + "-A", coverage, + "-T", threads, + "-G", global, + "-b", band_width, + "-M", memory, + "-n", word_length, + "-l", throw_away_sequences, + "-s", length_dif_cutoff_shorter_p, + "-S", length_dif_cutoff_shorter_n, + "-aL", alignment_coverage_longer_p, + "-AL", alignment_coverage_longer_n, + "-aS", alignment_coverage_shorter_p, + "-AS", alignment_coverage_shorter_n, + "-uL", max_unmatched_longer_p, + "-uS", max_unmatched_shorter_p, + "-U", max_unmatched_both_n, + "-g", fast_cluster, + "-r", strand, + "-match", match, + "-mismatch", mismatch, + "-gap", gap, + "-gap-ext", gap_ext, + "-sc", sort_cluster_by_size, + "-d 0") + + system(command = cdhit_call, wait = TRUE) + + message("Formatting/ writing results.") + # reformat cluster file + # columns: id, clstr, clstr_size, length, clstr_rep, clstr_iden, clstr_cov + cluster_file <- "reformated.clstr" + cluster_call <- paste("clstr2txt.pl", paste0(cdhit_output, ".clstr"), ">", cluster_file) + + system(command = cluster_call, wait = TRUE) + + # load reformated file + cluster <- data.table::fread(cluster_file) + + ### add cluster to bed_table + result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] + + # delete files + if (clean) { + file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) + } + + data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE) +} + +# call function with given parameter if not in interactive context (e.g. 
run from shell) +if (!interactive()) { + # remove last parameter (help param) + params <- opt[-length(opt)] + do.call(cdhitest, args = params) +} diff --git a/bin/compareBed.sh b/bin/compareBed.sh index bcf52fb..aa2c37e 100644 --- a/bin/compareBed.sh +++ b/bin/compareBed.sh @@ -68,6 +68,11 @@ case $key in shift shift ;; + -p|--path) + path="$2" + shift + shift + ;; -h|--help) help=true he=true @@ -96,23 +101,24 @@ if [ $he == true ] then echo "This script utilies bedtools to select new footprints from data." echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information." + echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)" echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." echo "If a parameter is chosen, a value must be provided or an error will occur." echo "--------------------" - echo "usage: compareBed.sh --data \ --motifs \ --fasta \ \[optional_parameter \\] ..." + echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." echo "--------------------" echo " required parameter:" echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs the path to the .bed file of the scanned motifs" + echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" + echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" echo " -f --fasta the path to the .fasta file of genome" echo " " echo " optional parameter:" echo " -w --workdir the path to directory where temporary files will be created" - echo " default is a new directory that is created in the current directory" - echo " -min --min minimum size of footprints\; default is 10" - echo " -max --max maximum size of footprints\; default is 60" - echo " -o --output output path/file \; default dir is workdir and filename is newFootprints.bed and newFootprints.bed.fasta" + echo " default is the current directory" + echo " -min --min minimum size of footprints; default is 10" + echo " -max --max maximum size of footprints; default is 80" + echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" echo " -h --help shows this help message" exit 0 fi @@ -137,17 +143,17 @@ fi if [ $wo == false ] then - if [ ! -d workdir1337 ] - then - mkdir workdir1337 - fi + #if [ ! -d workdir1337 ] + #then + # mkdir workdir1337 + #fi wo=true - workdir="workdir1337" + workdir=$path fi if [ $ou == false ] then - output="newFootprints.bed" + output=${workdir}/"newMotifs.bed" ou=true fi @@ -159,25 +165,111 @@ fi if [ $ma == false ] then - max=60 + max=80 ma=true fi + +if [ ! -d $workdir ] +then + mkdir $workdir +fi + #1. first filter. no overlap vs. 
overlap -bedtools intersect -v -a $data -b $motifs > "$workdir"/pass1Tr.bed -bedtools intersect -wa -a $data -b $motifs > "$workdir"/pass1Fa.bed +echo get sequences with no overlap +cat $data > "$workdir"/pass1Tr.bed +help=true + +if [ -d "$motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +else +declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed + else + help=true + bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed + fi + echo $i +done +fi +if [ $help == false ] +then + cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed +fi +echo get overlapping sequences +bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed +cat "$workdir"/pass1Fa.bed | wc -l + +echo calc maxScore #2. compute absolut maxscore position -Rscript --vanilla maxScore.R "$workdir"/pass1Fa.bed -#3. subtract overlapping regions for additional motifs -bedtools subtract -a "$workdir"/pass1Fa.bed -b $motifs > "$workdir"/pass2Tr.bed +Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed + +#3 subtract overlapping regions for additional motifs +#echo "also get subsequences with no overlap of overlapping sequences" +help=true + +if [ -d "$motifs" ] +then +for i in "$motifs"/*.bed +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +else +for i in ${motiffiles[@]} +do + if [ $help == true ] + then + help=false + bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed + else + help=true + bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed + fi + echo $i +done +fi + +if [ $help == false ] +then + cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed +else + cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed +fi + +#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition +# also creates a small output file with information about the comparison -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -Rscript --vanilla merge.R $min $max "$workdir" +Rscript --vanilla $path/merge.R $min $max $workdir $data #5. 
add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > "$workdir"/"$output" -bedtools getfasta -name -fi $fasta -bed "$workdir"/"$output" -fo "$workdir"/"$output".fasta +bedtools getfasta -fi $fasta -bed "$workdir"/merged.bed -bedOut > $output +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta #6 clean up -rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed \ No newline at end of file +#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed diff --git a/config/celltypes_homo_sapiens.json b/bin/config/celltypes_homo_sapiens.json similarity index 100% rename from config/celltypes_homo_sapiens.json rename to bin/config/celltypes_homo_sapiens.json diff --git a/bin/config/celltypes_mus_musculus.json b/bin/config/celltypes_mus_musculus.json new file mode 100644 index 0000000..79a574c --- /dev/null +++ b/bin/config/celltypes_mus_musculus.json @@ -0,0 +1,153 @@ +[ + { + "type": "Heart", + "alias_ucsc": ["heart"], + "alias_ensembl": ["heart_adult_8_weeks", "heart_embryonic_10_5_days", + "heart_embryonic_11_5_days", "heart_embryonic_12_5_days", + "heart_embryonic_13_5_days", "heart_embryonic_14_5_days", + "heart_embryonic_15_5_days", "heart_embryonic_16_5_days", + "heart_postnatal_0_days"] + }, + { + "type": "Facial_Prominence", + "alias_ucsc": [], + "alias_ensembl": ["embryonic_facial_prominence_embryonic_10_5_days", + "embryonic_facial_prominence_embryonic_11_5_days", + "embryonic_facial_prominence_embryonic_12_5_days", + "embryonic_facial_prominence_embryonic_13_5_days", + "embryonic_facial_prominence_embryonic_14_5_days", + "embryonic_facial_prominence_embryonic_15_5_days"] + }, + { + "type": "Forebrain", + "alias_ucsc": ["forebrain"], + "alias_ensembl": ["forebrain_embryonic_10_5_days", + "forebrain_embryonic_11_5_days", + "forebrain_embryonic_12_5_days", + "forebrain_embryonic_13_5_days", + "forebrain_embryonic_14_5_days", + "forebrain_embryonic_15_5_days", + "forebrain_embryonic_16_5_days", + "forebrain_postnatal_0_days"] + }, + { + "type": "Hindbrain", + "alias_ucsc": ["hindbrain (rhombencephalon)"], + "alias_ensembl": ["hindbrain_embryonic_10_5_days", + "hindbrain_embryonic_11_5_days", + "hindbrain_embryonic_12_5_days", + "hindbrain_embryonic_13_5_days", + "hindbrain_embryonic_14_5_days", + "hindbrain_embryonic_15_5_days", + "hindbrain_embryonic_16_5_days", + "hindbrain_postnatal_0_days" + ] + }, + { + "type": "Midbrain", + "alias_ucsc": ["midbrain (mesencephalon)"], + "alias_ensembl": ["midbrain_embryonic_10_5_days", + "midbrain_embryonic_11_5_days", + "midbrain_embryonic_12_5_days", + "midbrain_embryonic_13_5_days", + "midbrain_embryonic_14_5_days", + "midbrain_embryonic_15_5_days", + "midbrain_embryonic_16_5_days", + "midbrain_postnatal_0_days" + ] + }, + { + "type": "NeuralTube", + "alias_ucsc": ["neural tube"], + "alias_ensembl": ["neural_tube_embryonic_11_5_days", + "neural_tube_embryonic_12_5_days", + "neural_tube_embryonic_13_5_days", + "neural_tube_embryonic_14_5_days", + "neural_tube_embryonic_15_5_days"] + }, + { + "type": "Intestine", + "alias_ucsc": [], + "alias_ensembl": ["intestine_embryonic_14_5_days", + "intestine_embryonic_15_5_days", + "intestine_embryonic_16_5_days", + "intestine_postnatal_0_days"] + }, + { + "type": "Kidney", + "alias_ucsc": [], + "alias_ensembl": ["kidney_adult_8_weeks", + "kidney_embryonic_14_5_days", + "kidney_embryonic_15_5_days", + 
"kidney_embryonic_16_5_days", + "kidney_postnatal_0_days"] + }, + { + "type": "Liver", + "alias_ucsc": ["liver"], + "alias_ensembl": ["liver_embryonic_11_5_days", + "liver_embryonic_12_5_days", + "liver_embryonic_13_5_days", + "liver_embryonic_14_5_days", + "liver_embryonic_15_5_days", + "liver_embryonic_16_5_days", + "liver_adult_8_weeks", + "liver_postnatal_0_days"] + }, + { + "type": "Limb", + "alias_ucsc": ["limb"], + "alias_ensembl": ["limb_embryonic_10_5_days", + "limb_embryonic_11_5_days", + "limb_embryonic_12_5_days", + "limb_embryonic_13_5_days", + "limb_embryonic_14_5_days", + "limb_embryonic_15_5_days"] + }, + { + "type": "Lung", + "alias_ucsc": [], + "alias_ensembl": ["lung_embryonic_14_5_days", + "lung_embryonic_15_5_days", + "lung_embryonic_16_5_days", + "lung_postnatal_0_days" + ] + }, + { + "type": "Stomach", + "alias_ucsc": [], + "alias_ensembl": ["stomach_embryonic_14_5_days", + "stomach_embryonic_15_5_days", + "stomach_embryonic_16_5_days", + "stomach_postnatal_0_days"] + }, + { + "type": "MEL", + "alias_ucsc": [], + "alias_ensembl": ["MEL_cell_line"] + }, + { + "type": "Spleen", + "alias_ucsc": [], + "alias_ensembl": ["spleen_adult_8_weeks"] + }, + { + "type": "Bruce", + "alias_ucsc": [], + "alias_ensembl": ["ES_Bruce4_embryonic"] + }, + { + "type": "Ganglion", + "alias_ucsc": ["dorsal root ganglion", "trigeminal V (ganglion, cranial)", "cranial nerve"], + "alias_ensembl": [] + }, + { + "type": "Other", + "alias_ucsc": ["nose", "branchial arch", + "ear", "somite", "tail", "eye", + "facial mesenchyme", "other", "melanocytes", + "blood vessels", "genital tubercle"], + "alias_ensembl": [] + } + +] diff --git a/bin/call_peaks.py b/bin/footprints_extraction.py similarity index 100% rename from bin/call_peaks.py rename to bin/footprints_extraction.py diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py index 770c8a8..cc24949 100644 --- a/bin/get_best_motif.py +++ b/bin/get_best_motif.py @@ -3,21 +3,24 @@ def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument("meme", help="Path to meme file") - parser.add_argument("output", help="") + parser.add_argument("output", help="Output file") + parser.add_argument("num", help="Number of motifs parsed from file") args = parser.parse_args() return args - +# write lines of file till certain line (MOTIF + [num]) def main(): args = parse_arguments() out = open(args.output, "w+") + number = int(args.num) + 1 + motif = "MOTIF " + str(number) with open(args.meme) as f: for line in f: - if 'MOTIF 2' in line: + if motif in line: break out.write(line) if __name__ == "__main__": import argparse - main() \ No newline at end of file + main() diff --git a/bin/maxScore.R b/bin/maxScore.R index 4ea545b..6b90984 100644 --- a/bin/maxScore.R +++ b/bin/maxScore.R @@ -1,9 +1,11 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) args = commandArgs(TRUE) file = args[1] -tab = read.table(file, header=FALSE) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") +tab = fread(file) +colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") + tab$maxpos = tab$start + tab$maxpos -write.table(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge.R b/bin/merge.R index bbe3c9a..129e41d 100644 --- a/bin/merge.R +++ b/bin/merge.R @@ -1,12 +1,15 @@ -#!/home/jhamp/.conda/envs/tfbs/bin/Rscript +#!/bin/Rscript +library(data.table) 
args=commandArgs(TRUE) min=as.numeric(args[1]) max=as.numeric(args[2]) folder=args[3] -splitted = read.table(paste(folder, "/pass2Tr.bed", sep=''), header=FALSE) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") -p1 = read.table(paste(folder, "/pass1Tr.bed", sep=''), header=FALSE) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "maxpos", "length") + +splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) +colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") +p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) +colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") + p1$maxpos = p1$start + p1$maxpos splitted=rbind(splitted, p1) @@ -19,5 +22,15 @@ splitted$length=splitted$stop - splitted$start splitted=cbind(splitted, containsMaxpos=0) splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 splitted$maxpos = splitted$maxpos - splitted$start -write.table(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') +data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') + +before = fread(args[4], header=FALSE) +sumb=sum(before$V3-before$V2) +suma=sum(splitted$length) +difference = formatC(sumb/suma, digits=4) +loss = formatC(1 - suma/sumb, digits=2) +lengthb = formatC(mean(before$V3-before$V2), digits=4) +lengtha = formatC(mean(splitted$length), digits=4) +stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1]) +write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R new file mode 100644 index 0000000..03b8cf1 --- /dev/null +++ b/bin/merge_similar_clusters.R @@ -0,0 +1,61 @@ +#!/usr/bin/env Rscript + +# Merging FASTA-files, which motifs are similar. +# +# @parameter tsv_in Path to TSV file generated by Tomtom. +# The input for Tomtom is a from all clusters merged meme-file. +# @parameter file_list Numerically sorted whitespace separated list of absolute fasta-file paths +# @parameter min_weight Minimum weight of edge allowed in graph clusters. + + +args = commandArgs(trailingOnly = TRUE) + +tsv_in <- args[1] +file_list <- args[2] +min_weight <- args[3] + +files <- unlist(as.list(strsplit(file_list, ","))) + +# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. 
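The R code below implements this grouping with igraph. As a rough Python analogue only (networkx instead of igraph, a simple pair count as the edge weight, and `fasta_files` assumed to be a dict mapping cluster numbers to the FASTA paths from `file_list`):

```python
from collections import Counter
import subprocess
import networkx as nx
import pandas as pd

def merge_similar_clusters(tsv_in, fasta_files, min_weight):
    """Group clusters that Tomtom marks as similar and concatenate their FASTA files."""
    tsv = pd.read_csv(tsv_in, sep="\t", dtype=str)
    # cluster number = last dot-separated field of the motif ID (see comment above)
    pairs = [(q.rsplit(".", 1)[-1], t.rsplit(".", 1)[-1])
             for q, t in zip(tsv["Query_ID"], tsv["Target_ID"])]
    # drop self-hits; edge weight = how often two clusters share similar motifs
    weights = Counter((int(q), int(t)) for q, t in pairs if q != t)
    g = nx.Graph()
    g.add_edges_from(edge for edge, w in weights.items() if w > int(min_weight))
    # each connected component becomes one merged Cluster_<i>.fasta
    for i, component in enumerate(nx.connected_components(g), start=1):
        members = " ".join(fasta_files[c] for c in sorted(component))
        subprocess.run(f"cat {members} > Cluster_{i}.fasta", shell=True, check=True)
```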
+tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') +query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ + tail(l,n=1) +})) +target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ + tail(l,n=1) +})) + +# create data.table with only the cluster-numbers +sim_not_unique <- data.table::data.table(query_cluster,target_cluster) +# convert from character to numeric values +sim_not_unique[, query_cluster := as.numeric(query_cluster)] +sim_not_unique[, target_cluster := as.numeric(target_cluster)] + +# remove rows if column 1 is identical to column 2 +edgelist <- sim_not_unique[query_cluster != target_cluster] + +# create graph from data.frame +g <- igraph::graph_from_edgelist(as.matrix(edgelist)) +# converting graph to adjacency matrix +adj_matrix <- igraph::get.adjacency(g, names = T) +# generating weighted graph from adjacency matrix +g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) + +# get subgraphs from graph with edges of weight > min_weight +s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F) +png('motif_clusters.png') +plot(s1) +dev.off() +clust <- igraph::clusters(s1) +if (clust$no < 1){ + b <- lapply(files, function(f){ + system(paste("cat",f,">",basename(f))) + }) +} +# merge FASTA-files depending on the clustered graphs +a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ + cl <- as.vector(which(clust$membership %in% c(i))) + fasta_list <- paste(files[cl], collapse = " ") + name <- paste0("Cluster_",i,".fasta") + system(paste("cat",fasta_list,">",name)) +}) diff --git a/bin/motif_estimation.nf b/bin/motif_estimation.nf new file mode 100644 index 0000000..430040e --- /dev/null +++ b/bin/motif_estimation.nf @@ -0,0 +1,283 @@ + +params.bed = "" +params.out = "" +params.motif_db = "" +params.path_env = "" +params.help = 0 +params.path_bin = "." + +//bed_to_clustered_fasta +params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 + +//glam2 +params.motif_min_key = 8 // Minimum number of key positions (aligned columns) +params.motif_max_key = 20 // Maximum number of key positions (aligned columns) +params.iteration = 10000 // Number of iterations done by glam2. A higher iteration number gives a more accurate result but a higher runtime. + +//tomtom +params.tomtom_treshold = 0.01 // threshold for similarity score. + +//cluster motifs +params.cluster_motif = 0 // If 1, motifs are clustered; otherwise they are not +params.edge_weight = 50 // Minimum weight of edges in motif-cluster-graph +params.motif_similarity_thresh = 0.00001 // threshold for motif similarity score +params.best_motif = 3 // Number of best motifs parsed from each MEME-file + +if (params.bed == "" || params.out == "" || params.motif_db == "" || params.path_env == "" || "${params.help}" == "1") { +log.info """ +Usage: nextflow run motif_estimation.nf --bed [PATH] --out [PATH] --motif_db [PATH] --path_env [PATH] +--bed Path Input BED-file (column with cluster: 11; column with sequence: 10) +--out Path Output Path +--motif_db Path Path to MEME-Database +--path_env Path Path to YAML file for conda environment. +--path_bin Path Path to directory with subscripts +--help 0|1 Print help message if help == 1 (Default: 0) + +Motif estimation: +--min_seq INT Sets the minimum number of sequences required for the FASTA-files given to GLAM2. (Default: 100) +--motif_min_key INT Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8) +--motif_max_key INT Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20) +--iteration INT Number of iterations done by glam2. More iterations: better results, higher runtime. (Default: 10000) +--tomtom_treshold FLOAT Threshold for similarity score. (Default: 0.01) +--best_motif INT Get the best X motifs per cluster. (Default: 3) + +Motif clustering: +--cluster_motif Boolean If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0) +--edge_weight INT Minimum weight of edges in motif-cluster-graph (Default: 50) +--motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) +""" +System.exit(2) } else { Channel.fromPath(params.bed).map {it -> [it.simpleName, it]}.set {bed_for_motif_esitmation} } + +/* +Converting BED-File to one FASTA-File per cluster +*/ +process bed_to_clustered_fasta { + conda "${params.path_env}" + publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy' + tag{name} + + input: + set name, file (bed) from bed_for_motif_esitmation + + output: + file ('*.FASTA') into fasta_for_glam2 + file ('*.FASTA') into fasta_for_motif_cluster + + script: + """ + Rscript ${params.path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} + """ +} + + +//flatten list and add the name of each file to the channel value +fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} +//combine fasta files in one list +fasta_for_motif_cluster = fasta_for_motif_cluster.toList() + +/* +Running GLAM2 on FASTA-files. +Generating Motifs through alignment and scoring best local matches. +*/ +process glam2 { + + tag{name} + publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' + + input: + set name, file (fasta) from fasta_for_glam2 + + output: + file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_for_filter + file ('*') + + script: + """ + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +/* +Combining all MEME-files to one big MEME-file. +The paths are sorted numerically depending on the cluster number. +*/ +process merge_meme { + + publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' + + input: + val (memelist) from meme_to_merge.toList() + + output: + file ('merged_meme.meme') into merged_meme + + when: + params.cluster_motif == 1 + + script: + memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } + meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } + meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} + meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") + """ + meme2meme ${meme_list} > merged_meme.meme + """ +} + +/* +Running Tomtom on merged meme-files. +Output table has the information which clusters are similar to each other.
+*/ +process find_similar_motifs { + + publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' + input: + file (merged_meme) from merged_meme + + output: + file ('all_to_all.tsv') into motif_similarity + + when: + params.cluster_motif == 1 + + script: + """ + tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv + """ +} + + +files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) + + +/* +Merging FASTA-files of similar clusters +*/ +process merge_fasta { + conda "${params.path_env}" + publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' + + input: + set file (motiv_sim), val (fasta_list) from files_for_merge_fasta + + output: + file ('*.fasta') into motif_clustered_fasta_list + file('*.png') + + when: + params.cluster_motif == 1 + + script: + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + """ + Rscript ${params.path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + """ +} + + +motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() + + +process clustered_glam2 { + + publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' + + input: + file (fasta) from motif_clustered_fasta_flat + + output: + set name, file ('*.meme') into clustered_meme_for_tomtom + set name, file ('*.meme') into clustered_meme_for_filter + file('*') + + when: + params.cluster_motif == 1 + + script: + name = fasta.getBaseName() + """ + glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +if(params.cluster_motif == 1){ + for_tomtom = clustered_meme_for_tomtom + for_filter = clustered_meme_for_filter +} else { + for_tomtom = meme_for_tomtom + for_filter = meme_for_filter +} + + +/* +Running Tomtom on meme-files generated by GLAM2. +Tomtom searches motifs in databases. +*/ +process tomtom { + + tag{name} + publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy' + + input: + set name, file (meme) from for_tomtom + + output: + set name, file ('*.tsv') into tsv_for_filter + + script: + """ + tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + """ +} + + +//Joining channels with meme and tsv files. Filter joined channel on line count. +//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. +for_filter2 = for_filter.join( tsv_for_filter ) +for_filter2 + .filter { name, meme, tsv -> + long count = tsv.readLines().size() + count <= 1 + } + .into { meme_for_scan; check } + + +//If channel 'check' is empty print errormessage +process check_for_unknown_motifs { + echo true + + input: + val x from check.ifEmpty('EMPTY') + + when: + x == 'EMPTY' + + """ + echo '>>> STOPPED: No unknown Motifs were found.' 
+ """ + +} + + +//Get the best(first) Motif from each MEME-file +process get_best_motif { + conda "${params.path_env}" + + publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' + + input: + set name, file(meme), file(tsv) from meme_for_scan + + output: + set name, file('*_best.meme') into best_motif + + script: + """ + python ${params.path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + """ +} diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R new file mode 100644 index 0000000..2f8e8a2 --- /dev/null +++ b/bin/reduce_bed.R @@ -0,0 +1,259 @@ +#! /bin/Rscript +library("optparse") + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), + make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), + make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), + make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Reduce sequences to frequent regions.") + +opt <- parse_args(opt_parser) + +#' Reduce bed file to conserved regions +#' +#' @param input bed file +#' @param kmer Length of kmer. +#' @param motif Estimated motif length. +#' @param output Output file +#' @param threads Number of threads. Default = 1. 0 for all cores. +#' @param clean Delete all temporary files. +#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 +#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) +#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). +#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. 
Default = 1 meaning that there should be approximately one motif per sequence. +#' +#' @return reduced bed +#' TODO check whether jellyfish is installed +reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { + # get number of available cores + if (threads == 0) { + threads <- parallel::detectCores() + } + + message("Loading bed...") + # load bed + # columns: chr, start, end, name, ..., sequence + bed_table <- data.table::fread(input = input, header = FALSE) + names(bed_table)[1:4] <- c("chr", "start", "end", "name") + names(bed_table)[ncol(bed_table)] <- "sequence" + # index + data.table::setkey(bed_table, name, physical = FALSE) + + # check for duplicated names + if (anyDuplicated(bed_table[, "name"])) { + warning("Found duplicated names. Making names unique.") + bed_table[, name := make.unique(name)] + } + + # remove sequences below minimum length + if (min_seq_length < max(c(kmer, motif))) { + stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of kmer and motif).") + } + + total_rows <- nrow(bed_table) + bed_table <- bed_table[nchar(sequence) > min_seq_length] + if (total_rows > nrow(bed_table)) { + message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length) + } + + # TODO forward fasta file as parameter so no bed -> fasta conversion is needed. + message("Writing fasta...") + # save as fasta + fasta_file <- paste0(basename(input), ".fasta") + seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + + message("Counting kmer...") + # count k-mer + hashsize <- 4 ^ kmer + count_output_binary <- "mer_count_binary.jf" + input <- fasta_file + jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input) + + system(command = jellyfish_call, wait = TRUE) + + mer_count_table <- "mer_count_table.jf" + jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary) + + system(command = jellyfish_dump_call, wait = TRUE) + + message("Reduce kmer.") + # load mer table + # columns: kmer, count + kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE) + # order kmer descending + data.table::setorder(kmer_counts, -V2) + + # compute number of hits to keep + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurence = motif_occurence) + + # reduce kmer + reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) + + message("Find kmer in sequences.") + # find k-mer in sequences + # columns: name, start, end, width + ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) + names(ranges_table)[1:2] <- c("relative_start", "relative_end") + + # merge ranged_table with bed_table + keep column order + merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE] + + # delete sequences without hit + merged <- na.omit(merged, cols = c("relative_start", "relative_end")) + message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.") + + message("Reduce sequences.") + # create subsequences + merged[, sequence := 
stringr::str_sub(sequence, relative_start, relative_end)] + + # bed files count from 0 + merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] + # change start end location + merged[, `:=`(start = start + relative_start, end = start + relative_end)] + + # clean table + merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] + + if (clean) { + file.remove(fasta_file, count_output_binary, mer_count_table) + } + + data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) +} + +#' Predict how many interesting kmer are possible for the given data. +#' +#' @param bed Bed table with sequences in last column +#' @param kmer Length of kmer +#' @param motif Length of motif +#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' @param motif_occurence Define how many motifs are expected per sequence. Default = 1 +#' +#' @return Number of interesting kmer. +significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurence = 1) { + if (minoverlap > kmer || minoverlap > motif) { + stop("Kmer & motif must be greater or equal to minoverlap!") + } + if (motif_occurence <= 0) { + stop("Motif_occurence must be a numeric value above 0!") + } + + # minimum sequence length to get all interesting overlaps + min_seq_length <- motif + 2 * (kmer - minoverlap) + + seq_lengths <- nchar(bed[[ncol(bed)]]) + + # reduce to max interesting length + seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths) + + # calculate max possible kmer + topx <- sum(seq_lengths - kmer + 1) + + return(topx * motif_occurence) +} + +#' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. +#' +#' @param kmer Kmer data.table columns: kmer, count +#' @param significant Value from significant_kmer function. +#' +#' @return reduced data.table +reduce_kmer <- function(kmer, significant) { + data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) + + kmer[, cumsum := cumsum(V2)] + + return(kmer[cumsum <= significant]) +} + +#' create list of significant ranges (one for each bed entry) +#' +#' @param bed Data.table of bed with sequence in last column +#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count. +#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length. +#' @param threads Number of threads. +#' +#' @return Data.table with relative positions and width (start, end, width). +#' +#' TODO Include number of motifs per sequence (aka motif_occurence). Attempt to keep best 2 regions for occurence = 2? Probably high impact on performance. 
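To make the cutoff arithmetic above concrete, here is a compact Python rendering of significant_kmer() and reduce_kmer() with a small worked example; the actual R functions operate on data.tables, so the names and types here are illustrative only:

```python
def significant_kmer(seq_lengths, kmer=10, motif=10, minoverlap=5, motif_occurence=1.0):
    """Upper bound on kmer hits that can overlap a motif by at least `minoverlap` bases."""
    # longest stretch of a sequence that can still yield informative kmers
    min_seq_length = motif + 2 * (kmer - minoverlap)
    capped = [min(length, min_seq_length) for length in seq_lengths]
    return sum(length - kmer + 1 for length in capped) * motif_occurence

def reduce_kmer(counts, significant):
    """Keep the most frequent kmers while their cumulative count stays below the cutoff."""
    kept, running = [], 0
    for kmer_seq, count in sorted(counts.items(), key=lambda kv: -kv[1]):
        running += count
        if running > significant:
            break
        kept.append(kmer_seq)
    return kept

# Example: sequences of length 25, 18 and 40 with kmer=10, motif=10, minoverlap=5:
# min_seq_length = 10 + 2 * (10 - 5) = 20, capped lengths = [20, 18, 20]
# cutoff = (20-10+1) + (18-10+1) + (20-10+1) = 31 kmer hits kept at most
print(significant_kmer([25, 18, 40]))  # -> 31.0
```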
+find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { + if (nchar(kmer_counts[1, 1]) <= minoverlap) { + stop("Minoverlap must be smaller than kmer length!") + } + + names(kmer_counts)[1:2] <- c("kmer", "count") + data.table::setorder(kmer_counts, -count) + + seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { + seq <- bed[x][[ncol(bed)]] + name <- bed[x][[4]] + + #### locate ranges + ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) + + ranges <- na.omit(ranges, cols = c("start", "end")) + + if (nrow(ranges) < 1) { + return(data.table::data.table(start = NA, end = NA, width = NA, name = name)) + } + + # add kmer sequences + ranges[, sub_seq := stringr::str_sub(seq, start, end)] + # add kmer count + ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]] + + #### reduce ranges + reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]]) + + # list of overlapping ranges + edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE)) + + # get components (groups of connected ranges) + graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE) + # vector of node membership (indices correspond to ranges above) + member <- as.factor(igraph::components(graph)$membership) + + # list of membership vectors + node_membership <- lapply(levels(member), function(x) { + which(member == x) + }) + + # calculate component score (= sum of kmer count) + score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) { + sum(kmer_counts[x, "count"]) + }) + + selected_ranges <- node_membership[[which(score == max(score))[1]]] + + # reduce selected ranges + reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) + + reduced_ranges <- data.table::as.data.table(reduced_ranges)[, name := name] + + return(reduced_ranges) + }) + + # create ranges table + conserved_regions_table <- data.table::rbindlist(seq_ranges) + + return(conserved_regions_table) +} + +# call function with given parameter if not in interactive context (e.g. 
run from shell) +if (!interactive()) { + # show apply progressbar + pbo <- pbapply::pboptions(type = "timer") + # remove last parameter (help param) + params <- opt[-length(opt)] + do.call(reduce_bed, args = params) +} diff --git a/bin/tfbsscan.py b/bin/tfbsscan.py new file mode 100644 index 0000000..f17ebf6 --- /dev/null +++ b/bin/tfbsscan.py @@ -0,0 +1,757 @@ +""" +TFBSscan.py produces the data to be used to join the footprint and motif information across genome + +@author: Anastasiia Petrova +@contact: anastasiia.petrova(at)mpi-bn.mpg.de + +""" + +import argparse +import sys +import os +import re +import time +import multiprocessing +import logging +import subprocess +from Bio import SeqIO +import Bio.SeqIO.FastaIO as bio +import textwrap +import MOODS.scan +import MOODS.tools +import MOODS.parsers + +logger = logging.getLogger('mytool') +logger.setLevel(logging.INFO) + +formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") + +fh = logging.FileHandler('final_log.txt') +fh.setLevel(logging.INFO) +fh.setFormatter(formatter) +logger.addHandler(fh) + +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(formatter) +logger.addHandler(ch) + +#catch all the information about input and output files as well as information on the used tool (fimo or moods) +def parse_args(): + parser = argparse.ArgumentParser(prog = 'mytool_ver6', description = textwrap.dedent(''' + + This script takes a list of motifs loaded from jaspar.genereg.net as a combined text file in MEME or .PFM format, a genome file in FASTA format and optionaly a .bed file (the one you want to be merged with the whole genome file) as input. If you want to merge a .bed file with the whole genome file, please enter --bed_file or -b bevor your .bed file. The tool will provide a file output_merge.fa, which you can also use for your research later on. If you already have a merged file, please give this one as genome file input. If there are several motifs in the input file, the tool will create a separate output file for each motif. Choose if you want to use fimo or moods with --use, this script uses by default fimo. Please note that the tool can not provide the calculation of q values with fimo due to the text mode that fimo needs to use. The tool sends merged genome file and motifs to fimo or moods, saves the sorted output for each of the given motifs as moods/fimo_output_[alternate name and id of the motif].txt in the output directory, then calculates the start and the end as real positions on the chromosom and writes this information in the ouput files. The columns in the output file are: chromosom, start, end, the name and score of TF. If a .bed file was given as input, the tool will also add the additional columns from it to the output. If the output file is empty, there were no machtes within given genome regions. Please note, if you want to have all intermediate output files, enter --clean nothing + + '''), epilog='That is what you need to make this script work for you. 
Enjoy it') + + required_arguments = parser.add_argument_group('required arguments') + + required_arguments.add_argument('-m', '--motifs', help='file in MEME format with mofits loaded from jaspar.genereg.net') + required_arguments.add_argument('-g', '--genome', help='a whole genome file or regions of interest in FASTA format to be scanned with motifs') + + #all other arguments are optional + parser.add_argument('-o', '--output_directory', default='output', const='output', nargs='?', help='output directory, default ./output/') + parser.add_argument('-b', '--bed_file', nargs='?', help='a .bed file to be merged with the whole genome file to find regions of interest') + parser.add_argument('--use', '--use_tool', default='fimo', const='fimo', nargs='?', choices=['fimo', 'moods'], help='choose the tool to work with, default tool is fimo') + parser.add_argument('--clean', nargs='*', choices=['nothing', 'all', 'cut_motifs', 'fimo_output', 'merge_output', 'moods_output'], dest='cleans', help='choose the files you want to delete from the output directory, the default is deleting all the temporary files from the directory') + parser.add_argument('--fimo', help='enter additional options for fimo using = inside "", for example fimo="--norc" to not score the reverse complement DNA strand. By default the --text mode is used and the calculation of the q values due to the --text mode is not possible') + parser.add_argument('--cores', type=int, help='number of cores allowed to use by this tool, by default the tool uses 2 cores', default=2) + parser.add_argument('-p', '--p_value', type=float, help='enter the p value, the default p value is 0.0001. Please note that if you enter the p value using --fimo="--thresh ..." as well, the one within --fimo call will be used', default=0.0001) + parser.add_argument('--resolve_overlaps', action='store_true', help='delete overlaps with greater p value, by default no overlaps are deleted') + parser.add_argument('--hide_info', action='store_true', help='while working with data write the information only into ./log.txt') + parser.add_argument('--moods_bg', nargs='+', type=float, help='set the bg for moods, by default moods uses the bg is 0.25 0.25 0.25 0.25') + + args = parser.parse_args() + return args + +def check_directory(directory): + if not os.path.exists(directory): + os.makedirs(directory) + logger.info('a new directory ' + directory + ' was created') + +#merge the whole genome with the regions mentioned in .bed file +def merge(genome, bed_file, output_directory): + logger.info('the merging of files ' + genome + ' and ' + bed_file + ' will end soon, the result file is output_merge.fa') + output_merge = os.path.join(output_directory, "output_merge.fa") + os.system("bedtools getfasta -fi " + genome + " -bed " + bed_file + " -fo " + output_merge) + return output_merge + +#split the motifs each in other file +def split_motifs(motifs, output_directory, usage): + logger.info("the file with motifs " + motifs + " will be checked for motifs and if needed splitted in files each containing only one motif") + + first_line = subprocess.getoutput("head -1 " + motifs) #find the first line of the input file + + if usage == "moods": + if first_line.startswith(">"): + #the motif file probably has the .pfm format, try to read and split it + splitted_motifs = read_pfm(motifs, output_directory) + else: #maybe the file with motifs is in MEME format, so try to convert it + logger.info("the input file has not the expected format, I will try to convert it to .pfm format") + splitted_motifs = 
convert_meme_to_pfm(motifs, output_directory) + + elif usage == "fimo": + if first_line.startswith("MEME version"): + #the motifs file has probably the MEME format, try to read and split it + splitted_motifs = read_meme(motifs, output_directory) + + #if the there was a convertion before, delete all the .pfm files as we don't need them + for filename in os.listdir(output_directory): + if filename.endswith(".pfm"): + remove_file(os.path.join(output_directory, filename)) + + else: #maybe the file with motifs is in .pfm format, so try to convert is + logger.info("the input file has not the expected format, I will try to convert it to MEME format") + splitted_motifs = convert_pfm_to_meme(motifs, output_directory) + + return splitted_motifs + +def read_pfm(motifs, output_directory): + splitted_motifs = [] #to save the names of files after splitting + motif = [] #to save the motif itself, which will be written to the file + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + #as the motif has first line with the name and 4 lines with information, if the 5th line is something else than the name of the next motif, the exit will be forced + if lines == 5 and not line.startswith(">"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + else: + if line.startswith(">"): + if 'written_file' in locals(): + written_file.write(''.join(motif)) + motif = [] + lines = 0 + written_file.close() + + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_id = re.split(' ', line[1:])[0] #[1:] meands do not use the first character + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') + + splitted_motifs.append(motif_name) + written_file = open(motif_name, 'w') + + if lines >= 1 and lines <= 4: #one motif has 5 lines, the first consists the name, the next 4 - the information we need to proceed the data within moods + motif.append(line) + + lines = lines + 1 + written_file.write(''.join(motif)) + written_file.close() + + return splitted_motifs + +def read_meme(motifs, output_directory): + splitted_motifs = [] #to save the names of files after splitting + motif = [] #to save the motif itself, which will be written to the file + head = [] #define a list for header, as fimo needs a header in each motif file it proceedes + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + #make the head part + if lines <= 8: + if lines == 0 and not line.startswith("MEME version"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + + head.append(line) + else: + #search for motifs and save each to another file + if line.startswith("MOTIF"): + if 'written_file' in locals(): + written_file.write(''.join(motif)) + motif = [] + written_file.close() + + #the alternate name will be checked for validity and the invalid chars will be replaced with '_' + if len(re.split(' ', line.rstrip())) == 3: #in the input motif file the motif id and the alternate name are splitted using the tab, otherwise they are splitted using _, but we do not want to change it if so + motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) + motif_id = re.split(' ', line)[1] + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.meme') + else: + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_name = 
os.path.join(output_directory, motif_alternate_name + '.meme') + + #make a list with all the motif names to know which files to iterate when fimo is called + splitted_motifs.append(motif_name) + + written_file = open(motif_name, 'w') + written_file.write(''.join(head)) + + motif.append(line) + + lines = lines + 1 + + #write the last motif + written_file.write(''.join(motif)) + written_file.close() + + read_file.close() + + return splitted_motifs + +def convert_meme_to_pfm(motifs, output_directory): + #i can only convert the file to pfm if the motifs file is in MEME format + + splitted_motifs = [] #to save the names of files after splitting + rows = [[] for row in range(4)] + + with open(motifs) as read_file: + lines = 0 + for line in read_file: + if lines == 0 and not line.startswith("MEME version"): + logger.info('please make sure that the file with motifs has a right format and the number of lines is right in the motif file') + sys.exit() + else: + #search for motifs and save each to another file + if line.startswith("MOTIF"): + + if 'written_file' in locals(): + for row in rows: + written_file.write('\t'.join(row) + '\n') + + rows = [[] for row in range(4)] + + written_file.close() + + #the alternate name will be checked for validity and the invalid chars will be replaced with '_' + if len(re.split(' ', line.rstrip())) == 3: #in the input motif file the motif id and the alternate name are splitted using the tab, otherwise they are splitted using _, but we do not want to change it if so + motif_alternate_name = check_name(re.split(' ', line)[2].rstrip()) + motif_id = re.split(' ', line)[1] + motif_name = os.path.join(output_directory, motif_alternate_name + '_' + motif_id + '.pfm') + + else: + motif_alternate_name = check_name(re.split(' ', line)[1].rstrip()) + motif_name = os.path.join(output_directory, motif_alternate_name + '.pfm') + + #make a list with all the motif names to know which files to iterate when fimo is called + splitted_motifs.append(motif_name) + + written_file = open(motif_name, 'w') + + elif line.startswith("letter-probability matrix"): + columns = int(re.split(' ', re.split('w= ', line)[1])[0]) #find the number of columns from the line out of the motifs file + nsites = int(re.split(' ', re.split('nsites= ', line)[1])[0]) #find the nsites to count the frequency count for .pfm file + + elif line.startswith(' '): #each line with information about frequency starts in MEME format with ' ' + for i in range(len(rows)): + rows[i].append(str(round(float(re.findall(r'\S+', line)[i])*nsites))) #split the line, do not mention how much whitespaces are in between, multiply it with nsites and save it to the corresponding row + + lines = lines + 1 + + #write the last motif + for row in rows: + written_file.write('\t'.join(row) + '\n') + + written_file.close() + read_file.close() + + return splitted_motifs + +def convert_pfm_to_meme(motifs, output_directory): + #i can only convert the file to meme, if motifs file is in .pfm format + + #first we need to split the pfm motifs as the jaspar2meme does not work with the files containing several motifs, but with the directory consisting files each with only one motif in pfm format + pfm_motifs = read_pfm(motifs, output_directory) + + converted_name = os.path.join(output_directory, "converted_motifs.meme") + + os.system("jaspar2meme -pfm " + output_directory + " > " + converted_name) + + #need to call split motifs for meme file + splitted_motifs = split_motifs(converted_name, output_directory, "fimo") + + remove_file(converted_name) + return 
splitted_motifs + +#if there are chars that are not allowed, they will be replaced with '_', to the possibly invalid names there will be added '_' at the beginning of the name +def check_name(name_to_test): + badchars= re.compile(r'[^A-Za-z0-9_. ]+|^\.|\.$|^ | $|^$') + badnames= re.compile(r'(aux|com[1-9]|con|lpt[1-9]|prn)(\.|$)') + + #replace all the chars that are not allowed with '_' + name = badchars.sub('_', name_to_test) + + #check for the reserved by the os names + if badnames.match(name): + name = '_' + name + return name + +#use fimo to make a file +def call_fimo(fimo_data, p_value, one_motif, genome, output_directory): + + #make the filename for the fimo output + fimo_output_file = os.path.join(output_directory, "fimo_output_" + os.path.splitext(os.path.basename(one_motif))[0] + ".txt") + fimo_output_unsorted = os.path.join(output_directory, "fimo_output_unsorted_" + os.path.splitext(os.path.basename(one_motif))[0] + ".txt") + + #check if user needs special options for the fimo + if fimo_data != None: + fimo_data = fimo_data + " --thresh " + str(p_value) + " " + else: + fimo_data = "--thresh " + str(p_value) + " " #add the passed p value to the fimo options + + #call fimo for this motif and save the output to a temporary file + send_to_fimo = "fimo --text --no-qvalue " + fimo_data + one_motif + " " + genome + " > " + fimo_output_unsorted + + logger.info('fimo proceed the data using this call ' + send_to_fimo) + + fimo_stdout = subprocess.getoutput(send_to_fimo) + + #the normal output from fimo starts with Using motif ..., so print it from the logger, otherwise print what else fimo says + if fimo_stdout.startswith("Using") and re.split('\n', fimo_stdout)[1]: + logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[0].rstrip()) + logger.info('info from fimo: ' + re.split('\n', fimo_stdout)[1].rstrip()) + else: #there were some problems with fimo, so we want to see what they were + logger.info('info from fimo: ' + fimo_stdout) + + + if not os.path.isfile(fimo_output_unsorted): + logger.info('the usage of fimo was crashed, there is no required output file, the exit is forced') + sys.exit() + + if os.stat(fimo_output_unsorted).st_size == 0: #if the output of fimo is empty + fimo_output_unsorted = fimo_output_unsorted.replace('unsorted_', '') + return fimo_output_unsorted + else: + #if the file was converted from pfm, the second column contains the positions, so we want to sort using this column, and not the next one + tmp_path = os.path.join(output_directory, "tmp/") + second_line = subprocess.getoutput("sed -n '2{p;q}' " + fimo_output_unsorted) + if re.split('\t', second_line)[2].startswith("chr"): #the re.split[1] is a ' ', so take the [2] + os.system("cat " + fimo_output_unsorted + " | sort -T " + tmp_path + " -k 2 -V > " + fimo_output_file) + else: + #we are sorting after the third column, which looks like chr1:123-126, -V means it will sort the digitals and not the strings + os.system("cat " + fimo_output_unsorted + " | sort -T " + tmp_path + " -k 3 -V > " + fimo_output_file) + + #make sure the output of fimo exists + if not os.path.isfile(fimo_output_file): + logger.info('the sorting of the output file from the fimo was crashed, the exit is forced') + sys.exit() + else: + return fimo_output_file + +def call_moods(one_motif, genome, output_directory, p_value, moods_bg): + + # setting standard parameters for moods + pseudocount = 0.0001 + + if moods_bg == None: + bg = MOODS.tools.flat_bg(4) + else: + bg = tuple(moods_bg) + + logger.info("moods will work with the p_value " + 
str(p_value) + " and the bg " + str(bg)) + + motif_name = os.path.basename(one_motif) + + moods_output_unsorted_name = os.path.join(output_directory, "moods_output_unsorted_" + os.path.splitext(motif_name)[0] + ".txt") + moods_output_file_unsorted = open(moods_output_unsorted_name, 'w') + + moods_output_name = os.path.join(output_directory, "moods_output_" + os.path.splitext(motif_name)[0] + ".txt") + moods_output_file = open(moods_output_name, 'w') + + matrix_names = [os.path.basename(one_motif)] + + matrices = [] + matrices_rc = [] + + valid, matrix = pfm_to_log_odds(one_motif, bg, pseudocount) + + key_for_bed_dict = '' + + if valid: + + logger.info("please be patient, moods is working on the data") + + matrices.append(matrix) + matrices_rc.append(MOODS.tools.reverse_complement(matrix,4)) + matrices_all = matrices + matrices_rc + thresholds = [MOODS.tools.threshold_from_p(m, bg, p_value, 4) for m in matrices_all] + + scanner = MOODS.scan.Scanner(7) + scanner.set_motifs(matrices_all, bg, thresholds) + + with open(genome) as handle: + + seq_iterator = bio.SimpleFastaParser(handle) + + for header, seq in seq_iterator: + + header_splitted = re.split(r':', header) + + if len(header_splitted) == 1: #if there are no positions given + header = header + ":0-" #set the first position as 0 and split it once more + header_splitted = re.split(r':', header) + logger.info("moods works with " + header) + else: #the given genome file is a file with peaks, so use the header of the peak as a key to search in the bed dictionary for additional information later on + key_for_bed_dict = header + + chromosom = header_splitted[0] + positions = re.split(r'-', header_splitted[-1]) + + results = scanner.scan(seq) + + fr = results[:len(matrix_names)] #forward strand + rr = results[len(matrix_names):] #reverse strand + + results = [[(r.pos, r.score, '+', ()) for r in fr[i]] + + [(r.pos, r.score, '-', ()) for r in rr[i]] for i in range(len(matrix_names))] #use + and - to indicate strand + + for (matrix, matrix_name, result) in zip(matrices, matrix_names, results): + + motif_id = re.split(r'_', matrix_name)[-1] #find the id of the given morif + motif_alternate_name = matrix_name.replace(motif_id, '')[:-1] #the alternate name of the motif is the name of the file without id and with cutted last character, that is _ + + if len(matrix) == 4: + l = len(matrix[0]) + if len(matrix) == 16: + l = len(matrix[0] + 1) + for r in sorted(result, key=lambda r: r[0]): + strand = r[2] + pos = r[0] + hitseq = seq[pos:pos+l] #sequence + #score = r[1] + score = format(r[1], '.15f') #round to 15 digits after floating point, already type str + + if key_for_bed_dict != '': + start = pos + 1 + end = pos + len(hitseq) + chromosom = key_for_bed_dict #instead of only the name of chromosom write the key to search in the bed_file + else: + start = int(positions[0]) + pos + 1 + end = start + len(hitseq) - 1 + + #moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, str(score)]) + '\n') + moods_output_file_unsorted.write('\t'.join([motif_id, motif_alternate_name, chromosom, str(start), str(end), strand, score]) + '\n') + + #now sort the output of moods + tmp_path = os.path.join(output_directory, "tmp/") + os.system("cat " + moods_output_unsorted_name + " | sort -T " + tmp_path + " -k 1 -V > " + moods_output_name) + + moods_output_file_unsorted.close() + moods_output_file.close() + + return moods_output_name + + else: + logger.info("The input for moods was not validated by the 
MOODS.parsers.pfm. Please check if it has the right format (note that the MOODS accepts only the old version of .pfm files, that is one without the header containing the name and id of the motif)") + sys.exit() + +#help function for the moods call, convert pfm to log odds +def pfm_to_log_odds(filename, bg, pseudocount): + if pfm(filename): + mat = MOODS.parsers.pfm_to_log_odds(filename, bg, pseudocount) + if len(mat) != 4: #if something went wrong, the empty list will be returned + return False, mat + else: + return True, mat + else: + logger.info('please make sure the motif file has a .pfm format needed for moods') + sys.exit() + +#help function for the moods call, check if the file is in a pfm format using moods +def pfm(filename): + mat = MOODS.parsers.pfm(filename) + if len(mat) != 4: + return False + else: + return True + +# calculate the real positions of TFs, if needed, resolve the overlaps, and write to the output file +def write_output_file(input_file, bed_dictionary, resolve_overlaps): + + if os.path.basename(input_file).startswith("moods"): + name_without_moods_or_fimo = input_file.replace('moods_output_', '') + used_tool = "moods" + else: + name_without_moods_or_fimo = input_file.replace('fimo_output_', '') + used_tool = "fimo" + + output_file_name = os.path.splitext(name_without_moods_or_fimo)[0] + ".bed" + + logger.info('writing the output file ' + output_file_name) + output_file = open(output_file_name, 'w') + + #calculate the real positions of TFs and write the information in the output file + with open(input_file) as read_file: + + overlap = [] + printed_line = [] + last_line = [] + key_for_bed_dict = '' + + for line in read_file: + if not line.startswith('#'): + + line_to_write = [] + + line_array = re.split(r'\t', line.rstrip('\n')) + + chromosom_and_positions = re.split(r':', line_array[2]) + if len(chromosom_and_positions) == 1: #the whole genome was given, there is nothing to split + chromosom = line_array[2] + start = line_array[3] + end = line_array[4] + else: + positions = re.split(r'-', chromosom_and_positions[-1]) + chromosom = chromosom_and_positions[0] + start = str(int(positions[0]) + int(line_array[3])) + end = str(int(positions[0]) + int(line_array[4])) + key_for_bed_dict = line_array[2] #use only if there is a bed_dictionary in input + + #------- these are 5 needed columns to succesfully proceed the data + + name = os.path.splitext(os.path.basename(name_without_moods_or_fimo))[0] + + score = line_array[6] + + line_to_write.extend([chromosom, start, end, name, score]) + #------ here the additional information coule be added to the output file + + strand_inf = line_array[5] + line_to_write.append(strand_inf) + + if used_tool == "fimo": + p_value = line_array[7] + line_to_write.append(p_value) + + #if the dictionary is not empty check for the information corresponding to these positions + if bed_dictionary and key_for_bed_dict in bed_dictionary: + line_to_write.append('\t'.join(bed_dictionary[key_for_bed_dict])) + + line_to_write.insert(0, "write") #insert a flag to know if the line should be written or not + + last_line = line_to_write #save the line in case it is the last line and if due to the check_overlap it could not be printed + + if resolve_overlaps: + overlap, line_to_write, printed_line = check_overlap(line_to_write, overlap, printed_line, output_file) + + write_line_not_overlap(output_file, line_to_write) + + if not last_line[0].startswith('write'): #it could be that the write flag was deleted from the last_line so append it back + overlap.insert(0, 
"write") + + #if there is already any printed line, check if the saved last line was already printed. Otherwise print it + if resolve_overlaps: + if printed_line: + if last_line[1] != printed_line[1] or last_line[2] != printed_line[2]: + write_line_not_overlap(output_file, last_line) + + output_file.close() + +def write_line_not_overlap(output_file, line_to_write): + if line_to_write: #if line_to_write is not empty + if line_to_write[0].startswith('write'): #if the line does not contain an overlap, it contains the flag "write" at the first position + line_to_write.pop(0) #delete the flag + output_file.write('\t'.join(line_to_write) + '\n') + +def check_overlap(line_to_write, overlap, printed_line, output_file): + + is_overlap = None + + if not overlap: #if the overlap list is empty + is_overlap = False + + else: #if the overlap list is not empty + + if not overlap[0].startswith('write'): #it could be that the write flag was deleted from the overlap so append it back to make sure the next if clauses run right + overlap.insert(0, "write") + + if line_to_write[1] == overlap[1] and float(line_to_write[2]) < float(overlap[3]): #the current line could overlap the previous because the start of the current line is smaller than the end of the previous one and they are both on the one chromosom + #if p value in the current line is smaller than p value in the previous line (or score is bigger), save the current line as possible overlap for future + if float(line_to_write[5]) > float(overlap[5]): + is_overlap = False + else: + #if the p value in the current line is greater than p value in the previous line or are these both p values the same, the current line will not be printed, but also it will not be saved + line_to_write.pop(0) + is_overlap = None + else: #it is an other chromosom or the start at the current line is greater or the same as the end of the previous one + if printed_line != overlap: + is_overlap = True + else: + is_overlap = False + + if is_overlap == False: + overlap = line_to_write #save the current line + line_to_write.pop(0) #do not print anything due to deleting the flag ("write") + elif is_overlap == True: + if not overlap[0].startswith('write'): + overlap.insert(0, "write") + printed_line = overlap #the previous line is saved as temporary line to print it later on + overlap = line_to_write #save the current line + line_to_write = printed_line #print the temporary saved line + + return overlap, line_to_write, printed_line + +def remove_file(file): + if os.path.isfile(file): + os.remove(file) + +def clean_directory(cleans, output_directory, motif, tool_output_file): + + fimo_output_unsorted = os.path.join(tool_output_file.replace("fimo_output", "fimo_output_unsorted")) + moods_output_unsorted = os.path.join(tool_output_file.replace("moods_output", "moods_output_unsorted")) + + for clean in cleans: + if clean == 'all': + remove_file(motif) + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + remove_file(moods_output_unsorted) + elif clean == 'fimo_output': + if os.path.basename(tool_output_file).startswith("fimo"): + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + elif clean == 'cut_motifs': + remove_file(motif) + elif clean == 'moods_output': + if os.path.basename(tool_output_file).startswith("moods"): + remove_file(tool_output_file) + remove_file(fimo_output_unsorted) + +def tool_make_output(usage, motif, genome, output_directory, cleans, p_value, bed_dictionary, fimo_data, resolve_overlaps, moods_bg): + try: + if usage == "fimo": + 
+            tool_output_file = call_fimo(fimo_data, p_value, motif, genome, output_directory)
+        elif usage == "moods":
+            tool_output_file = call_moods(motif, genome, output_directory, p_value, moods_bg)
+
+        output = write_output_file(tool_output_file, bed_dictionary, resolve_overlaps)
+    finally:
+        clean_directory(cleans, output_directory, motif, tool_output_file)
+
+def multiprocess(motifs, genome, output_directory, cleans, fimo_data, p_value, bed_dictionary, cpu_count, resolve_overlaps, usage, moods_bg):
+
+    if cleans is None:
+        cleans = ['all']
+
+    pool = multiprocessing.Pool(cpu_count) #cpu_count defaults to 2
+
+    length = len(motifs) #the number of motifs, needed to compute the percentage of the job that is done
+    step = 100/length #the percentage that is added each time a single motif has been processed
+
+    tasks = [] #the jobs submitted to the pool are collected here
+
+    for motif in motifs:
+        tasks.append(pool.apply_async(tool_make_output, args = (usage, motif, genome, output_directory, cleans, p_value, bed_dictionary, fimo_data, resolve_overlaps, moods_bg, )))
+
+    tasks_done = sum([task.ready() for task in tasks]) #the number of processes that have finished their job
+
+    #poll the number of finished processes until all jobs submitted to the pool are done
+    while tasks_done < len(tasks):
+        #if the number of finished processes has changed, save the new number and print the percentage of the job that is done
+        if sum([task.ready() for task in tasks]) != tasks_done:
+            tasks_done = sum([task.ready() for task in tasks])
+            sys.stdout.write("%-100s| %d%% \r" % (''*tasks_done, step*tasks_done))
+            sys.stdout.flush()
+            sys.stdout.write("\n")
+        #update the number of finished processes every 0.05 seconds
+        time.sleep(0.05)
+
+    pool.close()
+    pool.join() #make sure all the processes are done and exit
+
+    #the worker processes should not delete the merged genome file themselves,
+    #so check here whether this file is still needed and delete it otherwise
+    for clean in cleans:
+        if clean == 'all' or clean == 'merge_output':
+            for filename in os.listdir(output_directory):
+                if filename == "output_merge.fa":
+                    remove_file(genome)
+
+    if clean != 'nothing':
+        logger.info('the directory ' + output_directory + ' was cleaned, only the required files were left')
+
+def make_bed_dictionary(bed_file):
+
+    bed_dictionary = {}
+
+    with open(bed_file) as read_bed_file:
+        for bed_line in read_bed_file:
+            bed_line_array = re.split(r'\t', bed_line.rstrip('\n'))
+            if bed_line_array[1].isdigit() and bed_line_array[2].isdigit() and int(bed_line_array[1]) <= int(bed_line_array[2]): #in a valid bed file the second column is the start position and the third column is the end position, so check that both are integers and that the start is not greater than the end
+                key = bed_line_array[0] + ":" + bed_line_array[1] + "-" + bed_line_array[2]
+                value = []
+                for i in range(3, len(bed_line_array)):
+                    value.append(bed_line_array[i])
+
+                bed_dictionary[key] = value
+            else: #this is not a bed file, force exit
+                logger.info('please make sure the input bed file has the right format, the problem occurred on the line ' + bed_line)
+                sys.exit()
+
+    read_bed_file.close()
+
+    return bed_dictionary
+
+def is_fasta(check_fasta):
+    if not os.path.isfile(check_fasta):
+        logger.info('there is no file with the genome, the exit is forced')
+        sys.exit()
+    else:
+        # modified code from https://stackoverflow.com/questions/44293407/how-can-i-check-whether-a-given-file-is-fasta
+
+        with open(check_fasta, "r") as handle:
+            fasta = SeqIO.parse(handle, "fasta")
+            return 
any(fasta) # False when `fasta` is empty, i.e. wasn't a FASTA file + +def check_existing_input_files(args): + if not args.motifs or not args.genome: + logger.info('there is no satisfied input, please enter --help or -h to learn how to use this tool') + sys.exit() + elif not is_fasta(args.genome): + logger.info('please make sure the input genome file has a fasta format') + sys.exit() + #check if the file with motifs exists + elif not os.path.isfile(args.motifs): + logger.info('there is no file with motifs, the exit is forced') + sys.exit() + #if the bedfile was given as input, check if this file exists + elif args.bed_file: + if not os.path.isfile(args.bed_file): + logger.info('there is no such bed file ' + args.bed_file + ', the exit is forced') + sys.exit() + +def check_fimo_version(): + fimo_version = subprocess.getoutput("fimo --version") #works only on python 3.4 + #fimo_version = int(fimo_version.replace('.', '')) #replace the . in the version to be able to compare it as int + if fimo_version < "4.12.0": + logger.info('please make sure you are using fimo version 4.12.0 or the newer one') + sys.exit() + +def main(): + + + args = parse_args() + + if args.use == "fimo": + check_fimo_version() + + #if user do not want to see the information about the status of jobs, remove the handler, that writes to the terminal + if args.hide_info: + #logger.disabled = True + logger.removeHandler(ch) + + check_existing_input_files(args) + + #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user + check_directory(args.output_directory) + + splitted_motifs = split_motifs(args.motifs, args.output_directory, args.use) + + #check if there is a .bed file to merge the genome file with. If so, merge them + if args.bed_file: + bed_dictionary = make_bed_dictionary(args.bed_file) + args.genome = merge(args.genome, args.bed_file, args.output_directory) + else: + bed_dictionary = {} + + #if the usage is moods, call moods, otherwise call fimo + multiprocess(splitted_motifs, args.genome, args.output_directory, args.cleans, args.fimo, args.p_value, bed_dictionary, args.cores, args.resolve_overlaps, args.use, args.moods_bg) + + for handler in logger.handlers: + handler.close() + logger.removeFilter(handler) + +if __name__ == "__main__": + main() diff --git a/config/celltypes_mus_musculus.json b/config/celltypes_mus_musculus.json deleted file mode 100644 index 0f02cde..0000000 --- a/config/celltypes_mus_musculus.json +++ /dev/null @@ -1,179 +0,0 @@ -[ - { - "type": "A549", - "alias_ucsc": [], - "alias_ensembl": ["A549"] - }, - { - "type": "Aorta", - "alias_ucsc": ["branchial arch"], - "alias_ensembl": ["Aorta"] - }, - { - "type": "Thymus", - "alias_ucsc": [], - "alias_ensembl": ["Fetal_Thymus", "Thymus"] - }, - { - "type": "B-Cells", - "alias_ucsc": [], - "alias_ensembl": ["B_cells_PB_Roadmap", "naive_B_cell_VB", "GM12878"] - }, - { - "type": "T-Cell", - "alias_ucsc": [], - "alias_ensembl": ["CD4_ab_T_cell_VB", "CM_CD4_ab_T_cell_VB", "CD8_ab_T_cell_CB", "T_cells_PB_Roadmap"] - }, - { - "type": "Monocyte", - "alias_ucsc": [], - "alias_ensembl": ["CD14CD16__monocyte_CB", "CD14CD16__monocyte_VB", "Monocytes_CD14", "Monocytes_CD14_PB_Roadmap"] - }, - { - "type": "Neutrophil", - "alias_ucsc": [], - "alias_ensembl": ["neutrophil_CB", "neutrophil_myelocyte_BM", "neutrophil_VB"] - }, - { - "type": "Eosinophil", - "alias_ucsc": [], - "alias_ensembl": ["eosinophil_VB"] - }, - { - "type": "Macrophage", - "alias_ucsc": [], - "alias_ensembl": [ - 
"M0_macrophage_CB", - "M0_macrophage_VB", - "M1_macrophage_CB", - "M1_macrophage_VB", - "M2_macrophage_CB", - "M2_macrophage_VB" - ] - }, - { - "type": "Erythroblast", - "alias_ucsc": [], - "alias_ensembl": ["erythroblast_CB"] - }, - { - "type": "Intestine", - "alias_ucsc": [], - "alias_ensembl": ["Fetal_Intestine_Large", "Fetal_Intestine_Small", "Small_Intestine"] - }, - { - "type": "AdrenalGland", - "alias_ucsc": [], - "alias_ensembl": ["Fetal_Adrenal_Gland"] - }, - { - "type": "Muscle", - "alias_ucsc": ["limb"], - "alias_ensembl": ["Fetal_Muscle_Leg", "Fetal_Muscle_Trunk", "Psoas_Muscle", "HSMM", "HSMMtube"] - }, - { - "type": "Gastric", - "alias_ucsc": [], - "alias_ensembl": ["Fetal_Stomach", "Gastric"] - }, - { - "type": "Endothelial", - "alias_ucsc": ["blood vessels"], - "alias_ensembl": ["EPC_VB", "HMEC", "HUVEC", "HUVEC_prol_CB", "NHEK"] - }, - { - "type": "StemCells", - "alias_ucsc": [], - "alias_ensembl": ["H1ESC", "H1_mesenchymal", "H1_neuronal_progenitor", "H1_trophoblast", "H9", "MSC_VB", "iPS_20b", "iPS_DF_6_9", "iPS_DF_19_11"] - }, - { - "type": "Lung", - "alias_ucsc": [], - "alias_ensembl": ["Lung", "IMR90", "NHLF"] - }, - { - "type": "Pancreas", - "alias_ucsc": ["pancreas"], - "alias_ensembl": ["Pancreas"] - }, - { - "type": "Liver", - "alias_ucsc": ["liver"], - "alias_ensembl": [] - }, - { - "type": "Ovary", - "alias_ucsc": [], - "alias_ensembl": ["Ovary"] - }, - { - "type": "Placenta", - "alias_ucsc": [], - "alias_ensembl": ["Placenta"] - }, - { - "type": "Spleen", - "alias_ucsc": [], - "alias_ensembl": ["Spleen"] - }, - { - "type": "Heart", - "alias_ucsc": ["heart"], - "alias_ensembl": ["Right_Atrium", "Left_Ventricle"] - }, - { - "type": "Osteoblast", - "alias_ucsc": [], - "alias_ensembl": ["Osteobl"] - }, - { - "type": "Fibroblast", - "alias_ucsc": [], - "alias_ensembl": ["NHDF_AD"] - }, - { - "type": "NK-Cells", - "alias_ucsc": [], - "alias_ensembl": ["Natural_Killer_cells_PB"] - }, - { - "type": "Cancers", - "alias_ucsc": [], - "alias_ensembl": ["HeLa_S3", "HepG2", "DND_41", "K562"] - }, - { - "type": "Brain", - "alias_ucsc": ["midbrain (mesencephalon)", "trigeminal V (ganglion, cranial)", "forebrain", "neural tube", "hindbrain (rhombencephalon)", "dorsal root ganglion", "cranial nerve"], - "alias_ensembl": ["NH_A"] - }, - { - "type": "Mesenchym", - "alias_ucsc": ["mesenchyme derived from neural crest", "facial mesenchyme"], - "alias_ensembl": [] - }, - { - "type": "Embryonal", - "alias_ucsc": ["somite", "genital tubercle"], - "alias_ensembl": [] - }, - { - "type": "Eye", - "alias_ucsc": ["eye"], - "alias_ensembl": [] - }, - { - "type": "Nose", - "alias_ucsc": ["nose"], - "alias_ensembl": [] - }, - { - "type": "Tail", - "alias_ucsc": ["tail"], - "alias_ensembl": [] - }, - { - "type": "Melanocytes", - "alias_ucsc": ["melanocytes"], - "alias_ensembl": [] - } -] diff --git a/config/cluster.config b/config/cluster.config index e69de29..1ba0b33 100644 --- a/config/cluster.config +++ b/config/cluster.config @@ -0,0 +1,15 @@ +params{ + //reduce_bed + kmer=10 + aprox_motif_len=10 + motif_occurence=1 + min_seq_length=10 + + //cdhit_wrapper + global=0 + identity=0.8 + sequence_coverage=8 + memory=2000 + throw_away_seq=9 + strand=0 +} diff --git a/config/create_gtf.config b/config/create_gtf.config index d1bbb12..c0718a6 100644 --- a/config/create_gtf.config +++ b/config/create_gtf.config @@ -1,4 +1,4 @@ params{ - organism="homo_sapiens" + organism="hg38" tissue="" } diff --git a/config/filter_unknown_motifs.config b/config/filter_unknown_motifs.config new file mode 100644 
index 0000000..79033c4 --- /dev/null +++ b/config/filter_unknown_motifs.config @@ -0,0 +1,4 @@ +params{ + min_size_fp=10 + max_size_fp=100 +} diff --git a/config/motif_estimation.config b/config/motif_estimation.config index a00fc86..2445d12 100644 --- a/config/motif_estimation.config +++ b/config/motif_estimation.config @@ -1,11 +1,18 @@ params { //bed_to_clustered_fasta min_seq = 10 - + //glam2 - motif_min_len = 10 + motif_min_len = 8 motif_max_len = 20 + interation = 10000 //tomtom tomtom_treshold = 0.01 -} \ No newline at end of file + + //motif Clustering + cluster_motif = 0 + edge_weight = 50 + motif_similarity_thresh = 0.00001 + best_motif = 3 +} diff --git a/uropa.config b/config/uropa.config similarity index 100% rename from uropa.config rename to config/uropa.config diff --git a/masterenv.yml b/masterenv.yml index 175f53a..211f4e1 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -1,14 +1,24 @@ name: masterenv -channels: - - bioconda - - conda-forge - - defaults - - r dependencies: - - bedtools - - python - - r-essentials - r-seqinr - numpy - pybigWig + - cd-hit + - jellyfish + - r-base>=3.5.1 + - r-data.table + - r-pbapply + - r-igraph + - r-stringi + - r-stringr + - r-optparse + - bioconductor-iranges + - snakemake + - meme + - moods + - biopython + - pybedtools + - matplotlib + - seaborn + - crossmap diff --git a/nextflow.config b/nextflow.config index 6f44ba3..969a9be 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,16 +1,15 @@ -createTimeout = 60 - -params { - -} +wd = "/mnt/agnerds/Rene.Wiegandt/10_Master/masterJLU2018" +createTimeout = 40 +params.threads=60 //Parameter for for scripts! Not for nextflow processes. +params.config="${wd}/config/uropa.config" env { - path_bin = $workflow.projectDir/bin - path_env = $workflow.projectDir/masterenv.yml + path_bin = "${wd}/bin" + path_env = "${wd}/masterenv.yml" } -includeConfig '$workflow.projectDir/config/peak_calling.config' -includeConfig '$workflow.projectDir/config/1.2.config' -includeConfig '$workflow.projectDir/config/cluster.config' -includeConfig '$workflow.projectDir/config/motif_estimation.config' -includeConfig '$workflow.projectDir/config/create_gtf.config' +includeConfig "${wd}/config/peak_calling.config" +includeConfig "${wd}/config/filter_unknown_motifs.config" +includeConfig "${wd}/config/cluster.config" +includeConfig "${wd}/config/motif_estimation.config" +includeConfig "${wd}/config/create_gtf.config" diff --git a/pipeline.nf b/pipeline.nf index a8e15dc..a39616a 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -1,284 +1,649 @@ -//!/usr/bin/env nextflow - -Channel.fromPath(params.input).map {it -> [it.simpleName, it]}.set {bigwig_input} -Channel.fromPath(params.bed).set {bed_input} -Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2} -Channel.fromPath(params.jaspar_db).into {db_for_motivscan; db_for_tomtom} -Channel.fromPath(params.config).set {config} - -bigwig_input.combine(bed_input).set{footprint_in} - -process footprint_extraction { - conda "${path_env}" - - tag{name} - - input: - set name, file (bigWig), file (bed) from footprint_in - - output: - set name, file ('*.bed') into bed_for_overlap_with_TFBS - - script: - """ - python ${path_bin}/call_peaks.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} - """ -} - -//Abfrage ob ausgeführt werden muss. 
-process extract_known_TFBS { - conda "${path_env}" - - input: - file (fasta) from fa_overlap - file (db) from db_for_motivscan - - output: - file ('*.bed') into known_TFBS_for_overlap - - script: - """ - """ -} - - -bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap).combine(fa_overlap_2).set {for_overlap} - - -process overlap_with_known_TFBS { - conda "${path_env}" - - input: - set file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap - - output: - file ('*.bed') into bed_for_clustering - - script: - motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") - """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_list} --fasta ${fasta} -o ${name_placeholder} -min ${params.min_size_fp} -max ${params.max_size_fp} - """ -} - - -process clustering { - conda "${path_env}" - - input: - file (bed) from bed_for_clustering - - output: - set name, file ('*.bed') into bed_for_motif_esitmation - - script: - """ - """ -} - - -// Converting BED-File to one FASTA-File per cluster -process bed_to_clustered_fasta { - conda "${path_env}" - - tag{name} - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy' - - input: - set name, file (bed) from bed_for_motif_esitmation - - output: - file ('*.FASTA') into fasta_for_glam2 - - script: - """ - Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} - """ -} - - -//flatten list and adding name of file to channel value -fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} - - -//Running GLAM2 on FASTA-files. -//Generating Motifs through alignment and scoring best local matches. -process glam2 { - conda "${path_env}" - - tag{name} - - input: - set name, file (fasta) from fasta_for_glam2 - - output: - set name, file('*.meme') into meme_for_tomtom, meme_for_filter - - script: - """ - glam2 n ${fasta} -O . -a ${params.motif_min_len} -b ${params.motif_max_len} -z 5 - """ -} - - -//Running Tomtom on meme-files generated by GLAM2. -//Tomtom searches motifs in databases. -process tomtom { - conda "${path_env}" - - tag{name} - - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/tmp/', mode: 'copy' - - input: - set name, file (meme), file (jaspar_db) from meme_for_tomtom.combine(db_for_tomtom) - - output: - set name, file ('*.tsv') into tsv_for_filter - - script: - """ - tomtom ${meme} ${jaspar_db} -thresh ${params.tomtom_treshold} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv - """ -} - - -//Joining channels with meme and tsv files. Filter joined channel on line count. -//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. -for_filter = meme_for_filter.join( tsv_for_filter ) -for_filter - .filter { name, meme, tsv -> - long count = tsv.readLines().size() - count <= 1 - } - .into { meme_for_scan; check } - - -//If channel 'check' is empty print errormessage -process check_for_unknown_motifs { - echo true - - input: - val x from check.ifEmpty('EMPTY') - - when: - x == 'EMPTY' - - """ - echo '>>> STOPPED: No unknown Motifs were found.' 
- """ - -} - - -//Get the best(first) Motif from each MEME-file -process get_best_motif { - conda "${path_env}" - - input: - set name, file(meme), file(tsv) from meme_for_scan - - output: - set name, file('*_best.meme') into best_motif - - script: - """ - python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme - """ -} - - -best_motif.combine(fa_scan).set {files_for_genome_scan} - - -process genome_scan { - conda "${path_env}" - - input: - set name, file(meme), file(fasta) from files_for_genome_scan - - output: - file ('.bed') into bed_for_uropa, bed_for_cluster_quality - - script: - """ - """ -} - - -process cluster_quality { - - input: - file (bed) from bed_for_cluster_quality - - output: - file ('*.bed') into bed_for_final_filter - - script: - """ - """ -} - - -process create_GTF { - conda "${path_env}" - - publishDir 'Path', mode:'copy' - - output: - file ('*.gtf') into gtf_for_uropa - - script: - """ - python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} - """ -} - - -bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} - - -// Create configuration file for UROPA. -// Takes template and replaces bed- and gtf-placeholders with actual paths. -process create_uropa_config { - - publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' - - input: - set val(bed), val(gtf) from uropa_in.toList() - file (conf) from config - - output: - file ('uropa.config') into uropa_config - - script: - """ - sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final - """ -} - - -process UROPA { - - input: - file (config) from uropa_config - - output: - set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter - - script: - """ - """ -} - - -process filter { - - input: - - output: - - script: - """ - """ -} +#!/usr/bin/env nextflow + +//setting default values + params.bigwig="" + params.bed="" + params.genome_fasta="" + params.motif_db="" + params.config="" + params.tfbs_path="" + params.create_known_tfbs_path = "./" + params.help = 0 + params.out = "./out/" + +//peak_calling + params.window_length = 200 + params.step = 100 + params.percentage = 0 + +//filter_unknown_motifs + params.min_size_fp=10 + params.max_size_fp=100 + +//clustering + //reduce_bed + params.kmer=10 + params.aprox_motif_len=10 + params.motif_occurence=1 + params.min_seq_length=10 + + //cdhit_wrapper + params.global=0 + params.identity=0.8 + params.sequence_coverage=8 + params.memory=800 + params.throw_away_seq=9 + params.strand=0 + +//motif_estimation + //bed_to_clustered_fasta + params.min_seq = 100 // Minimum number of sequences in the fasta-files for glam2 + + //glam2 + params.motif_min_key = 8 // Minimum number of key positions (aligned columns) + params.motif_max_key = 20 // Maximum number of key positions (aligned columns) + params.iteration = 10000 // Number of Iterations done by glam2. A high iteration number equals a more accurate result but with an higher runtime. + + //tomtom + params.tomtom_treshold = 0.01 // threshold for similarity score. 
+
+    //cluster motifs
+    params.cluster_motif = 0 // Boolean: if 1, motifs are clustered, otherwise they are not
+    params.edge_weight = 50 // Minimum weight of edges in the motif-cluster-graph
+    params.motif_similarity_thresh = 0.00001 // threshold for the motif similarity score
+
+    params.best_motif = 3 // Top n motifs per cluster
+
+//creating_gtf
+    params.organism="hg38"
+    params.tissue=""
+
+if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){
+log.info """
+Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file]
+
+Required arguments:
+    --bigwig                  Path to BigWig-file
+    --bed                     Path to BED-file
+    --genome_fasta            Path to genome in FASTA-format
+    --motif_db                Path to motif-database in MEME-format
+    --config                  Path to UROPA configuration file
+    --create_known_tfbs_path  Path to directory where the output from tfbsscan (known motifs) is stored.
+                              Path can be set as tfbs_path in the next run. (Default: './')
+    --out                     Output directory (Default: './out/')
+
+Optional arguments:
+
+    --help [0|1]              1 to show this help message. (Default: 0)
+    --tfbs_path               Path to directory with output from tfbsscan. If given, tfbsscan will not be run.
+
+    Footprint extraction:
+    --window_length INT       This parameter sets the length of a sliding window. (Default: 200)
+    --step INT                This parameter sets the number of positions to slide the window forward. (Default: 100)
+    --percentage INT          Threshold in percent (Default: 0)
+
+    Filter unknown motifs:
+    --min_size_fp INT         Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
+    --max_size_fp INT         Maximum sequence length threshold. Discards all sequences longer than this value. (Default: 100)
+
+    Clustering:
+      Sequence preparation/reduction:
+    --kmer INT                Kmer length (Default: 10)
+    --aprox_motif_len INT     Motif length (Default: 10)
+    --motif_occurence FLOAT   Percentage of motifs over all sequences. Use 1 (Default) to assume every sequence contains a motif.
+    --min_seq_length INT      Remove all sequences below this value. (Default: 10)
+
+      Clustering:
+    --global INT              Global (=1) or local (=0) alignment. (Default: 0)
+    --identity FLOAT          Identity threshold. (Default: 0.8)
+    --sequence_coverage INT   Minimum aligned nucleotides on both sequences. (Default: 8)
+    --memory INT              Memory limit in MB. 0 for unlimited. (Default: 800)
+    --throw_away_seq INT      Remove all sequences equal to or shorter than this length before clustering. (Default: 9)
+    --strand INT              Align +/+ and +/- (=1), or only +/+ (=0). (Default: 0)
+
+    Motif estimation:
+    --min_seq INT             Sets the minimum number of sequences required in the FASTA-files given to GLAM2. (Default: 100)
+    --motif_min_key INT       Minimum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 8)
+    --motif_max_key INT       Maximum number of key positions (aligned columns) in the alignment done by GLAM2. (Default: 20)
+    --iteration INT           Number of iterations done by GLAM2. More iterations: better results, higher runtime. (Default: 10000)
+    --tomtom_treshold FLOAT   Threshold for the similarity score. (Default: 0.01)
+    --best_motif INT          Get the best X motifs per cluster. (Default: 3)
+
+    Motif clustering:
+    --cluster_motif Boolean   If 1, the pipeline clusters motifs; if 0, it does not. (Default: 0)
+    --edge_weight INT         Minimum weight of edges in the motif-cluster-graph (Default: 50)
+    --motif_similarity_thresh FLOAT Threshold for the motif similarity score (Default: 0.00001)
+
+    Creating GTF:
+    --organism [hg38 | hg19 | mm9 | mm10] Input organism
+    --tissues List/String     List of one or more keywords for tissue-/category-activity, categories must be specified as in the JSON config
+
+All arguments can also be set in the configuration files.
+"""
+System.exit(2)
+} else {
+    Channel.fromPath(params.bigwig).map {it -> [it.simpleName, it]}.set {bigwig_input}
+    Channel.fromPath(params.bed).into {bed_input; bed_for_tfbsscan}
+    Channel.fromPath(params.genome_fasta).into {fa_overlap; fa_scan; fa_overlap_2}
+    Channel.fromPath(params.motif_db).into {db_for_motivscan; db_for_tomtom}
+    Channel.fromPath(params.config).set {config}
+}
+
+/*
+Checking the parameter input!
+*/
+int_params = ["window_length", "step", "min_size_fp", "max_size_fp", "kmer",
+              "aprox_motif_len", "motif_occurence", "min_seq_length", "global",
+              "sequence_coverage", "memory", "throw_away_seq", "strand",
+              "min_seq", "motif_min_key", "motif_max_key", "iteration",
+              "edge_weight", "best_motif"]
+req_params = ["bigwig", "bed", "genome_fasta", "motif_db", "config"]
+
+valid_organism = ["hg38", "hg19", "mm9", "mm10"]
+
+params.each { key, value ->
+    if(int_params.contains(key)) {
+        if (!("${value}" ==~ /\d+/ )){
+            println("ERROR: $key needs to be an Integer")
+            System.exit(2)
+        }
+    }
+    if(req_params.contains(key)) {
+        if(!file(value).exists()) {
+            println("ERROR: $key not found. Please check the given path.")
+            System.exit(2)
+        }
+    }
+}
+if (!("${params.identity}" ==~ /^0\.[8-9][[0-9]*]?|^1(\.0)?/ )){
+    println("ERROR: --identity needs to be a float in the range 0.8 to 1.0")
+    System.exit(2)
+}
+
+if (!valid_organism.contains(params.organism)) {
+    println("ERROR: Invalid organism! Valid organisms: " + valid_organism)
+    System.exit(2)
+}
+if (!("${params.percentage}" ==~ /\d+/ ) || params.percentage < 0 || params.percentage > 100 ){
+    println("ERROR: --percentage needs to be an Integer between 0 and 100")
+    System.exit(2)
+}
+
+if (!("${params.tomtom_treshold}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
+    println("ERROR: --tomtom_treshold needs to be an Integer or a float, e.g. 0.01")
+    System.exit(2)
+}
+if (!("${params.motif_similarity_thresh}" ==~ /([0-9]*\.[0-9]+|[0-9]+)/)) {
+    println("ERROR: --motif_similarity_thresh needs to be an Integer or a float, e.g. 0.01")
+    System.exit(2)
+}
+
+
+path_bin = path_bin?.endsWith('/') ? path_bin.substring( 0, path_bin.length() -1 ) : path_bin
+
+bigwig_input.combine(bed_input).set{footprint_in}
+/*
+This process uses the discontinuous score from a bigWig file to estimate footprints within the peaks of interest.
+*/
+process footprint_extraction {
+    conda "${path_env}"
+
+    tag{name}
+    publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed'
+    publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log'
+
+    input:
+    set name, file (bigWig), file (bed) from footprint_in
+
+    output:
+    set name, file ('*.bed') into bed_for_overlap_with_TFBS
+
+    script:
+    """
+    python ${path_bin}/footprints_extraction.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage}
+    """
+}
+
+for_tfbs = fa_overlap.combine(db_for_motivscan).combine(bed_for_tfbsscan)
+
+/*
+Scans the genome for known TFBS from the motif database with tfbsscan (using MOODS) and stores them as BED-files.
+*/
+process extract_known_TFBS {
+
+    conda "${path_env}"
+
+    publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed'
+
+    input:
+    set file (fasta), file (db), file (bed) from for_tfbs
+
+    output:
+    val ('done') into known_TFBS_for_overlap
+
+    when:
+    params.tfbs_path == ""
+
+    script:
+    """
+    python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed}
+    """
+}
+
+/*
+Sets the path to the known TFBS BED-files. If tfbs_path is set, the process extract_known_TFBS is skipped, so the paths differ.
+*/
+if(params.tfbs_path == "") {
+    bed_for_overlap_with_TFBS.combine(known_TFBS_for_overlap.toList()).combine(fa_overlap_2).set {for_overlap}
+    motif_path = params.create_known_tfbs_path
+} else {
+    Channel.from('skipped').set {workaround}
+    bed_for_overlap_with_TFBS.combine(workaround).combine(fa_overlap_2).set {for_overlap}
+    //keep tfbs_path as-is unless it ends with '/', in which case the trailing slash is stripped
+    tfbs = !params.tfbs_path?.endsWith('/') ? 
params.tfbs_path: params.tfbs_path.substring( 0, params.tfbs_path.length() -1 ) + motif_path = tfbs +} + + +/* +*/ +process overlap_with_known_TFBS { + conda "${path_env}" + + publishDir "${params.out}/unknown_overlap/", mode :'copy' + + input: + set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap + + output: + set name, file ("${name}_unknown.bed") into bed_for_reducing + + script: + motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") + """ + ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} + """ +} + + +/* +*/ +process reduce_bed { + conda "${path_env}" + echo true + publishDir "${params.out}/cluster/reduced_bed/", mode: 'copy' + + input: + set name, file (bed) from bed_for_reducing + + output: + set name, file ('*.bed') into bed_for_clustering + + script: + """ + Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} + """ +} + + +/* +*/ +process clustering { + conda "${path_env}" + echo true + + publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed' + + input: + set name, file (bed) from bed_for_clustering + + output: + set name, file ('*.bed') into bed_for_motif_esitmation + + script: + """ + Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} + """ +} + + +/* +Converting BED-File to one FASTA-File per cluster +*/ +process bed_to_clustered_fasta { + conda "${path_env}" + publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy' + tag{name} + + input: + set name, file (bed) from bed_for_motif_esitmation + + output: + file ('*.FASTA') into fasta_for_glam2 + file ('*.FASTA') into fasta_for_motif_cluster + + script: + """ + Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} + """ +} + + +//flatten list and adding name of file to channel value +fasta_for_glam2 = fasta_for_glam2.flatten().map {it -> [it.simpleName, it]} +//combine fasta files in one list +fasta_for_motif_cluster = fasta_for_motif_cluster.toList() + +/* +Running GLAM2 on FASTA-files. +Generating Motifs through alignment and scoring best local matches. +*/ +process glam2 { + + tag{name} + publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' + + input: + set name, file (fasta) from fasta_for_glam2 + + output: + file("${name}/*.meme") into meme_to_merge + set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_for_filter + file ('*') + + script: + """ + glam2 n ${fasta} -O ./${name}/ -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +/* +Combining all MEME-files to one big MEME-file. +The paths are sorted numerically depending on the cluster number. 
+*/ +process merge_meme { + + publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' + + input: + val (memelist) from meme_to_merge.toList() + + output: + file ('merged_meme.meme') into merged_meme + + when: + params.cluster_motif == 1 + + script: + memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } + meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } + meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} + meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") + """ + meme2meme ${meme_list} > merged_meme.meme + """ +} + +/* +Running Tomtom on merged meme-files. +Output table has the information which clusters are similar to each other. +*/ +process find_similar_motifs { + + publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' + input: + file (merged_meme) from merged_meme + + output: + file ('all_to_all.tsv') into motif_similarity + + when: + params.cluster_motif == 1 + + script: + """ + tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv + """ +} + + +files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) + + +/* +Merging FASTA-files of similar clusters +*/ +process merge_fasta { + conda "${path_env}" + publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' + echo true + input: + set file (motiv_sim), val (fasta_list) from files_for_merge_fasta + + output: + file ('*.fasta') into motif_clustered_fasta_list + file('*.png') + + when: + params.cluster_motif == 1 + + script: + fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } + fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") + """ + Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + """ +} + + +motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() + + +process clustered_glam2 { + + publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' + + input: + file (fasta) from motif_clustered_fasta_flat + + output: + set name, file ('*.meme') into clustered_meme_for_tomtom + set name, file ('*.meme') into clustered_meme_for_filter + file('*') + + when: + params.cluster_motif == 1 + + script: + name = fasta.getBaseName() + """ + glam2 n ${fasta} -O . -a ${params.motif_min_key} -b ${params.motif_max_key} -z 5 -n ${params.iteration} + """ +} + +if(params.cluster_motif == 1){ + for_tomtom = clustered_meme_for_tomtom + for_filter = clustered_meme_for_filter +} else { + for_tomtom = meme_for_tomtom + for_filter = meme_for_filter +} + + +/* +Running Tomtom on meme-files generated by GLAM2. +Tomtom searches motifs in databases. +*/ +process tomtom { + + tag{name} + publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy' + + input: + set name, file (meme) from for_tomtom + + output: + set name, file ('*.tsv') into tsv_for_filter + + script: + """ + tomtom ${meme} ${params.motif_db} -thresh ${params.tomtom_treshold} -mi 1 -text | sed '/^#/ d' | sed '/^\$/d' > ${name}_known_motif.tsv + """ +} + + +//Joining channels with meme and tsv files. Filter joined channel on line count. +//Only meme-files which corresponding tsv files have linecount <= 1 are writen to next channel. 
+for_filter2 = for_filter.join( tsv_for_filter ) +for_filter2 + .filter { name, meme, tsv -> + long count = tsv.readLines().size() + count <= 1 + } + .into { meme_for_scan; check } + + +//If channel 'check' is empty print errormessage +process check_for_unknown_motifs { + echo true + + input: + val x from check.ifEmpty('EMPTY') + + when: + x == 'EMPTY' + + """ + echo '>>> STOPPED: No unknown Motifs were found.' + """ + +} + + +//Get the best(first) Motif from each MEME-file +process get_best_motif { + conda "${path_env}" + + publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' + + input: + set name, file(meme), file(tsv) from meme_for_scan + + output: + set name, file('*_best.meme') into best_motif + + script: + """ + python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + """ +} + + +best_motif.combine(fa_scan).set {files_for_genome_scan} + +/* +process genome_scan { + conda "${path_env}" + + input: + set name, file(meme), file(fasta) from files_for_genome_scan + + output: + file ('.bed') into bed_for_uropa, bed_for_cluster_quality + + script: + """ + """ +} + + +process cluster_quality { + + input: + file (bed) from bed_for_cluster_quality + + output: + file ('*.bed') into bed_for_final_filter + + script: + """ + """ +} */ + + +process create_GTF { + conda "${path_env}" + + publishDir "${params.out}/gtf/", mode: 'copy' + + output: + file ('*.gtf') into gtf_for_uropa + + script: + """ + python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin} + """ +} + +/* +bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in} + + +// Create configuration file for UROPA. +// Takes template and replaces bed- and gtf-placeholders with actual paths. +process create_uropa_config { + + publishDir '/mnt/agnerds/Rene.Wiegandt/10_Master/', mode: 'copy' + + input: + set val(bed), val(gtf) from uropa_in.toList() + file (conf) from config + + output: + file ('uropa.config') into uropa_config + + script: + """ + sed -- 's/placeholder_gtf/${gtf}/g; s/placeholder_bed/${bed}/g' ${conf} > uropa.config.final + """ +} + + +process UROPA { + + input: + file (config) from uropa_config + + output: + set file ("*_allhits.txt"), file ("*_finalhits.txt") into uropa_for_filter + + script: + """ + """ +} + + +process filter { + + input: + + output: + + script: + """ + """ +} */ + +workflow.onComplete { +log.info""" +Pipeline execution summary +--------------------------- +Completed at: ${workflow.complete} +Duration : ${workflow.duration} +Success : ${workflow.success} +workDir : ${workflow.workDir} +exit status : ${workflow.exitStatus} +Error report: ${workflow.errorReport ?: '-'} +""" +}
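For reference, here is a minimal sketch of how the reworked pipeline can be invoked so that the known-TFBS scan is computed only once and reused afterwards. The bracketed files are placeholders, and `./known_tfbs/` is an assumed output directory, not something shipped with the repository:

```console
# First run: tfbsscan stores the known TFBS as BED-files in ./known_tfbs/
nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] \
    --motif_db [MEME-file] --config [UROPA-config-file] --create_known_tfbs_path ./known_tfbs/

# Later runs: pass the stored BED-files via --tfbs_path, so the extract_known_TFBS process is skipped
nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] \
    --motif_db [MEME-file] --config [UROPA-config-file] --tfbs_path ./known_tfbs/
```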