diff --git a/CHANGES b/CHANGES index 160d151..55cf490 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,7 @@ ## 1.0.1 (2017-01-27) +- Edit pysam.TabixFile (Error :attribute doesn't exist) -> pysam.Tabixfile +- add flag "-p gff" in run_tabix call (overlaps.py) - updated summary script for new gridExtra requirements (rows=NULL instead of show.rownames=FALSE) - fixed log file bug - fixed internals feature to work for distances larger than default distance diff --git a/README.md b/README.md index a2951c0..805d692 100644 --- a/README.md +++ b/README.md @@ -35,51 +35,64 @@ Installation and Command-line usage Make sure all prerequisites are met: - [R/Rscript](http://www.r-project.org/) (v3.3.0 or higher, follow instructions on url) -- [Python](http://continuum.io/downloads) (v2.7.8-anaconda-2.1.0, install with ```bash Anaconda2-4.2.0-Linux-x86_64.sh``` and ```PATH=dir/to/python_anaconda:$PATH```) -- [htslib](http://www.htslib.org/download/) (1.3.2 or higher, follow instructions on url) +- [Python](https://repo.continuum.io/archive/) (v2.7.8-anaconda-2.1.0) + * Choose Anaconda2-4.2.0-Linux-x86 (for 32bit) or -x86_64 (for 64bit), + * Install with ```bash Anaconda2-4.2.0-Linux-x86_64.sh```, + * Follow instructions and type 'yes' to prepend Anaconda to PATH in your .bashrc file, + * or type after installation: ``` $export PATH=/home//anaconda2/bin:$PATH ``` + +- [htslib](http://www.htslib.org/download/) (1.3.2 or higher) + Follow instructions : + * ```cd htslib-1.3.2/``` + * ```./configure``` + * ```make``` + * ```sudo make install ``` (will be installed to /usr/local/bin/) + * or ```sudo make prefix=/home//htslib install``` + Further information given in [url](https://github.com/samtools/htslib/blob/develop/INSTALL) Install required packages for R: ```bash install.packages(c("ggplot2", "devtools", "gplots", "gridExtra", "jsonlite", "VennDiagram")) source("https://bioconductor.org/biocLite.R") biocLite(c("RBGL","graph")) -# to install the last required package, devtools has to be loaded to use the install from github function + +# To install the last required package-Vennerable, devtools has to be loaded to use the install from github function # if you copy this and want to use it in once, # make sure the library loading is not interrupted by the question if present packages should be updated + library(devtools) install_github("jenzopr/Vennerable") ``` - -### Install UROPA locally +Install UROPA locally by running: ```bash git clone https://github.molgen.mpg.de/loosolab/UROPA.git -export PATH=$PATH:dir/to/uropa +export PATH=$PATH:dir/to/uropa/uropa.py ``` -```bash -Usage: uropa [options] +```python +Usage: uropa.py [options] Available options: - -h, --help print this help message and further details on the configuration file - -i, --input filename of configuration file - -o, --output directory for results and prefix of the output file name - -r, --reformat create an additional compact and line-reduced table as result file - -s, --summary filename of additional visualisation of results in graphical format - -t n, --threads n multiprocessed run: n = number of threads to run annotation process - -no-comment do not show comment lines in output files - -l, --log log file name for messages and warnings - -d, --debug print verbose messages (for debugging purposes) - -v, --version print the version and exit + - h, --help print this help message and further details on the configuration file + - i, --input filename of configuration file + - o, --output directory for results and prefix of the output file name + - r, --reformat create an additional compact and line-reduced table as result file + - s, --summary filename of additional visualisation of results in graphical format + - t n, --threads n multiprocessed run: n = number of threads to run annotation process + --no-comment do not show comment lines in output files + - l, --log log file name for messages and warnings + - d, --debug print verbose messages (for debugging purposes) + - v, --version print the version and exit ``` How to cite ----------- -Kondili M, Fust A, Preussner J, Kuenne C, Braun T, and Looso M, UROPA: a tool for Universal RObust Peak Annotation. *(in preparation)* +Kondili M, Fust A, Preussner J, Kuenne C and Looso M. UROPA: a tool for Universal RObust Peak Annotation. *(in preparation)* Contribute ---------- diff --git a/docs_rst/help.rst b/docs_rst/help.rst index 5f7deaf..ee5c91d 100644 --- a/docs_rst/help.rst +++ b/docs_rst/help.rst @@ -16,5 +16,6 @@ They will be addressed as soon as possible. * **How to annotate with default values?** In order to activate default values, the key itself shouldn't be present in the config file. If you want to annotate with default values only, do not remove the queries key, but leave it empty like "queries":[] - +* **Why is the summary empty?** + It is a problem about gridExtra and ggplot2, the summary starts from page 2 diff --git a/docs_rst/index.rst b/docs_rst/index.rst index 945898f..564a7d6 100644 --- a/docs_rst/index.rst +++ b/docs_rst/index.rst @@ -17,7 +17,7 @@ annotation. - Utilization of any available GTF files as annotation reference - Multiple queries can be processed in a single run - Graduated annotation due to prioritization -- Multiple output tables (AllHits, FinalHits, BestperQuery\_Hits) +- Multiple output tables (allhits, finalhits, besthits) - Visual summary for annotation evaluation - Preparation of custom annotation files with the UROPA to GTF utility @@ -55,4 +55,4 @@ The project is licensed under the MIT license (see :doc:`/license`) custom license help - \ No newline at end of file + diff --git a/docs_rst/install.rst b/docs_rst/install.rst index 525b218..14a3449 100644 --- a/docs_rst/install.rst +++ b/docs_rst/install.rst @@ -5,61 +5,55 @@ Prerequisites ----------------- For running UROPA locally, the following prerequisites have to be met: -- `Python`_, v2.7 (packages: numpy, sampy) -- `R/Rscript`_, v3.3.0 or higher (packages: devtools, ggplot2, gplots, gridExtra, jsonlite, RBGL, graph, VennDiagram, Vennerable, snow) -- `htslib`_ 1.3.2 or higher - -Instructions ------------------ - -Python -~~~~~~~~~~ -As numpy and sampy packages can be taxing to install in Python, we recommend the usage of the `Anaconda`_ distribution instead (v2.7.8-anaconda-2.1.0 or higher). Install with eg. ``bash Anaconda2-4.2.0-Linux-x86_64.sh`` and include the path in the environment variable ``export PATH=dir/to/python_anaconda:$PATH``. - -Using standard Python 2.7, the packages can be installed with ``pip install pysam numpy``. - -R -~~~~~ -The following packages are hosted by CRAN and can be installed from the R console with the syntax ``install.packages("packagename")``. - -- `devtools`_ -- `ggplot2`_ -- `gplots`_ -- `gridExtra`_ -- `jsonlite`_ -- `VennDiagram`_ -- `snow`_ - -The RBGL and graph packages are hosted by BioConductor. To install those start R and type ``source("https://bioconductor.org/biocLite.R")`` and ``biocLite(c("RBGL","graph"))``. - -Vennerable has to be installed with ``library("devtools")`` followed by ``install_github("jenzopr/Vennerable")``. - -HTSlib -~~~~~ -The HTSlib library is necessary for the indexing of the reference features using Tabix. For installation instructions please refer to the following url: `htslib`_. - +- `Python`_, v2.7 + - download Anaconda for Linux version Python 2.7 to direction where python should be installed + + - run ``bash Anaconda2-4.3.0-Linux-x86_64.sh`` + + - Answer the question "Do you wish the installer to prepend the Anaconda2 install location to PATH in your /home/.../.bashrc ?" with yes OR do ``PATH=dir/to/python_anaconda:$PATH`` after the installation process has finished + + - run ``conda install -c bioconda pysam`` + + - if you are NOT using the anaconda version of python 2, the packages `pysam`_ and `numpy`_ can be installed with ``pip install pysam numpy`` + +- `R/Rscript`_, v3.3.0 or higher (follow the instructions on url) + +Load R and install packages: + + - ``install.packages(c("ggplot2", "devtools", "gplots", "gridExtra", "jsonlite", "VennDiagram"))`` + + Choose CRAN mirror. + - ``source("https://bioconductor.org/biocLite.R")`` + + - ``biocLite(c("RBGL","graph"))`` + + - ``library(devtools)`` + + - ``install_github("jenzopr/Vennerable")`` + +- `git`_ with ``bash sudo apt-get install git`` UROPA -~~~~~ - +----- UROPA itself can be installed by simply cloning the Github library and adding the target folder to the system environment variable. .. code:: bash git clone https://github.molgen.mpg.de/loosolab/UROPA - export PATH=$PATH:dir/to/uropa - + ## Open .bashrc file and create alias name to call uropa in each session easily : + alias uropa='/dir/to/UROPA/uropa.py' .. _R/Rscript: http://www.r-project.org/ .. _Python: http://continuum.io/downloads .. _Anaconda: http://continuum.io/downloads -.. _htslib: http://www.htslib.org/download/ +.. _git: https://git-scm.com/ .. _numpy: http://www.numpy.org .. _pysam: https://pysam.readthedocs.io/en/latest/index.html .. _ggplot2: https://cran.r-project.org/web/packages/ggplot2/index.html .. _gplots: https://cran.r-project.org/web/packages/gplots/index.html .. _gridExtra: https://cran.r-project.org/web/packages/gridExtra/index.html +.. _gridExtra: https://cran.r-project.org/web/packages/gridExtra/index.html .. _jsonlite: https://cran.r-project.org/web/packages/jsonlite/index.html .. _VennDiagram: https://cran.r-project.org/web/packages/VennDiagram/index.html .. _snow: https://cran.r-project.org/web/packages/snow/index.html diff --git a/docs_rst/output.rst b/docs_rst/output.rst index 5d78fef..1753e3b 100644 --- a/docs_rst/output.rst +++ b/docs_rst/output.rst @@ -1,27 +1,28 @@ -Output tables -============= +Output files +============ UROPA provides multiple output files, each providing valuable information in either a more extended or a more condense way in order to cover the needs of a user. Details are given below: File overview ------------- -- **Uropa_AllHits**: Basic output table, reports for each peak all valid annotations and additionally NA rows for peaks without valid annotation. +- **allhits.txt**: Basic output table, reports for each peak all valid annotations and additionally NA rows for peaks without valid annotation. -- **Uropa_FinalHits**: Filtered output table, it reports the best (closest) feature according to the config criteria for each peak. If multiple queries are given, it reports the best annotation taking multiple queries into account. +- **finalhits.txt**: Filtered output table, it reports the best (closest) feature according to the config criteria for each peak. If multiple queries are given, it reports the best annotation taking multiple queries into account. -- **Uropa_BestperQuery_Hits**: This table is only produced if more than one query is given. It reports the best annotation per query for each peak. +- **besthits.txt**: This table is only produced if more than one query is given. It reports the best annotation per query for each peak. -- **Uropa_Reformatted_HitsperPeak**: Another format of the BestperQuery Hits produced by using the ``-r`` parameter. A compact table with best per query annotation for each peak in one row is presented. +- **besthits_compact.txt**: Another format of the besthits produced by using the ``-r`` parameter. A compact table with best per query annotation for each peak in one row is presented. -- **Uropa_Summary**: Statistical summary of the UROPA annotation by using the ``-s`` parameter. +- **summary.pdf**: Statistical summary of the UROPA annotation generated by using the ``-s`` parameter. .. note:: - The output files will be named additionally by the output directory name. - For example if uropa is used like this: ``uropa.py -i config.json -o ChIPanno`` + If no prefix is specified, the result files will be stored in call dir with the basename of the config file. + E.g. ``uropa.py -i histonMarkPeaks.json`` The allhits file would be called histonMarkPeaks_allhits.txt + With specified prefix, the result files will be stored in the call dir with the prefix as part of the names. + E.g. ``uropa.py -i histonMarkPeaks.json -p abc`` the allhits file would be called abc_allhits.txt, + or with ``-p xy/abc`` all results will be stored in folder xy, which will be created if it does not exist already. - The result would contain: ChIPannot/Uropa_AllHits_ChIPannot.txt - Output columns explanation -------------------------- @@ -52,17 +53,17 @@ The four output tables mentioned above contain informative columns about the per Output files (one query) ------------------------ -UROPA annotation with one query results in two output tables. Those are the Uropa_AllHits and Uropa_FinalHits. -For a configuration as given below, the AllHits is given in Table 1, and the FinalHits is displayed in Table 2. +UROPA annotation with one query results in two output tables. Those are the allhits and finalhits. +For a configuration as given below, the allhits is given in Table 1, and the finalhits is displayed in Table 2. Peak and annotation files are further described in the :doc:`/uropa-example` section. The UROPA annotation process for one query can run into three cases for each peak: -- **Case 1**: No query gives any feature for annotating the peak, this leads to no valid annotation at all -> NA row in AllHits and FinalHits. +- **Case 1**: No query gives any feature for annotating the peak, this leads to no valid annotation at all -> NA row in allhits and finalhits. -- **Case 2**: There is one valid annotation for the specified query -> annotation will be given in AllHits and FinalHits. +- **Case 2**: There is one valid annotation for the specified query -> annotation will be given in allhits and finalhits. -- **Case 3**: There are multiple valid annotations for the specified query -> all valid annotations will be given in the AllHits, the best annotation (smallest distance) will be presented in the FinalHits. +- **Case 3**: There are multiple valid annotations for the specified query -> all valid annotations will be given in the allhits, the best annotation (smallest distance) will be presented in the finalhits. .. code:: json @@ -102,7 +103,7 @@ The UROPA annotation process for one query can run into three cases for each pea +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+-----------+----------------+-------+ -**Table 5.1:** Excerpt of table AllHits for one query as described in the configuration above. +**Table 5.1:** Excerpt of table allhits for one query as described in the configuration above. +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+-----------+----------------+-------+ | peak_id | peak_chr | peak_start | peak_center | peak_end | peak_strand | feature | feat_start | feature_end | feat_strand | feat_anchor | distance | genomic_location | feat_ovl_peak | peak_ovl_feat | gene_name | gene_type | query | @@ -125,22 +126,22 @@ The UROPA annotation process for one query can run into three cases for each pea +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+-----------+----------------+-------+ -**Table 5.2:** Excerpt of table FinalHits for one query as described in the configuration above. +**Table 5.2:** Excerpt of table finalhits for one query as described in the configuration above. As displayed in Table 1 and Table 2, peak 355 is a representative of Case 1. There is no valid annotation at all, thus there is an NA row in both output tables. -The peaks 356 and 765 belong to Case 2, there is one valid annotation for them, their annotation is displayed in the same way in AllHits and FinalHits. +The peaks 356 and 765 belong to Case 2, there is one valid annotation for them, their annotation is displayed in the same way in allhits and finalhits. -Peak 769 has three valid annotations for the specified query (Case 3). All of them are displayed in the AllHits output. -In the FinalHits only the best annotation, the one for gene NOP16 with the minimal distance of 937, is represented. +Peak 769 has three valid annotations for the specified query (Case 3). All of them are displayed in the allhits output. +In the finalhits only the best annotation, the one for gene NOP16 with the minimal distance of 937, is represented. Output files (multiple queries) -------------------------------- UROPA annotation with multiple queries and default priority results in at least three output tables. -Those are the Uropa_AllHits, Uropa_FinalHits, and Uropa_BestperQuery_Hits. -If the ``-r`` parameter is added in the command line call, there will the additional output Uropa_Reformatted_HitsperPeak file. -Furthermore, if the ``-s`` parameter is also added, the Uropa_Summary file is generated. +Those are the allhits, finalhits, and besthits. +If the ``-r`` parameter is added in the command line call, there will the additional output compact file. +Furthermore, if the ``-s`` parameter is also added, the summary file is generated. With a configuration as given below, the generated output files are generated as presented in Tables 3 to 6 and Figure 1. Peak and annotation files are further described in the :doc:`/uropa-example` section. @@ -149,8 +150,7 @@ The UROPA annotation process for multiple queries allows an additional case in r - **Case 1 to 3** as described for one query -- **Case 4**: There are valid annotations for one peak for multiple queries -> all valid annotations will be given in the AllHits, the best annotation (smallest distance across all queries) will be presented in the FinalHits. -Additionally, the best annotation per query will be displayed in the BestperQuery_Hits output. +- **Case 4**: There are valid annotations for one peak for multiple queries -> all valid annotations will be given in the allhits, the best annotation (smallest distance across all queries) will be presented in the finalhits. Additionally, the best annotation per query will be displayed in the besthits output. .. code:: json @@ -210,7 +210,7 @@ Additionally, the best annotation per query will be displayed in the BestperQuer | … | | | | | | | | | | | | | | | | | | +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+---------------+----------------+-------+ -**Table 5.3:** Excerpt of table AllHits for three queries as described in the configuration above. +**Table 5.3:** Excerpt of table allhits for three queries as described in the configuration above. +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+---------------+----------------+-------+ | peak_id | peak_chr | peak_start | peak_center | peak_end | peak_strand | feature | feat_start | feature_end | feat_strand | feat_anchor | distance | genomic_location | feat_ovl_peak | peak_ovl_feat | gene_name | gene_type | query | @@ -232,7 +232,7 @@ Additionally, the best annotation per query will be displayed in the BestperQuer | … | | | | | | | | | | | | | | | | | | +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+---------------+----------------+-------+ -**Table 5.4:** Excerpt of table FinalHits for three queries as described in the configuration above. +**Table 5.4:** Excerpt of table finalhits for three queries as described in the configuration above. +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+---------------+----------------+-------+ | peak_id | peak_chr | peak_start | peak_center | peak_end | peak_strand | feature | feat_start | feature_end | feat_strand | feat_anchor | distance | genomic_location | feat_ovl_peak | peak_ovl_feat | gene_name | gene_type | query | @@ -270,23 +270,23 @@ Additionally, the best annotation per query will be displayed in the BestperQuer | … | | | | | | | | | | | | | | | | | | +----------+----------+------------+-------------+------------+-------------+---------+------------+-------------+-------------+-------------+----------+-------------------+---------------+---------------+---------------+----------------+-------+ -**Table 5.5:** Excerpt of table BestperQuery_Hits for three queries as described in the configuration above. +**Table 5.5:** Excerpt of table besthits for three queries as described in the configuration above. .. note:: - The BestperQuery_Hits is only generated if multiple queries are specified and the priority flag is set to FALSE! If this flag is TRUE, there will be only one valid query. There can be multiple valid annotations for one peak, but all based on one query. In this case only the AllHits and FinalHits are produced. + The besthits is only generated if multiple queries are specified and the priority flag is set to FALSE! If this flag is TRUE, there will be only one valid query. There can be multiple valid annotations for one peak, but all based on one query. In this case only the allhits and finalhits are produced. Same as in the first example with one query, peak_355 has no valid annotation and is represented as NA row (correspond to Case 1). -In the AllHits (Table 3) and BestperQuery_Hits (Table 5) there will be one NA row for each query. In the FinalHits (Table 4) there will be only one NA row for all queries. +In the allhits (Table 3) and besthits (Table 5) there will be one NA row for each query. In the finalhits (Table 4) there will be only one NA row for all queries. -The peak_356 has only for one query a valid annotation, as given in AllHits, FinalHits, and BestperQuery_Hits (Case 2). In AllHits and BestperQuery_Hits there are additional NA rows for this peak for the other queries without a hit. +The peak_356 has only for one query a valid annotation, as given in allhits, finalhits, and besthits (Case 2). In allhits and besthits there are additional NA rows for this peak for the other queries without a hit. -For peak_765 there are valid annotations for all given queries as displayed in the AllHits, (Case 4). The best of these is the annotation for the lincRNA, therefore this annotation is displayed in the FinalHits. -Because there is only one valid annotation for each query, they will be displayed in the same way in the BestperQuery_Hits. +For peak_765 there are valid annotations for all given queries as displayed in the allhits, (Case 4). The best of these is the annotation for the lincRNA, therefore this annotation is displayed in the finalhits. +Because there is only one valid annotation for each query, they will be displayed in the same way in the besthits. -This is different for peak_769. As described above this peaks equates to Case 3. With multiple queries, there will be additional NA rows for the invalid queries in the AllHits and BestperQuery_Hits. +This is different for peak_769. As described above this peaks equates to Case 3. With multiple queries, there will be additional NA rows for the invalid queries in the allhits and besthits. -With multiple queries UROPA provides an option to reformat the BestperQuery_Hits table in order to condense the best per query annotations for each peak in one row. -A reformatted example for the BestperQuery_Hits of Table 5 is presented in Tables 6. +With multiple queries UROPA provides an option to reformat the besthits table in order to condense the best per query annotations for each peak in one row. +A reformatted example for the besthits of Table 5 is presented in Tables 6. The Reformatted_HitsperPeak represents all information for each peak in one row. Within this format the information for query 0 is always given at the first position, for query 1 at second positon etc. To receive this output format, the parameter ``-r`` has to be added to the command line call. @@ -328,7 +328,7 @@ Detailed information about the different plots and how they are dedicated to the - Specified queries, and value of the priority flag (Figure 1A) - Number of peaks per query -**Graphs based on the 'FinalHits' output:** +**Graphs based on the 'finalhits' output:** - Density plot displaying the distance per feature across all queries (Figure 1B) - Pie chart illustrating the genomic locations of the peaks per annotated feature (Figure 1C) @@ -336,7 +336,7 @@ Detailed information about the different plots and how they are dedicated to the *Figure 1A-C would be the summary for the first example UROPA run given above with only one query* -**Graphs based on the 'BestperQuery_Hits' output:** +**Graphs based on the 'besthits' output:** - Distribution of the distances per feature per query is displayed in a histogram (Figure 1D) - Pie chart illustrating the genomic locations of the peaks per annotated feature (not illustrated) @@ -346,5 +346,5 @@ Detailed information about the different plots and how they are dedicated to the .. figure:: img/output-formats-summary.png Figure 1: Summary file example for queries as described above: (A) Summary of specified queries, used annotation and peak files, and how many peaks were present and annotated, - (B) Distance density for all features based on FinalHits, (C) Pie Chart representing genomic location for each feature across FinalHits, - (D) Distance per query per feature across BestperQuery_Hits, (E) Pairwise comparison across all queries displayed in Venn diagrams, (F) Chow Ruskey plot to compare all queries. + (B) Distance density for all features based on finalhits, (C) Pie Chart representing genomic location for each feature across finalhits, + (D) Distance per query per feature across besthits, (E) Pairwise comparison across all queries displayed in Venn diagrams, (F) Chow Ruskey plot to compare all queries. diff --git a/uropa.py b/uropa.py index 9c82d06..df26e5a 100755 --- a/uropa.py +++ b/uropa.py @@ -29,7 +29,7 @@ import uropa.overlaps as ovls import uropa.annotation as ant -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser( prog="uropa.py", @@ -117,6 +117,8 @@ if args.log is not None: logpath = os.path.dirname(args.log) + # when no dir given, consider current dir to save it + logpath = os.getcwd() if logpath =="" or logpath=="." else logpath if not os.path.exists(logpath): try: os.makedirs(logpath) @@ -143,7 +145,7 @@ logger.error("File %s contains malformed JSON. %s", config, e) sys.exit() - parameters = cfg.parse_parameters(cfg_dict, log=None) # ,logger + parameters = cfg.parse_parameters(cfg_dict, logger) priority = parameters["priority"] annot_gtf = parameters["gtf"] peaks_bed = parameters["bed"] @@ -286,9 +288,9 @@ logger.warning( "Split command not available. Falling back to one thread.") -# -# Processing peaks -# + # + # Processing peaks + # input_args = [outdir, gtf_index, query_attributes, queries, max_distance, pr, gtf_has_chr] # Output FileNames common for any thread option @@ -456,3 +458,8 @@ os.remove(gtf_cut_file + ".sorted") logger.info("End time: %s", datetime.datetime.now().strftime("%d.%m.%Y %H:%M")) + + +#Call main to run uropa: +if __name__ == '__main__': + main() diff --git a/uropa/annotation.py b/uropa/annotation.py index 9469cac..8969bec 100644 --- a/uropa/annotation.py +++ b/uropa/annotation.py @@ -12,7 +12,7 @@ def annotation_process(input_args, peak_file, log=None): priority, gtf_has_chr] = input_args try: - anno = pysam.TabixFile(gtf_index) + anno = pysam.Tabixfile(gtf_index) except IOError: log.error("Unable to open tabix index {}.".format(gtf_index)) except ValueError: @@ -64,7 +64,7 @@ def annotation_process(input_args, peak_file, log=None): str(peak['estart']) + '-' + str(peak['eend']) try: - hits = anno.fetch(tabix_query, multiple_iterators=True) + hits = anno.fetch(tabix_query) except ValueError: hits = list() if log is not None: @@ -131,11 +131,10 @@ def annotation_process(input_args, peak_file, log=None): if not valid_queries: has_hits.append(False) - # ----- All hits parsed. Check only hits with valid-queries now ----- # - #log.debug("\n---All hits parsed for the peak.Totally, the Peak has {} hits from which {} are Valid ---".format( len(has_hits), has_hits.count(True) )) - # 9 cols= feat, f.start, f.end, f.strand, dist, min_pos, genom_loc, - # feat-to-peak-ovl , peak-to-feat-ovl (no Q) + # All hits parsed. Check only hits with valid-queries now.. nas_len = len(attrib_k) + 9 + # 9 cols= feat, f.start, f.end, f.strand, dist, min_pos, genom_loc, ++ # feat-to-peak-ovl , peak-to-feat-ovl (no Q) NAsList_q = list(np.repeat("NA", nas_len)) # Search hits with only Prior or Secondary Queries @@ -148,18 +147,18 @@ def annotation_process(input_args, peak_file, log=None): if queries[j]["direction"] != ['any_direction']: correct_dir = ovls.define_direction(queries[j]["direction"], int(hit[3]), int( hit[4]), strand, peak['length'], int(peak['start']), int(peak['end']), peak['center']) - #logger.debug("\nFor the hit {}-{}, direction is correct : {}".format(hit[3],hit[4] ,correct_dir)) + if correct_dir: # IF direction of hit is the desired, then filter - # for Distance + # for Distance (Dhit) min_pos, Dhit = ovls.distance_to_peak_center(peak['center'], int( hit[3]), int(hit[4]), strand, queries[j]["feature.anchor"]) - #logger.debug("\nClosest Distance from the peak center : {} ".format(Dhit)) + if not correct_dir and Best_hits_tab[j][peak['id']] == "" and All_hits_tab[j][peak['id']] == "": # hit with Incorrect Dir: move to new peak, fill in NAs if peak is empty - #log.debug("\nThe hit doesn't have the correct direction -> NAs in All & Best hits table -> Continue to New hit.") + #The hit doesn't have the correct direction,All & Best hits = NA All_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ 'start'], peak['center'], peak['end'], NAsList_q, str(j) + "\n"])) Best_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ @@ -173,7 +172,7 @@ def annotation_process(input_args, peak_file, log=None): elif queries[j]['direction'] == ['any_direction']: min_pos, Dhit = ovls.distance_to_peak_center(peak['center'], int( hit[3]), int(hit[4]), strand, queries[j]["feature.anchor"]) - #log.debug("\nClosest Distance(Dhit) when No Direction defined :{} ".format(Dhit)) + # > Check OVERLAPS of peak-hit, define genomic_location genomic_location = "not.specified" @@ -194,13 +193,13 @@ def annotation_process(input_args, peak_file, log=None): if len(queries[j]["distance"]) == 1: d_is_best = ovls.get_distance_by_dir([min_dist[j][peak['id']][0], min_dist[j][ peak['id']][0]], genomic_location, internals_location, Dhit) - - # print "Best distance found :{}".format(d_is_best) - if d_is_best and not same_gene: # Dhit < Dbest - #log.debug("\nDistance of Hit from Peak Center = {}, INFERIOR to current Min Distance = {} ".format(Dhit, min_dist[j][peak['id']][0])) + #Best distance found = d_is_best + + # Dhit < Dbest + if d_is_best and not same_gene: + #> Update the Min_distance min_dist[j][peak['id']] = ovls.get_besthit(j, len(queries[j]["distance"]), peak[ 'id'], hit, attrib_k, Dhit, min_dist) - #print("Minimum distance updated: {}".format(min_dist[j][peak['id']][0])) Best_res = ovls.create_table(peak['name'], peak['chr'], peak['start'], peak['end'], str( peak['center']), min_dist[j][peak['id']], attrib_k, min_pos, genomic_location, ovl_pk, ovl_feat, j) Best_hits_tab[j][peak['id']] = Best_res @@ -208,7 +207,7 @@ def annotation_process(input_args, peak_file, log=None): # Dhit > Dbest # only if empty fill in with NAs elif not d_is_best and not same_gene and Best_hits_tab[j][peak['id']] == "": - #log.debug("\nDistance of Hit from Peak Center = {}, LARGER THAN current Min Distance = {}\n".format(Dhit, min_dist[j][peak['id']][0])) + Best_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ 'start'], str(peak['center']), peak['end'], NAsList_q, str(j) + "\n"])) @@ -222,27 +221,25 @@ def annotation_process(input_args, peak_file, log=None): "distance"][0]], genomic_location, internals_location, Dhit) if Dhit_smaller and not same_gene: - #log.debug("Distance of hit SMALLER than Config Distance -> Hit written to All_Hits.") + All_hits_tab[j][peak['id']] = ovls.write_hit_to_All(All_hits_tab, peak['id'], attrib_k, Dhit, hit, peak['name'], peak['chr'], peak['start'], peak['end'], str(peak['center']), min_pos, genomic_location, ovl_pk, ovl_feat, j) # Hit not fitting in any of the distance values if not Dhit_smaller and queries[j]["internals"] != ["True"] and (All_hits_tab[j][peak['id']] == ''): - #log.debug("\nDhit= {} is LARGER than config Distance:{}.".format(Dhit, queries[j]["distance"])) - # Write over empty or NA the new NA, for new hit,so - # each hit is parsed. + + # Write over empty or NA the new NA, for new hit,so each hit is parsed. All_hits_tab[j][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ 'start'], str(peak['center']), peak['end'], NAsList_q, str(j) + "\n"])) # Hits with further Distance -> Check for Internals if not Dhit_smaller and queries[j]["internals"] == ["True"] and not same_gene: if "Inside" in genomic_location: - #log.debug("\n-> Hit is 'Internal' with distance LARGER than config.distance.It will be recorded to the All_hits table.") All_hits_tab[j][peak['id']] = ovls.write_hit_to_All(All_hits_tab, peak['id'], attrib_k, Dhit, hit, peak['name'], peak['chr'], peak['start'], peak['end'], str(peak['center']), min_pos, genomic_location, ovl_pk, ovl_feat, j) if not Dhit_smaller and All_hits_tab[j][peak['id']] != '': - #log.debug("Hit has larger distance than allowed ") + #Hit has larger distance than allowed continue # > Add to Best Hits the Best Internal features(when internals=True, D>config."distance"), @@ -266,7 +263,7 @@ def annotation_process(input_args, peak_file, log=None): if internal[mv_pos] in ["PeakInsideFeature", "FeatureInsidePeak"] and min_val != []: Best_hits_tab[j][peak['id']] = hit_line[ mv_pos] + "\n" - #logger.debug("\nBest Internal hit to be recorded: {}\n".format(hit_line[mv_pos]+"\n")) + else: # if All_hits_tab[j][peak['id']] == "" : continue @@ -275,15 +272,15 @@ def annotation_process(input_args, peak_file, log=None): # Get them filled with NAs for a in range(len(queries)): if All_hits_tab[a][peak['id']] == "": - #log.debug("\nQuery {} didn't validate any hit ! All_hits will be filled in with NAs".format(a)) + #Query didn't validate any hit.All_hits will be filled in with NAs All_hits_tab[a][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ 'start'], str(peak['center']), peak['end'], NAsList_q, str(a) + "\n"])) if Best_hits_tab[a][peak['id']] == "": - #log.debug("\nQuery {} didn't validate any hit !Best hits will be filled in with NAs".format(a)) + #Query didn't validate any hit !Best hits will be filled in with NAs Best_hits_tab[a][peak['id']] = "\t".join(np.hstack([peak['name'], peak['chr'], peak[ 'start'], str(peak['center']), peak['end'], NAsList_q, str(a) + "\n"])) - # end.for_hit + #--- end.for_hit --- if not has_hits or True not in has_hits: # Write the Non Overlaps in the first dict,so it is saved only once. # log.debug ("\nPeak has no hits ! All queries Tables will be @@ -323,10 +320,10 @@ def annotation_process(input_args, peak_file, log=None): # Queries with NA: map(lambda qna : # All_hits_tab[qna][peak['id']], Q_NA) repl_q = min(QnotNA) # The query to be used instead of qO. - ##log.debug ("Priority Query hasn't given any hit, so it will be replaced by secondary Query's Hit: {} ".format (repl_q)) + ## When Priority_Query hasn't given any hit,it will be replaced by secondary Query's Hit :repl_q All_hits_tab[0][peak['id']] = Best_hits_tab[ 0][peak['id']] = "" - ##log.debug ("\nHits from Secondary Q to replace priority query: \n{} ".format(All_hits_tab[repl_q][peak['id']] )) + ## Hits from Secondary Q to replace priority query: All_hits_tab[repl_q][peak['id']] rest_QnotNA = [qn for qn in QnotNA if qn != repl_q] if any(rest_QnotNA): for qr in rest_QnotNA: @@ -337,11 +334,11 @@ def annotation_process(input_args, peak_file, log=None): qna][peak['id']] = "" if all(isNA): - #logg.debug("\nNo query has any Hit.No replacement of Priority Query possible-> NAs will be filled in the Output.") + # No query with Hit.No replacement of Priority Query possible. Output with NAs. TabInList_p = map(lambda l: All_hits_tab[l][ peak['id']], range(len(queries))) - ##log.debug("Hit lines for all queries are : {}".format(TabInList_p )) - # If all_hits_tab doesn't have all queries will have "" + # Hit lines for all queries are :TabInList_p + # If 'all_hits_tab' doesn't have any hit, all queries will have "". TabInList_p = [Tab for Tab in TabInList_p if Tab != ""] for j in enumerate(TabInList_p): @@ -353,7 +350,6 @@ def annotation_process(input_args, peak_file, log=None): peak['id']] = ovls.merge_queries(TabInList_p) Best_hits_tab[0][ peak['id']] = ovls.merge_queries(TabInList_p) - ##log.debug("\nQueries hits for this peak to be merged in the Q.col:{}".format(ovls.merge_queries(TabInList_p)) ) for r in rest_q: All_hits_tab[r][peak['id']] = Best_hits_tab[ r][peak['id']] = "" diff --git a/uropa/config.py b/uropa/config.py index f44f295..f1cb082 100644 --- a/uropa/config.py +++ b/uropa/config.py @@ -34,7 +34,7 @@ def howtoconfig(): "gtf": "/path/to/annotation/file.gtf" } - Optionally, the priority and bigwig keys can be used to fine tune UROPAs behaviour: + Optionally, the priority key can be used to fine tune UROPAs behaviour: { "queries": [ diff --git a/uropa/overlaps.py b/uropa/overlaps.py index 3543b68..d95f29c 100644 --- a/uropa/overlaps.py +++ b/uropa/overlaps.py @@ -51,7 +51,7 @@ def tabix_index(annot_gtf): os.system('bgzip -c -f ' + annot_gtf + '.sorted ' + ' > ' + out_zipped) - run_tabix = 'tabix -f ' + out_zipped + run_tabix = 'tabix -f -p gff ' + out_zipped idx = subprocess.Popen(shlex.split(run_tabix), stdout=subprocess.PIPE) idx.wait() @@ -82,7 +82,7 @@ def valid_strand(hit_str, p_str, q_str): def valid_attribute(attr_filter_key, attr_filter_val, hit): - """Validates the hit accoring to a filter attribute.""" + """Validates the hit according to a filter attribute.""" if (attr_filter_key != "None") and (attr_filter_val != "None"): try: # If key for filtering is not correct or doesn't exist-> error