diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7cf34e7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,206 @@ +# Created by .ignore support plugin (hsz.mobi) +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests +### R template +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +/*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html +rsconnect/ +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +/bin/3.1_create_gtf/data/ diff --git a/README.md b/README.md index 1aa01e6..75279af 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # masterJLU2018 -De novo motif discovery and evaluation based on footprints identified by TOBIAS +De novo motif discovery and evaluation based on footprints identified by TOBIAS. -For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) +For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki). ## Dependencies * [conda](https://conda.io/docs/user-guide/install/linux.html) @@ -10,35 +10,20 @@ For further information read the [documentation](https://github.molgen.mpg.de/lo * [MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web) ## Installation -Start with installing all dependencies listed above. It is required to set the [enviroment paths for meme-suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar). +Start with installing all dependencies listed above (Nextflow, conda, MEME-Suite) and downloading all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). +It is required to set the [environment paths for MEME-Suite](http://meme-suite.org/doc/install.html?man_type=web#installingtar). this can be done with following commands: ``` export PATH=[meme-suite instalation path]/libexec/meme-[meme-suite version]:$PATH export PATH=[meme-suite instalation path]/bin:$PATH ``` - -Download all files from the [GitHub repository](https://github.molgen.mpg.de/loosolab/masterJLU2018). -The Nextflow-script needs a conda enviroment to run. Nextflow can create the needed enviroment from the given yaml-file. -On some systems Nextflow exits the run with following error: -``` -Caused by: - Failed to create Conda environment - command: conda env create --prefix --file env.yml - status : 143 - message: -``` -If this error occurs you have to create the enviroment before starting the pipeline. -To create this enviroment you need the yml-file from the repository. -Run the following commands to create the enviroment: -```console -path=[Path to given masterenv.yml file] -conda env create --name masterenv -f=$path -``` -When the enviroment is created, set the variable 'path_env' in the configuration file as the path to it. +Every other dependency will be installed automatically by Nextflow using conda. For that, a new conda environment will be created, which can be found in the work directory created by Nextflow after the first pipeline run. +It is **not** required to create and activate the environment from the yaml-file beforehand.
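+For example, assuming MEME-Suite version 5.0.2 was installed to /opt/meme (both the version and the install path are purely illustrative and have to be adjusted to your system), the two export commands above would look like this:
+```
+export PATH=/opt/meme/libexec/meme-5.0.2:$PATH
+export PATH=/opt/meme/bin:$PATH
+```
+Afterwards `which meme` should point into the MEME-Suite bin directory, which confirms that the paths are set correctly.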
**Important Note:** For conda, the channel bioconda needs to be set as the highest priority! This is required due to two different packages with the same name in different channels. For the pipeline the package jellyfish from the channel bioconda is needed and **NOT** the jellyfish package from the channel conda-forge! + ## Quick Start ```console nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] @@ -52,14 +37,16 @@ Required arguments: --genome_fasta Path to genome in FASTA-format --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file - --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. - Path can be set as tfbs_path in next run. (Default: './') - --out Output Directory (Default: './out/') - + --organism Input organism [hg38 | hg19 | mm9 | mm10] + --out Output Directory (Default: './out/') + Optional arguments: - + --help [0|1] 1 to show this help message. (Default: 0) --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) is stored. + Path can be set as tfbs_path in next run. (Default: './') + --gtf_path Path to gtf-file. If a path is set, the process which creates a gtf-file is skipped. Footprint extraction: --window_length INT This parameter sets the length of a sliding window. (Default: 200) @@ -99,12 +86,28 @@ Optional arguments: --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [hg38 | hg19 | mm9 | mm10] Input organism --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON config All arguments can be set in the configuration files ``` +For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki). - -For further information read the [documentation](https://github.molgen.mpg.de/loosolab/masterJLU2018/wiki) +## Known issues +The Nextflow-script needs a conda environment to run. Nextflow creates the needed environment from the given yaml-file. +On some systems Nextflow exits the run with the following error: +``` +Caused by: + Failed to create Conda environment + command: conda env create --prefix --file env.yml + status : 143 + message: +``` +If this error occurs, you have to create the environment before starting the pipeline. +To create this environment you need the yml-file from the repository. +Run the following commands to create the environment: +```console +path=[Path to given masterenv.yml file] +conda env create --name masterenv -f $path +``` +When the environment is created, set the variable 'path_env' in the configuration file to the path of this environment. diff --git a/bin/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py similarity index 52% rename from bin/footprints_extraction.py rename to bin/1.1_footprint_extraction/footprints_extraction.py index bf43de2..3176162 100644 --- a/bin/footprints_extraction.py +++ b/bin/1.1_footprint_extraction/footprints_extraction.py @@ -1,6 +1,7 @@ +#!/usr/bin/env python """ -call_peaks uses the uncontinuous score from a bigWig file to estimate footpints within peaks of interest. +The footprints_extraction script uses the discontinuous score from a bigWig file to estimate footprints within peaks of interest.
@author: Anastasiia Petrova @contact: anastasiia.petrova(at)mpi-bn.mpg.de """ @@ -16,11 +17,16 @@ import pyBigWig #to work with bigWig files import textwrap #to print the help message nice -logger = logging.getLogger('call_peaks') +#the logger should be callable from all functions, so create it here +logger = logging.getLogger('footprints_extraction') logger.setLevel(logging.INFO) formatter = logging.Formatter("%(asctime)s : %(message)s", "%Y-%m-%d %H:%M") +#the function parse_args() reads the input from the user, including required and optional parameters +#if user defines no values for required parameters, the error will be printed and the script exits +#if user defines no values for optional parameters, the default values will be used +#this function returns a structure called args containing values for all parameters def parse_args(): parser = argparse.ArgumentParser(prog = '', description = textwrap.dedent(''' @@ -29,47 +35,55 @@ def parse_args(): '''), epilog='That is what you need to make this script work for you. Enjoy it') required_arguments = parser.add_argument_group('required arguments') - required_arguments.add_argument('--bigwig', help='a bigWig-file with scores', required=True) - required_arguments.add_argument('--bed', help='provide a file with peaks in .bed format', required=True) + required_arguments.add_argument('--bigwig', help='Please provide a path to the bigWig-file with scores.', required=True) + required_arguments.add_argument('--bed', help='Please provide a path to the file with peaks in .bed format.', required=True) #all other arguments are optional - #parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='output directory, by default ./output/') - parser.add_argument('--output_file', default='call_peaks_output.bed', type=str, help='enter a name for the output file, by default ./call_peaks_output.bed') - parser.add_argument('--window_length', default=200, type=int, help='enter the length for a window, by defauld 200 bp') - parser.add_argument('--step', default=100, type=int, help='enter a step to move the window, by default 100 bp') - parser.add_argument('--percentage', default=0, type=int, help='enter a percentage to be added to background while searching for footprints, by default 0%') - parser.add_argument('--silent', action='store_true', help='while working with data write the information only into ./call_peaks_log.log') + #parser.add_argument('--output_directory', default='output', const='output', nargs='?', help='The output directory, by default ./output/') + parser.add_argument('--output_file', default='footprints_extraction.bed', type=str, help='Please enter a name for the output file, by default ./footprints_extraction.bed.') + parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by default 200 bp.') + parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.') + parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.') + parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.') + parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.') args = parser.parse_args() return args -#this function is currently unused, but may
be used in the future version +#this function checks if the directory for the output files exists and if not, the new directory with default name will be created +#the input for this function is a directory to check def check_directory(directory): if not os.path.exists(directory): os.makedirs(directory) - print('a new directory ' + directory + ' was created') + logger.info('The new directory ' + directory + ' was created!') #check if the file to remove exists and if so, delete it +#if the file does not exist, nothing happens def remove_file(file): if os.path.isfile(file): os.remove(file) -#check if the input files exist. We have no possibility to check if the format of the files is right here though +#check if the input files exist, we will check for the right format of input files later on +#the input parameter args is a structure from parse_args, containing the input parameters +#if input files dont exist, the short message will be printed and the exit is forced def check_existing_input_files(args): #check if the bigWig file exists if not os.path.isfile(args.bigwig): - print('there is no such bigWig file ' + args.bigwig + ', the exit is forced') + print('Error: there is no such bigWig file ' + args.bigwig + ', the exit is forced!') sys.exit() #check if the bed file exists if not os.path.isfile(args.bed): - print('there is no such bed file ' + args.bed + ', the exit is forced') + print('Error: there is no such bed file ' + args.bed + ', the exit is forced!') sys.exit() #make a dictionary out of the .bed file for easy access to the information from the .bed file +#the input for this function is a path to the bed file +#the output is a dictionary containing key and values from the given bed file +#key looks like chr1:123-234, the value contains all the additional information from the bed file def make_bed_dictionary(bed_file): - logger.info('reading of the bed file') + logger.info('Reading of the bed file...') try: #if i can't proceede this file like so, the input was not a .bed file! bed_dictionary = {} @@ -83,9 +97,12 @@ def make_bed_dictionary(bed_file): for i in range(3, len(bed_line_array)): value.append(bed_line_array[i]) + if not value: #if there is no bonus information in the original bed file, add a "." to the coulmn in the output bed file + value = ["."] + bed_dictionary[key] = value else: #this is not a bed file, force exit - logger.info('please make sure the input bed file has a right format, the problem occured on the line ' + bed_line) + logger.info('Error: please make sure the input bed file has a right format, the problem occured on the line ' + bed_line) sys.exit() read_bed_file.close() @@ -93,20 +110,25 @@ def make_bed_dictionary(bed_file): return bed_dictionary except UnicodeDecodeError: #force exit, if input is not a .bed file - logger.info('please make sure that the .bed file has a right format! The exit is forced') + logger.info('Error: please make sure that the .bed file has a right format! 
The exit is forced!') sys.exit() #to save a footprint, find the score and max_pos for this footprint and check for overlapping with other footprints within the current peak +#the input for this function is the count for footprint to ensure unique name for each saved footprint; +#an array containing scores (signals) from the bigwig file; the footprints already saved for the particular peak; information for the bed file: +#chromosom, start and end position, as well as additional information from the original bed file +#the function returns the count for footprints, as well as footprints for the current peak def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed): save_current_footprint = False - if len(footprint_scores) > 2: #exclude small regions to not work with them - + #make sure we are working only with regions bigger than 2, in other case we have not enough information to find the footprint + if len(footprint_scores) > 2: + #calculate the position with the max score. if there are many positions with the same score, save one from the middle - first_max_pos = footprint_scores.index(max(footprint_scores)) - last_max_pos = first_max_pos #assume that there is only one pos with max score + #assume that there is only one pos with max score + last_max_pos = first_max_pos #find the region with the highest score for j in range(first_max_pos, len(footprint_scores)): @@ -134,26 +156,31 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom old_start = peak_footprints[existing_footprint_name]['start'] old_end = peak_footprints[existing_footprint_name]['end'] old_score = peak_footprints[existing_footprint_name]['score'] + old_length = peak_footprints[existing_footprint_name]['len'] if footprint_start >= old_start and footprint_start <= old_end: #the start of the new footprint is between the start and end of an old footprint if footprint_end > old_end: #the new footprint is not completely inside the old one #update the information about the existing footprint - footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 #find the average of both scores + #find the average of both scores + footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 peak_footprints[existing_footprint_name]['end'] = footprint_end peak_footprints[existing_footprint_name]['score'] = footprint_score + peak_footprints[existing_footprint_name]['len'] = footprint_end - old_start #we can not update the max_pos as we do not have the information about scores array of the existing footprint #else: the new footprint is completely inside the old one, do nothing save_current_footprint = False break elif footprint_end >= old_start and footprint_end <= old_end: #the end of the new footprint is between the start and end of an old footprint - if footprint_start < old_start: #the new footprint is not completely inside the old one + if footprint_start < old_start: #the new footprint is not completely inside the old one #update the information about the existing footprint - footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 #find the average of both scores + #find the average of both scores + footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2 peak_footprints[existing_footprint_name]['start'] = footprint_start peak_footprints[existing_footprint_name]['score'] = footprint_score + 
peak_footprints[existing_footprint_name]['len'] = old_end - footprint_start #else do nothing save_current_footprint = False break @@ -169,7 +196,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom if save_current_footprint == True: #make sure to save this footprint - footprint_name = "footprint_" + str(footprint_count) + footprint_name = "f_" + str(footprint_count) peak_footprints[footprint_name] = peak_footprints.get(footprint_name, {}) peak_footprints[footprint_name] = {'chromosom': chromosom, 'start': footprint_start, 'end': footprint_end, 'score': footprint_score, 'len': len(footprint_scores), 'bonus': bonus_info_from_bed, 'max_pos': max_pos} @@ -180,12 +207,17 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom return footprint_count, peak_footprints -#apply slide window searching to estimate regions that are higher than the background and can possibly be footprints +#this function uses a slide window algorithm to estimate regions where the signal is higher than the background and can possibly be footprints +#as an input this function takes: the footprints already saved for the current peak; the count for footprints to ensure unique name; +#the information for the bed file (chromosom, start- and end-positions of peak); an array with scores within the peak; the length of the window; +#as well as the dictionary containing the information from the original bed file and the optional parameters such as step to slide the window and percentage to change the background +#this function returns the array containing the found footprints within a peak as well as the footprint count to look for footprints in the next peak and ensure the unique names for footprints def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary_entry, step, percentage): - + + #find the length of the current peak peak_len = len(scores_in_peak) - parts = [] - parts_positions = [] + parts = [] #an array to save scores of parts + parts_positions = [] #an array to save where each part begins in the original peak #if necessary divide the peak with help of a sliding window if peak_len <= window_length: @@ -243,12 +275,66 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe return peak_footprints, footprint_count -#use the information provided from the .bed file to look for footprints within the peaks of interest -def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage): - - logger.info('looking for footprints within peaks') - - try: +#this function checks if there are footprints with a small amount of bp in between, and if so, merges them together +#if at this point there are still overlaps of footprints, this function will delete them +#the input parameter are: dictionary with footprints within one peak, and the max number of bp allowed to be in between the footprints +#the output is the renewed dictionary containing only the best footprints for the output file +def check_and_merge(peak_footprints, max_bp_between): + peak_footprints_new = {} + + for footprint_to_check in peak_footprints.keys(): + start_to_check = peak_footprints[footprint_to_check]['start'] + end_to_check = peak_footprints[footprint_to_check]['end'] + + merge_footprints_left = None + + for compared_footprint in peak_footprints.keys(): + + if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < 
max_bp_between: + #make compared_footprint longer + merge_footprints_left = False + break + elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between: + #make footprint_to_check longer + merge_footprints_left = True + break + + if merge_footprints_left: #if the merging left is enabled + #check if this footprint can be merged with the compared_footprint + #if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now + if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys(): + #update the start position + peak_footprints_new[compared_footprint]['start'] = start_to_check + #update the length + peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check + #update the score + peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 + + elif merge_footprints_left == False: #otherwise merge right + #check if the merging is possible + if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys(): + #update the end position + peak_footprints_new[compared_footprint]['end'] = end_to_check + #update the length + peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end'] + #update the score + peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2 + + else: #save the current footprint, as it should not be merged + peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, []) + peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] + + return peak_footprints_new + +#this function uses the information provided from the .bed file to look for footprints within the peaks of interest +#as input the information from the original bed file, as well as bigwig file is needed +#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score +#the output of this function is a dictionary contains all the found footprints ready to write to the output file +def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between): + + logger.info('Looking for footprints within peaks...') + + try: #this try considers opening the bigwig file footprint_count = 1 all_footprints = {} @@ -265,9 +351,11 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage) peak_end = int(positions[1]) scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array - peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage) + #double check for overlaps and possibly merging of footprints having up to 5 bp in between + peak_footprints = check_and_merge(peak_footprints, max_bp_between) + for footprint_name in peak_footprints.keys(): all_footprints[footprint_name] = all_footprints.get(footprint_name, {}) all_footprints[footprint_name] = peak_footprints[footprint_name] @@ -277,33 +365,49 @@ def 
find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage) return all_footprints except RuntimeError: #if i can't work with the bigwig file like so, it was not a bigwig file! - logger.info('please make sure that the .bigWig file has a right format! The exit is forced') + logger.info('Error: please make sure that the .bigWig file has a right format! The exit is forced!') sys.exit() -#write the found footprints to the .bed file +#this function writes the found footprints to the .bed file +#as input the dictionary with all footprints and the name for the output file are needed +#the function outputs first an unsorted file and finally sorts it, removing the unsorted one def write_to_bed_file(all_footprints, sorted_output_file_name): - output_file_name = "not_sorted_" + sorted_output_file_name #save in the working directory + + #extract the name of directory where we want to save the file + output_directory = os.path.dirname(sorted_output_file_name) + #check if there is a directory provided + if output_directory: + check_directory(output_directory) + + #make sure the file is in .bed format + output_name = "not_sorted_" + os.path.splitext(os.path.basename(sorted_output_file_name))[0] + ".bed" - header = ["#chr", "start", "end", "name", "score", "len", "max_pos", "bonus_info"] #a header to know what is in the columns + #save the output file in the working directory or in the directory provided by the user + output_file_name = (os.path.join(output_directory, output_name)) + + #a header to know what is in the columns + header = ["#chr", "start", "end", "name", "score", "strand", "len", "max_pos", "bonus_info"] - output_file = open(output_file_name, 'w') #open a file to write + #open a file to write + output_file = open(output_file_name, 'w') - logger.info("print to the output file") + logger.info("Printing to the output file...") - output_file.write('\t'.join(header) + '\n') #write the header + #write the header + output_file.write('\t'.join(header) + '\n') #write each footprint line for line to the output file for footprint in all_footprints: - output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), str(footprint[1]['len']), str(footprint[1]['max_pos']), '\t'.join(footprint[1]['bonus'])]) + '\n') + output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n') output_file.close() #sort the bed file - logger.info('sorting the output file') + logger.info('Sorting the output file...') os.system("(head -n 2 " + output_file_name + " && tail -n +3 " + output_file_name + " | sort -k1,1V -k2,2n -k3,3n) > " + sorted_output_file_name) - logger.info('remove the non-sorted file') + logger.info('Removing the non-sorted file...') remove_file(output_file_name) @@ -317,8 +421,8 @@ def main(): #check if there is an existing directory that user gave as input, otherwise create this directory from the path provided from the user #check_directory(args.output_directory) - #fh = logging.FileHandler(os.path.join(args.output_directory, "call_peaks_log.log")) - fh = logging.FileHandler("call_peaks_log.log") + #fh = logging.FileHandler(os.path.join(args.output_directory, "footprints_extraction.log")) + fh = logging.FileHandler("footprints_extraction.log") fh.setLevel(logging.INFO) 
fh.setFormatter(formatter) logger.addHandler(fh) @@ -332,16 +436,16 @@ def main(): if args.silent: logger.removeHandler(ch) - logger.info("call_peaks.py was called using these parameters: " + str(vars(args))) + logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args))) bed_dictionary = make_bed_dictionary(args.bed) - all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage) + all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between) write_to_bed_file(all_footprints, args.output_file) logger.info("the number of peaks: " + str(len(bed_dictionary))) logger.info("the number of footprints: " + str(len(all_footprints))) - logger.info("call_peaks needed %s minutes to generate the output" % (round((time.time() - start)/60, 2))) + logger.info("It took footprints_extraction.py %s minutes to generate the output." % (round((time.time() - start)/60, 2))) for handler in logger.handlers: handler.close() diff --git a/bin/1.2_filter_motifs/compareBed.sh b/bin/1.2_filter_motifs/compareBed.sh new file mode 100644 index 0000000..38027f3 --- /dev/null +++ b/bin/1.2_filter_motifs/compareBed.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com + +# description: This script uses bedtools to find new, non-overlapping regions +# between input data in BED-format and a group of known motifs in +# BED-format. + +# For parameter description, run the script without parameters or -h. +# The output is a file with the filtered footprints and the log file FilterMotifs.stats + +# One R script is used, compareBed_runinfo.R, stored in the same directory. + +# default parameters +workdir=$PWD +output="newMotifs.bed" +min=10 +max=200 +help=false + +path=`echo $0 | sed 's/\/[^\/]*$/\//g'` + +# display help when no parameters chosen +if [ $# -eq 0 ] +then + help=true +fi + +# parsing parameters +wrong=() +while [[ $# -gt 0 ]] +do + key="$1" + if [[ ${2:0:1} == "-" ]] + then + echo "ERROR: Each parameter needs a value (except the help parameter), values must not start with a '-'!" + exit 1 + fi + + case $key in + -d|--data) + data="$2" + shift + shift + ;; + -m|--motifs) + motifs="$2" + shift + shift + ;; + -w|--workdir) + workdir="$2" + shift + shift + ;; + -f|--fasta) + fasta="$2" + shift + shift + ;; + -min|--min) + min=$2 + shift + shift + ;; + -max|--max) + max=$2 + shift + shift + ;; + -o|--output) + output="$2" + shift + shift + ;; + -h|--help) + help=true + shift + ;; + *) + wrong+=("$1") + shift + ;; + esac +done + +# if wrong parameters were chosen.. +count=${#wrong[@]} +if [ $count -gt 0 ] +then + for i in ${wrong[@]} + do + echo ERROR: wrong parameter $i + done + + echo call script without parameters for help or call --help + exit 1 +fi + +# the help message +if [ $help == true ] +then + echo "This script utilizes bedtools to select new footprints from data." + echo "Therefore the data is compared with known footprint motifs." + echo "The output is a new BED-file with added sequence information and a flag contains_maxpos (1/0)" + echo "Required arguments are data and motifs, both in bed-format, as well as the fasta genome sequences." + echo "If a parameter is chosen, a value must be provided as well. (exception: -h)" + echo "--------------------" + echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..."
+ echo "--------------------" + echo " required parameter:" + echo " -d --data the path to the .bed file of the footprints" + echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" + echo " Or a comma separated list of files, e.g.: path1/file1,path2/file2,path3/file3" + echo " -f --fasta the path to the .fasta file of genome" + echo " " + echo " optional parameter:" + echo " -w --workdir the path to directory where temporary files will be created" + echo " default is the current directory" + echo " -min --min minimum size of footprints; default is 10" + echo " -max --max maximum size of footprints; default is 200" + echo " -o --output output path/file; default directory is the current directory and the filenames are newMotifs.bed and newMotifs.bed.fasta" + echo " -h --help shows this help message" + exit 0 +fi + +# summary of parameters +echo selected parameters +echo ------------------- +echo data: $data \(required\) +echo motifs: $motifs \(required\) +echo workdir: $workdir +echo fasta: $fasta \(required\) +echo min: $min +echo max: $max +echo output: $output + +# check required parameters +if [ -z $data ] || [ -z $motifs ] || [ -z $fasta ] +then + echo ERROR + echo required parameters not given. + echo required are: --data \ --motifs \ --fasta \ + exit 1 +fi +if [ ! -f $data ] +then + echo ERROR + echo $data does not exist. Please check input parameter -d / --data + exit 1 +fi +if [ ! -f $fasta ] +then + echo ERROR + echo $fasta does not exist. Please check input parameter -f / --fasta + exit 1 +fi +#check other parameters +if [ $min -lt 0 ] +then + min=10 + echo "min can't be negative. Default value of 10 is chosen" +fi +if [ $max -lt $min ] +then + max=200 + echo "max must be greater than min. Default value of 200 is chosen" +fi +if [ ! -d $workdir ] +then + echo ERROR + echo "directory $workdir does not exist. Please check parameter -w / --workdir" + exit 1 +fi + +# remove trailing tabs in footprints +cat $data | sed 's/[ \t]*$//' > "$workdir"/filtered.bed + +# motiffiles either from a directory OR comma separated list +if [ -d "$motifs" ] +then + # creates an array of all files with bed in its name in the directory $motifs + declare -a motiffiles=(`ls $motifs | grep bed | sed "s|^|$motifs\/|g" | tr '\n' ' ' | sed "s|//|/|g"`) + +# the else case means that the motiffiles were passed comma separated with no whitespace. +else + declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) +fi + +# check if files exist and if they are all empty (exiting if all empty) +all_empty=true +for i in ${motiffiles[@]} +do + if [ -f $i ] + then + if [ $all_empty == true ] + then + lines=`cat $i | wc -l` + if [ $lines -gt 1 ] + then + all_empty=false + break + fi + fi + else + echo ERROR: file $i does not exist + echo please use correct paths. exiting. + exit 1 + fi +done + +# error report of rare case of only empty motiffiles +if [ $all_empty == true ] + then + echo ERROR + echo All motiffiles have less than 2 lines! + echo Fix motiffiles and try again. + exit 1 +fi + +# comparing unknown footprints with regions of known motifs +# comparison is done iteratively +# remove overlapping regions in unknown footprints +temp_switch=true +counter=1 +for i in ${motiffiles[@]} +do + echo "$i --- $counter of ${#motiffiles[@]}" + # remove trailing tabs in motiffile, but only if the second line in the file ends with a tab. + # checking all lines for trailing tabs would be time consuming.
+ secnd_line=`sed -n 2p $i | tr '\t' '#'` + if [[ ${secnd_line: -1} == "#" ]] + then + echo trailing tabs have been found. removing trailing tabs. + sed -i 's/[ \t]*$//' $i + fi + + # bedtools comparisons + if [ $temp_switch == true ] + then + temp_switch=false + bedtools subtract -a "$workdir"/filtered.bed -b $i > "$workdir"/filtered_temp.bed + else + temp_switch=true + bedtools subtract -a "$workdir"/filtered_temp.bed -b $i > "$workdir"/filtered.bed + fi + counter=`expr $counter + 1` +done + +# get file of last iteration and write its content into filtered.bed +if [ $temp_switch == false ] +then + cat "$workdir"/filtered_temp.bed > "$workdir"/filtered.bed +fi + +# remove short/long motifs, make unique ids (relevant for some split tfbs from subtract) and handle maxScorePosition +# also creates a small output file with information about the comparison +Rscript $path/compareBed_runinfo.R $min $max $data "$workdir"/filtered.bed "$workdir"/filtered_flagged.bed "$workdir"/FilterMotifs.stats +# check if Rscript executed without errors +if [ $? -gt 0 ] +then + exit 1 +fi + +# check if header existed. If so, final output also has a header. +first_line=`sed -n 1p $data | sed "s/$/\tcontains_maxpos\tsequence/"` +if [[ ${first_line:0:1} == "#" ]] +then + echo "$first_line" > $output + # add some final values to the log file + fp_initial=`cat $data | wc -l` + fp_initial=`expr $fp_initial - 1` + fp_final=`cat "$workdir"/filtered.bed | wc -l` + fp_final=`expr $fp_final - 1` + echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats + echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats +else + # output will be overwritten if it exists + rm -f $output + # add some final values to the log file + cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/FilterMotifs.stats + cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/FilterMotifs.stats +fi + +# add fasta sequences to bed and create fasta file +echo getting sequences from fasta-file +bedtools getfasta -fi $fasta -bed "$workdir"/filtered_flagged.bed -bedOut >> $output +bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta diff --git a/bin/1.2_filter_motifs/compareBed_runinfo.R b/bin/1.2_filter_motifs/compareBed_runinfo.R new file mode 100644 index 0000000..4888c02 --- /dev/null +++ b/bin/1.2_filter_motifs/compareBed_runinfo.R @@ -0,0 +1,79 @@ +#!/bin/Rscript + +# author: Jannik Hamp +# email: jannik.hamp@googlemail.com + +# description: This script gets called by compareBed.sh +# It removes too small/large footprints, makes names unique, +# adds a column with a flag "contains_maxpos" +# and creates a file with information of the bedtool comparison + +# parameters: Parameters are not named. Respect the parameter order. +# min: minimum footprint size threshold +# max: maximum footprint size threshold +# input_raw: unfiltered BED-file +# input_filtered: filtered BED-file (after bedtools subtract) +# output: output path/file +# output_stats: output file with general info of the comparison + +# parsing parameters +library(data.table) +args <- commandArgs(TRUE) +min <- as.numeric(args[1]) +max <- as.numeric(args[2]) +input_raw <- args[3] +input_filtered <- args[4] +output <- args[5] +output_stats <- args[6] + +data_filtered <- fread(input_filtered, sep='\t') + +# check if data has less than 9 columns +if (ncol(data_filtered) < 9) { + stop("footprint file has less than 9 columns.
exiting.") +} + +# remove sequences that are smaller than minimum (parameter) +# remove sequences that are longer than maximum (parameter) +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] >= min),] +data_filtered <- data_filtered[which(data_filtered[[3]] - data_filtered[[2]] <= max),] + +# make unique names and adjust length for splitted footprints +# duplicated names have .0 , .1 , .2 ... added +names <- data_filtered[[4]] +names(data_filtered)[4] <- "name" +# all duplicated names +duplicants <- unique(data_filtered[duplicated(name)][[4]]) +data_filtered[[4]] <- make.unique(as.character(data_filtered[[4]])) +data_filtered[match(duplicants, names), name := paste0(name,".0")] + +# recalculate length of sequences +data_filtered[[7]] <- data_filtered[[3]] - data_filtered[[2]] + +# adding column "contains_maxpos", containing flag (0 or 1) +# max_pos is the position of maximum score of a footprint +data_filtered <- cbind(data_filtered, contains_maxpos = 0) +data_filtered$contains_maxpos[intersect(which(data_filtered[[2]] <= data_filtered[[8]]), which(data_filtered[[3]] > data_filtered[[8]]))] = 1 +data_filtered[[8]] <- data_filtered[[8]] - data_filtered[[2]] + +fwrite(data_filtered, output, col.names=FALSE, quote = FALSE, sep = '\t') + +# data is the initial data before any comparisons have been done (-d parameter of compareBed.sh) +data <- fread(input_raw, sep='\t') + +# some statistics about the bedtool comparisons are stored in FilterMotifs.stats +# number of nucleotides input +sum_data <- sum(data[[3]]-data[[2]]) +# number of nucleotides after filter +sum_filtered <- sum(data_filtered[[7]]) +# quotient: sum_data/sum_filtered +difference_nt <- formatC(sum_data/sum_filtered, digits = 4) +# loss: 1 - sum_filtered/sum_data +loss_nt <- formatC(1 - sum_filtered/sum_data, digits = 2) +# mean length of footprints input +length_data <- formatC(mean(data[[3]]-data[[2]]), digits = 4) +# mean -ength of footprints after filter +length_filtered <- formatC(mean(data_filtered[[7]]), digits = 4) +stats <- data.frame(sum_nt_input = sum_data, sum_nt_filtered = sum_filtered, quotient_of_nt = difference_nt, loss_of_nt = loss_nt, mean_length_input = length_data, mean_length_filtered = length_filtered, flag_1_ratio = length(which(data_filtered$containsMaxpos == 1))/dim(data_filtered)[1]) +stats <- t(stats) +write.table(stats, output_stats, col.names = FALSE, quote = FALSE, sep = '\t') diff --git a/bin/tfbsscan.py b/bin/1.2_filter_motifs/tfbsscan.py similarity index 100% rename from bin/tfbsscan.py rename to bin/1.2_filter_motifs/tfbsscan.py diff --git a/bin/cdhit_wrapper.R b/bin/2.1_clustering/cdhit_wrapper.R similarity index 75% rename from bin/cdhit_wrapper.R rename to bin/2.1_clustering/cdhit_wrapper.R index e5ed3e4..ebf7667 100644 --- a/bin/cdhit_wrapper.R +++ b/bin/2.1_clustering/cdhit_wrapper.R @@ -1,10 +1,10 @@ #! /bin/Rscript -library("optparse") +if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse) option_list <- list( make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Fourth column is expected to contain names, last column must be sequences.", metavar = "character"), make_option(opt_str = c("-c", "--identity"), default = 0.8, help = "Identity threshold. Default = %default (CD-HIT parameter)", metavar = "double >= 0.8"), - make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucelotides. 
Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-A", "--coverage"), default = 8, help = "Minimal alignment length for both sequences in nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-o", "--output"), default = "cluster.bed", help = "Output file same as input but with appended column of cluster numbers. Default = %default", metavar = "character"), make_option(opt_str = c("--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), make_option(opt_str = c("-G", "--global"), default = 0, help = "Global sequence identity = 1, local = 0. Default = %default (CD-HIT parameter)", metavar = "integer"), @@ -14,7 +14,7 @@ option_list <- list( make_option(opt_str = c("-n", "--word_length"), default = 3, help = "Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-l", "--throw_away_sequences"), default = 5, help = "Maximum length of sequences thrown away. Default = %default (CD-HIT parameter)", metavar = "integer"), # make_option(opt_str = c("-d", "--description")), # can not produce bed if this is != 0 - make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequenecs length. Default = %default (CD-HIT parameter)", metavar = "double"), + make_option(opt_str = c("-s", "--length_dif_cutoff_shorter_p"), default = 0.0, help = "Shorter sequences length must be at least x percent of longer sequences length. Default = %default (CD-HIT parameter)", metavar = "double"), make_option(opt_str = c("-S", "--length_dif_cutoff_shorter_n"), default = 999999, help = "Length difference between sequences can not be higher than x nucleotides. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-e", "--alignment_coverage_longer_p"), default = 0.0, help = "Alignment must cover x percent of longer sequence. Default = %default (CD-HIT parameter: -aL)", metavar = "double"), make_option(opt_str = c("-E", "--alignment_coverage_longer_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AL)", metavar = "integer"), @@ -22,18 +22,20 @@ option_list <- list( make_option(opt_str = c("-F", "--alignment_coverage_shorter_n"), default = 99999999, help = "There can not be more than x unaligned nucleotides on the longer sequence. Default = %default (CD-HIT parameter: -AS)", metavar = "integer"), make_option(opt_str = c("-w", "--max_unmatched_longer_p"), default = 1.0, help = "Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uL)", metavar = "double"), make_option(opt_str = c("-W", "--max_unmatched_shorter_p"), default = 1.0, help = "Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = %default (CD-HIT parameter: -uS)", metavar = "double"), - make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = %default (CD-HIT parameter)", metavar = "integer"), + make_option(opt_str = c("-U", "--max_unmatched_both_n"), default = 99999999, help = "Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). 
Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-g", "--fast_cluster"), default = 1, help = "Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("-r", "--strand"), default = 0, help = "Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--match"), default = 2, help = "Matching score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--mismatch"), default = -2, help = "Mismatch score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap"), default = -6, help = "Gap score. Default = %default (CD-HIT parameter)", metavar = "integer"), make_option(opt_str = c("--gap_ext"), default = -1, help = "Gap extension score. Default = %default (CD-HIT parameter)", metavar = "integer"), - make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer") + make_option(opt_str = c("-x", "--sort_cluster_by_size"), default = 1, help = "Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = %default (CD-HIT parameter: -sc)", metavar = "integer"), + make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character") ) opt_parser <- OptionParser(option_list = option_list, - description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.") + description = "CD-HIT-EST Wrapper function. See https://github.com/weizhongli/cdhit/wiki for more information regarding CD-HITs parameters.", + epilogue = "Author: Hendrik Schultheis ") opt <- parse_args(opt_parser) @@ -42,8 +44,8 @@ opt <- parse_args(opt_parser) #' #' @param input Data.table or file in bed format (requires names in fourth column and sequences in last column). #' @param identity Identity threshold. Default = 0.8 (CD-HIT parameter) -#' @param coverage Minimal alignment length for both sequences in nucelotides. Default = 8 (CD-HIT parameter) -#' @param output Clustered bedfile. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed +#' @param coverage Minimal alignment length for both sequences in nucleotides. Default = 8 (CD-HIT parameter) +#' @param output Clustered bed-file. Adds cluster number in last column (numbering depend on sort_by_cluster_size parameter). Default = cluster.bed #' @param clean Clean up after run. Default = TRUE #' @param threads Number of threads to use (0 = all cores). Default = 1 (CD-HIT parameter) #' @param global Global sequence identity = 1, local = 0. Default = 0 (CD-HIT parameter) @@ -51,7 +53,7 @@ opt <- parse_args(opt_parser) #' @param memory Memory limit in MB. 0 for unlimited. Default = 800 (CD-HIT parameter) #' @param word_length Default = 3 (CD-HIT parameter) #' @param throw_away_sequences Maximum length of sequences thrown away. Default = %default (CD-HIT parameter) -#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequenecs length. 
Default = 0 (CD-HIT parameter) +#' @param length_dif_cutoff_shorter_p Shorter sequences length must be at least x percent of longer sequences length. Default = 0 (CD-HIT parameter) #' @param length_dif_cutoff_shorter_n Length difference between sequences can not be higher than x nucleotides. Default = 999999 (CD-HIT parameter) #' @param alignment_coverage_longer_p Alignment must cover x percent of longer sequence. Default = 0 (CD-HIT parameter) #' @param alignment_coverage_longer_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) @@ -59,33 +61,54 @@ opt <- parse_args(opt_parser) #' @param alignment_coverage_shorter_n There can not be more than x unaligned nucleotides on the longer sequence. Default = 99999999 (CD-HIT parameter) #' @param max_unmatched_longer_p Maximum unmatched percentage on longer sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) #' @param max_unmatched_shorter_p Maximum unmatched percentage on shorter sequence (excludes leading tailing gap). Default = 1 (CD-HIT parameter) -#' @param max_unmatched_both_n Maximum unmatched nucleotied on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) +#' @param max_unmatched_both_n Maximum unmatched nucleotide on both sequences (excludes leading tailing gap). Default = 99999999 (CD-HIT parameter) #' @param fast_cluster Cluster sequence in first cluster that meets threshold = 0 (fast). Or cluster in best Cluster = 1 (accurate). Default = 1 (CD-HIT parameter) #' @param strand Align +/+ & +/- (= 1). Or align only +/+ (= 0). Default = 0 (CD-HIT parameter) #' @param match Matching score. Default = 2 (CD-HIT parameter) #' @param mismatch Mismatch score. Default = -2 (CD-HIT parameter) #' @param gap Gap score. Default = -6 (CD-HIT parameter) -#' @param gat_ext Gap extension score. Default = -1 (CD-HIT parameter) +#' @param gap_ext Gap extension score. Default = -1 (CD-HIT parameter) #' @param sort_cluster_by_size Either sort cluster by decreasing length (= 0) or by decreasing size (= 1). Default = 1 (CD-HIT parameter) +#' @param summary Filepath where a small summary file of the run should be written. #' -#' TODO check whether cdhit is installed -cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1) { +#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept and extended. 
+#' +#' @author Hendrik Schultheis +#' +cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed", clean = TRUE, threads = 1, global = 0, band_width = 20, memory = 800, word_length = 3, throw_away_sequences = 5, length_dif_cutoff_shorter_p = 0, length_dif_cutoff_shorter_n = 999999, alignment_coverage_longer_p = 0, alignment_coverage_longer_n = 99999999, alignment_coverage_shorter_p = 0, alignment_coverage_shorter_n = 99999999, max_unmatched_longer_p = 1, max_unmatched_shorter_p = 1, max_unmatched_both_n = 99999999, fast_cluster = 1, strand = 0, match = 2, mismatch = -2, gap = -6, gap_ext = -1, sort_cluster_by_size = 1, summary = NULL) { + # parameter checks + if (system("which cd-hit-est", ignore.stdout = FALSE) != 0) { + stop("Required program CD-HIT not found! Please check whether it is installed.") + } + if (missing(input)) { - stop("Input parameter missing!") + stop("No input specified! Please forward a valid bed-file.") + } + + if (!file.exists(input)) { + stop("File ", input, " does not exist!") + } + + if (!is.logical(clean)) { + stop("Parameter clean has to be a boolean value.") } message("Loading bed.") - # load bed if neccessary + # load bed if necessary if (!data.table::is.data.table(input)) { - bed_table <- data.table::fread(input = input, header = FALSE) + bed_table <- data.table::fread(input = input) } else { bed_table <- input } + # check if there are column names to keep them + default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) + keep_col_names <- ifelse(any(default_col_names), FALSE, TRUE) + # check for duplicated names - if (anyDuplicated(bed_table[, "V4"])) { + if (anyDuplicated(bed_table[, 4])) { warning("Found duplicated names. Making names unique.") - bed_table[, V4 := make.unique(V4)] + bed_table[, 4 := make.unique(names(bed_table)[4])] } message("Convert to fasta.") @@ -137,22 +160,57 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed" system(command = cluster_call, wait = TRUE) # load reformated file - cluster <- data.table::fread(cluster_file) + cluster <- data.table::fread(cluster_file)[, c("id", "clstr")] + names(cluster) <- c("id", "cluster") ### add cluster to bed_table - result <- merge(x = bed_table, y = cluster[, c("id", "clstr")], by.x = "V4", by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] + result <- merge(x = bed_table, y = cluster, by.x = names(bed_table)[4], by.y = "id", sort = FALSE)[, union(names(bed_table), names(cluster)[2]), with = FALSE] # delete files if (clean) { file.remove(fasta_file, paste0(cdhit_output, ".clstr"), cdhit_output, cluster_file) } - data.table::fwrite(x = result, file = output, sep = "\t", col.names = FALSE) + # write summary + if (!is.null(summary)) { + # script/ function call + file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())]) + call <- ifelse(interactive(), deparse(match.call()), + paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " ")) + ) + + total_cluster <- length(unique(result[[ncol(result)]])) + + # cluster size table + # columns: cluster_id, size + cluster_table <- data.table::as.data.table(table(result[[ncol(result)]])) + names(cluster_table) <- c("cluster_id", "size") + + summary_data <- c( + "Call:", + paste0("\t", call), + "", + "Cluster:", + paste("\tTotal", total_cluster), + "" + ) + + write(x = summary_data, file = summary) + data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE) 
+ } + + data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names) } # call function with given parameter if not in interactive context (e.g. run from shell) if (!interactive()) { # remove last parameter (help param) - params <- opt[-length(opt)] - do.call(cdhitest, args = params) + + # show help if called without arguments + if (length(commandArgs(trailingOnly = TRUE)) <= 0) { + print_help(opt_parser) + } else { + params <- opt[-length(opt)] + do.call(cdhitest, args = params) + } } diff --git a/bin/2.1_clustering/reduce_sequence.R b/bin/2.1_clustering/reduce_sequence.R new file mode 100644 index 0000000..15d99ae --- /dev/null +++ b/bin/2.1_clustering/reduce_sequence.R @@ -0,0 +1,369 @@ +#! /bin/Rscript +if (!require(optparse, quietly = TRUE)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), + make_option(opt_str = c("-k", "--kmer"), default = 10, help = "K-mer length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), + make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), + make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), + make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), + make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower.", metavar = "integer", type = "integer"), + make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1", metavar = "integer", type = "integer"), + make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), + make_option(opt_str = c("-f", "--motif_occurrence"), default = 1, help = "Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = %default meaning that there should be approximately one motif per sequence.", metavar = "double"), + make_option(opt_str = c("--summary"), default = NULL, help = "Filepath where a small summary file of the run should be written. Default = %default", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Reduces each sequence to its most frequent region.", + epilogue = "Author: Hendrik Schultheis ") + +opt <- parse_args(opt_parser) + +#' Reduces each sequence to its most frequent region. +#' +#' @param input Input bed-file. Last column must be sequences. +#' @param kmer k-mer length. Default = 10 +#' @param motif Estimated motif length. Default = 10 +#' @param output Output file. Default = reduced.bed +#' @param threads Number of threads to use. Default = 1. Use 0 for all cores. 
+#' @param clean Delete all temporary files. +#' @param minoverlap_kmer Minimum required overlap between k-mer. Used to create reduced sequence ranges out of merged k-mer. Can not be greater than k-mer length. Default = kmer - 1 +#' @param minoverlap_motif Minimum required overlap between motif and k-mer to consider k-mer significant. Used for k-mer cutoff calculation. Can not be greater than motif and k-mer length. Default = ceiling(motif / 2) +#' @param min_seq_length Remove sequences below this length. Defaults to the maximum value of motif and k-mer and can not be lower. +#' @param motif_occurrence Define how many motifs are expected per sequence. This value is used during k-mer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. +#' @param summary Filepath where a small summary file of the run should be written. +#' +#' @details If there is a header supplied other then the default data.table naming scheme ('V1', 'V2', etc.) it will be kept. +#' +#' @author Hendrik Schultheis +#' +reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = 1, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurrence = 1, summary = NULL) { + # parameter checks + if (system("which jellyfish", ignore.stdout = TRUE) != 0) { + stop("Required program jellyfish not found! Please check whether it is installed.") + } + + if (missing(input)) { + stop("No input specified! Please forward a valid bed-file.") + } + + if (!file.exists(input)) { + stop("File ", input, " does not exist!") + } + + if (!is.numeric(kmer) || kmer != as.integer(kmer) || kmer <= 0) { + stop("K-mer has to be a positive integer above 0.") + } + + if (!is.numeric(motif) || motif != as.integer(motif) || motif <= 0) { + stop("Motif has to be a positive integer above 0.") + } + + if (!is.numeric(threads) || threads != as.integer(threads) || threads < 0) { + stop("Threads has to be a positive integer (0 or greater).") + } + + if (!is.logical(clean)) { + stop("Parameter clean has to be a boolean value.") + } + + if (!is.numeric(minoverlap_kmer) || minoverlap_kmer != as.integer(minoverlap_kmer) || minoverlap_kmer <= 0) { + stop("Minoverlap_kmer has to be a positive integer above 0.") + } + + if (!is.numeric(minoverlap_motif) || minoverlap_motif != as.integer(minoverlap_motif) || minoverlap_motif <= 0) { + stop("Minoverlap_motif has to be a positive integer above 0.") + } + + if (!is.numeric(min_seq_length) || min_seq_length != as.integer(min_seq_length) || min_seq_length <= 0) { + stop("Min_seq_length hat to be a positive integer above 0.") + } + + if (!is.numeric(motif_occurrence) || motif_occurrence < 0 || motif_occurrence > 1) { # TODO remove motif_occurence > 1. See TODO of find_kmer_regions below. 
+ stop("Motif_occurence has to be a numeric value above 0 and can not be greater than 1.") + } + + # get number of available cores + if (threads == 0) { + threads <- parallel::detectCores() + } + + message("Loading bed...") + # load bed + # columns: chr, start, end, name, ..., sequence + bed_table <- data.table::fread(input = input) + + # check for header and save it if provided + default_col_names <- grepl(pattern = "^V+\\d$", names(bed_table), perl = TRUE) + if (!any(default_col_names)) { + keep_col_names <- TRUE + col_names <- names(bed_table) + } else { + keep_col_names <- FALSE + } + + names(bed_table)[1:4] <- c("chr", "start", "end", "name") + names(bed_table)[ncol(bed_table)] <- "sequence" + # index + data.table::setkey(bed_table, name, physical = FALSE) + + # check for duplicated names + if (anyDuplicated(bed_table[, "name"])) { + warning("Found duplicated names. Making names unique.") + bed_table[, name := make.unique(name)] + } + + # remove sequences below minimum length + if (min_seq_length < max(c(kmer, motif))) { + stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of k-mer and motif).") + } + + total_rows <- nrow(bed_table) + bed_table <- bed_table[nchar(sequence) > min_seq_length] + if (total_rows > nrow(bed_table)) { + message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length) + } + + # TODO forward fasta file as parameter so no bed -> fasta conversion is needed. + message("Writing fasta...") + # save as fasta + fasta_file <- paste0(basename(input), ".fasta") + seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) + + message("Counting k-mer...") + # count k-mer + hashsize <- 4 ^ kmer + count_output_binary <- "mer_count_binary.jf" + input <- fasta_file + jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input) + + system(command = jellyfish_call, wait = TRUE) + + mer_count_table <- "mer_count_table.jf" + jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary) + + system(command = jellyfish_dump_call, wait = TRUE) + + message("Reduce k-mer.") + # load mer table + # columns: kmer, count + kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE) + # order k-mer descending + data.table::setorder(kmer_counts, -V2) + + # compute number of hits to keep + keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurrence = motif_occurrence) + + # reduce k-mer + reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) + message("Reduced kmer to first most frequent ", nrow(reduced_kmer), " out of ", nrow(kmer_counts), ".") + + message("Find k-mer in sequences.") + # find k-mer in sequences + # columns: name, start, end, width + ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) + names(ranges_table)[1:2] <- c("relative_start", "relative_end") + + # merge ranged_table with bed_table + keep column order + merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE] + + # delete sequences without hit + merged <- na.omit(merged, cols = c("relative_start", "relative_end")) + message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.") + + message("Reduce 
sequences.") + # create subsequences + merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] + + # bed files count from 0 + merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] + # change start end location + merged[, `:=`(start = start + relative_start, end = start + relative_end)] + + # clean table + merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] + + if (clean) { + file.remove(fasta_file, count_output_binary, mer_count_table) + } + + # write summary + if (!is.null(summary)) { + # script/ function call + file_name <- sub("--file=", "", commandArgs()[grep("--file=", commandArgs())]) + call <- ifelse(interactive(), deparse(match.call()), + paste("Rscript", basename(file_name), paste(commandArgs(trailingOnly = TRUE), collapse = " ")) + ) + + # sequence length summary + summary_input <- summary(nchar(bed_table[["sequence"]])) + summary_output <- summary(nchar(merged[["sequence"]])) + + # sequences removed + seq_to_small <- total_rows - nrow(bed_table) + no_hit_in_assembly <- nrow(bed_table) - nrow(merged) + total_removed <- seq_to_small + no_hit_in_assembly + + summary_data <- c( + "Call:", + paste0("\t", call), + "", + "Sequence length information:", + paste0("\t\t", paste0(names(summary_input), collapse = "\t")), + paste("\tInput", paste0(summary_input, collapse = "\t"), sep = "\t"), + paste("\tOutput", paste0(summary_output, collapse = "\t"), sep = "\t"), + "Total sequences:", + paste("\tInput", total_rows), + paste("\tOutput", nrow(merged)), + "Removed sequences:", + paste("\tBelow minimum size", seq_to_small), + paste("\tNo hit in assembly", no_hit_in_assembly), + paste("\tTotal", total_removed) + ) + + write(x = summary_data, file = summary) + } + + # keep provided column names + if (keep_col_names) { + names(merged) <- col_names + } + + data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names) +} + +#' Predict how many interesting k-mer are possible for the given data. +#' +#' @param bed Bed table with sequences in last column +#' @param kmer Length of k-mer +#' @param motif Length of motif +#' @param minoverlap Minimum number of bases overlapping between k-mer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). +#' @param motif_occurrence Define how many motifs are expected per sequence. Default = 1 +#' +#' @return Number of interesting k-mer. +significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurrence = 1) { + if (minoverlap > kmer || minoverlap > motif) { + stop("Kmer & motif must be greater or equal to minoverlap!") + } + if (motif_occurrence <= 0) { + stop("Motif_occurrence must be a numeric value above 0!") + } + + # minimum sequence length to get all interesting overlaps + min_seq_length <- motif + 2 * (kmer - minoverlap) + + seq_lengths <- nchar(bed[[ncol(bed)]]) + + # reduce to max interesting length + seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths) + + # calculate max possible k-mer + topx <- sum(seq_lengths - kmer + 1) + + return(topx * motif_occurrence) +} + +#' Orders k-mer table after count descending and keeps all k-mer with a cumulative sum below the given significance threshold. +#' +#' @param kmer K-mer data.table columns: kmer, count +#' @param significant Value from significant_kmer function. 
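+#'
+#' @details Worked example with made-up counts: for descending counts c(10, 5, 3) and
+#' significant = 16 the cumulative sums are 10, 15 and 18, so only the two most
+#' frequent k-mer (cumulative sum <= 16) are kept.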
+#' +#' @return reduced data.table +reduce_kmer <- function(kmer, significant) { + data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) + + # TODO don't use 'V2' + kmer[, cumsum := cumsum(V2)] + + return(kmer[cumsum <= significant]) +} + +#' create list of significant ranges (one for each bed entry) +#' +#' @param bed Data.table of bed with sequence in last column +#' @param kmer_counts Data.table of counted k-mer. Column1 = kmer, column2 = count. +#' @param minoverlap Minimum overlapping nucleotides between k-mers to be merged. Positive integer. Must be smaller than k-mer length. +#' @param threads Number of threads. +#' +#' @return Data.table with relative positions and width (start, end, width). +#' +#' TODO Include number of motifs per sequence (aka motif_occurrence). Attempt to keep best 2 regions for occurrence = 2? Probably high impact on performance. +find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { + if (nchar(kmer_counts[1, 1]) <= minoverlap) { + stop("Minoverlap must be smaller than k-mer length!") + } + + names(kmer_counts)[1:2] <- c("kmer", "count") + data.table::setorder(kmer_counts, -count) + + seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { + seq <- bed[x][[ncol(bed)]] + name <- bed[x][[4]] + + #### locate ranges + ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) + + ranges <- na.omit(ranges, cols = c("start", "end")) + + if (nrow(ranges) < 1) { + return(data.table::data.table(start = NA, end = NA, width = NA, name = name)) + } + + # add k-mer sequences + ranges[, sub_seq := stringr::str_sub(seq, start, end)] + # add k-mer count + ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]] + + #### reduce ranges + reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]]) + + # list of overlapping ranges + edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE)) + + # get components (groups of connected ranges) + graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE) + # vector of node membership (indices correspond to ranges above) + member <- as.factor(igraph::components(graph)$membership) + + # list of membership vectors + node_membership <- lapply(levels(member), function(x) { + which(member == x) + }) + + # calculate component score (= sum of k-mer count) + score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) { + sum(kmer_counts[x, "count"]) + }) + + selected_ranges <- node_membership[[which(score == max(score))[1]]] + + # reduce selected ranges + reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) + + reduced_ranges <- data.table::as.data.table(reduced_ranges)[, name := name] + + return(reduced_ranges) + }) + + # create ranges table + conserved_regions_table <- data.table::rbindlist(seq_ranges) + + return(conserved_regions_table) +} + +# call function with given parameter if not in interactive context (e.g. 
run from shell) +if (!interactive()) { + # show apply progressbar + pbo <- pbapply::pboptions(type = "timer") + + # show help if called without arguments + if (length(commandArgs(trailingOnly = TRUE)) <= 0) { + print_help(opt_parser) + } else { + # remove last parameter (help param) + params <- opt[-length(opt)] + do.call(reduce_sequence, args = params) + } +} diff --git a/bin/2.2_motif_estimation/bed_to_fasta.R b/bin/2.2_motif_estimation/bed_to_fasta.R new file mode 100644 index 0000000..7e716b7 --- /dev/null +++ b/bin/2.2_motif_estimation/bed_to_fasta.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Second last column must be sequences and last column must be the cluster id.", metavar = "character"), + make_option(opt_str = c("-p", "--prefix"), default = "" , help = "Prefix for file names. Default = '%default'", metavar = "character"), + make_option(opt_str = c("-m", "--min_seq"), default = 100, help = "Minimum amount of sequences in clusters. Default = %default", metavar = "integer") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Convert BED-file to one FASTA-file per cluster.", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + +#' Splitting BED-files depending on their cluster. +#' The Sequences of each cluster are written as a FASTA-file. +#' @param bedInput BED-file with sequences and cluster-id as last two columns: +#' Sequence: second last column; Cluster ID: last column +#' @param prefix prefix for filenames +#' @param min_seq min. number of sequences per cluster +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +bed_to_fasta <- function(bedInput, prefix = "", min_seq = 100){ + + if (is.null(bedInput)) { + stop("ERROR: Input parameter cannot be null! Please specify the input parameter.") + } + + bed <- data.table::fread(bedInput, sep = "\t") + + # Get last column of data.table, which refers to the cluster, as a vector. + cluster_no <- bed[[ncol(bed)]] + + # Split data.table bed on its last column (cluster_no) into list of data.frames + clusters <- split(bed, cluster_no, sorted = TRUE, flatten = FALSE) + + # For each data.frame(cluster) in list clusters: + discard <- lapply(seq_len(length(clusters)), function(i){ + clust <- as.data.frame(clusters[i]) + # Filter data.tables(clusters), which are to small + if (nrow(clust) >= as.numeric(min_seq) ) { + # Get second last column, which contains the nucleotide sequences + sequences <- as.list(clust[[ncol(clust) - 1]]) + # Create filename + outfile <- paste0(prefix,"_cluster_",i - 1,".FASTA") + # Write fasta file + seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) + } else { + print(paste0("Cluster: ",i," is to small")) + } + }) +} + +# run function bed_to_fasta with given parameteres if not in interactive context (e.g. 
run from shell) +if (!interactive()) { + bed_to_fasta(opt$input, opt$prefix, opt$min_seq) +} diff --git a/bin/2.2_motif_estimation/get_best_motif.py b/bin/2.2_motif_estimation/get_best_motif.py new file mode 100644 index 0000000..a506bd8 --- /dev/null +++ b/bin/2.2_motif_estimation/get_best_motif.py @@ -0,0 +1,64 @@ +''' +parses arguments using argparse +@return args list of all parameters +''' +def parse_arguments(): + parser = argparse.ArgumentParser(description='A script to convert from GLAM2 output to MEME-format and parsing only the [num] first motifs from file to the output.') + parser.add_argument("meme", help="Path to 'meme' file generated by GLAM2") + parser.add_argument("output", help="Output file") + parser.add_argument("num", help="Number of motifs parsed from file") + args = parser.parse_args() + return args + +''' +The script has two functions: + 1. Writing lines of file till certain line (MOTIF + [num]) + 2. Converting GLAM2 output to minimal meme-format +@params meme STRING Path to 'meme' file generated from Meme suite +@parmas output STRING Output file +@params num INT Number of motifs parsed from file + +@author René Wiegandt +@contact rene.wiegandt(at)mpi-bn.mpg.de +''' +def main(): + + args = parse_arguments() + out = open(args.output, "w+") + + ''' + Create pattern where script should stop writing + For Example: + If num == 3, which means that you want the first/best 3 Motifs, the script + should stop writing lines to output if loop reaches line 'MOTIF 4' + ''' + number = int(args.num) + 1 + break_header = "MOTIF " + str(number) + + # Pattern for motif header + pattern = re.compile("^MOTIF\s{2}(\d)+") + # Init count + count = 0 + + with open(args.meme) as f: + for line in f: + ## do not write [count] lines after each header -> needed for meme-format + if count > 0: + count-=1 + continue + if pattern.match(line): + # if line is a motif header + count = 2 + ## + + if break_header in line: + # line matches breaking_header, e.g. 'MOTIF 4' + break + else: + out.write(line) + + +if __name__ == "__main__": + import argparse + import re + main() diff --git a/bin/2.2_motif_estimation/label_cluster.R b/bin/2.2_motif_estimation/label_cluster.R new file mode 100644 index 0000000..60f1ec8 --- /dev/null +++ b/bin/2.2_motif_estimation/label_cluster.R @@ -0,0 +1,40 @@ +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from tomtom", metavar = "character"), + make_option(opt_str = c("-o", "--output"), default = NULL, help = "Output TSV-file.", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Adding Cluster ID to Query_ID Column.", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + +#' Adding Cluster ID to Query_ID Column +#' +#' @param input TSV-file. Output from tomtom. +#' @param input Output name. 
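+#'
+#' @details The cluster ID is derived from the input file name. Illustrative example,
+#' assuming a name such as "prefix_cluster_3.tsv": splitting on "_" and "." gives "3"
+#' as the second last token, which is incremented by one, so ".4" is appended to every Query_ID.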
+#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +label_cluster <- function(input, output){ + #Reading TSV-file + tsv <- data.table::fread(input, header = T, sep = "\t") + + #Getting cluster ID/number + splitted_name <- unlist(strsplit(input, "\\_|\\.")) + cluster_number <- as.numeric(splitted_name[length(splitted_name) - 1]) + 1 + + #Adding cluster ID to first column + tsv$Query_ID <- paste0(tsv$Query_ID, ".", cluster_number) + + + data.table::fwrite(tsv, file = output, sep = "\t", col.names = FALSE) +} + +# run function label_cluster with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + label_cluster(opt$input, opt$output) +} diff --git a/bin/2.2_motif_estimation/merge_similar_clusters.R b/bin/2.2_motif_estimation/merge_similar_clusters.R new file mode 100644 index 0000000..d9b2373 --- /dev/null +++ b/bin/2.2_motif_estimation/merge_similar_clusters.R @@ -0,0 +1,79 @@ +#!/usr/bin/env Rscript +if (!require(optparse)) install.packages("optparse"); library(optparse) + +option_list <- list( + make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input TSV-file. Output from merged tomtom results", metavar = "character"), + make_option(opt_str = c("-l", "--list"), default = NULL, help = "Numerically sorted whitespace separated list of absolute fasta-file paths", metavar = "character"), + make_option(opt_str = c("-w", "--min"), default = NULL, help = "Minimum weight of edge allowed in graph clusters.", metavar = "character") +) + +opt_parser <- OptionParser(option_list = option_list, + description = "Adding Cluster ID to Query_ID Column", + epilogue = "Author: Rene Wiegandt ") + +opt <- parse_args(opt_parser) + +#' Merging FASTA-files, which motifs are similar. +#' +#' @parameter tsv_in Path to TSV file generated by Tomtom. +#' The input for Tomtom is a from all clusters merged meme-file. +#' @parameter file_list Numerically sorted comma separated list of absolute fasta-file paths +#' @parameter min_weight Minimum weight of edge allowed in graph clusters. +#' +#' @author René Wiegandt +#' @contact rene.wiegandt(at)mpi-bn.mpg.de +merge_similar <- function(tsv_in, file_list, min_weight){ + + files <- unlist(strsplit(file_list, ",")) + + # split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. 
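+  # Illustrative example (IDs as labelled by label_cluster.R): a Query_ID such as "1.5"
+  # yields cluster number 5, the last element after splitting on ".".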
+ tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') + query_cluster <- vapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ + tail(l, n = 1) + },"") + target_cluster <- vapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ + tail(l, n = 1) + },"") + + # create data.table with only the cluster-numbers + sim_not_unique <- data.table::data.table(as.numeric(query_cluster),as.numeric(target_cluster)) + + # remove rows if column 1 is identical to column 2 + edgelist <- sim_not_unique[query_cluster != target_cluster] + + # create graph from data.frame + g <- igraph::graph_from_edgelist(as.matrix(edgelist)) + # converting graph to adjacency matrix + adj_matrix <- igraph::get.adjacency(g, names = T) + # generating weighted graph from adjacency matrix + g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) + + # get subgraphs from graph with edges of weight > min_weight + s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight > min_weight], del = F) + png('motif_clusters.png') + plot(s1) + dev.off() + clust <- igraph::clusters(s1) + if (clust$no < 1) { + b <- lapply(files, function(f){ + system(paste("cat",f,">",basename(f))) + }) + } + # merge FASTA-files depending on the clustered graphs + a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ + #get vector with cluster ids, which are clustered together in "motif cluster" i + cl <- as.vector(which(clust$membership %in% c(i))) + #create string, which represents a list, containing all FASTA-files to be merged + fasta_list <- paste(files[cl], collapse = " ") + #create name for merged FASTA-file + name <- paste0("Cluster_",i,".fasta") + #merge fasta files + system(paste("cat",fasta_list,">",name)) + }) +} + + +# run function merge_similar with given parameteres if not in interactive context (e.g. run from shell) +if (!interactive()) { + merge_similar(opt$input, opt$list, opt$min) +} diff --git a/bin/motif_estimation.nf b/bin/2.2_motif_estimation/motif_estimation.nf similarity index 100% rename from bin/motif_estimation.nf rename to bin/2.2_motif_estimation/motif_estimation.nf diff --git a/bin/Modules/CrossMap.py b/bin/3.1_create_gtf/Modules/CrossMap.py similarity index 100% rename from bin/Modules/CrossMap.py rename to bin/3.1_create_gtf/Modules/CrossMap.py diff --git a/bin/Modules/CrossMapper.py b/bin/3.1_create_gtf/Modules/CrossMapper.py similarity index 59% rename from bin/Modules/CrossMapper.py rename to bin/3.1_create_gtf/Modules/CrossMapper.py index d401c79..82beace 100644 --- a/bin/Modules/CrossMapper.py +++ b/bin/3.1_create_gtf/Modules/CrossMapper.py @@ -10,26 +10,49 @@ class CrossMapper: Class to download chain_files for chrossmapping hg38 or mm10 to older assembly versions. Utilizes CrossMap.py. see wiki for more information. 
+ @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ def __init__(self, org, wd, out, is_dir): - self.org = org + + # Constructor for CrossMapper class + # input_parameter: org = input organism + # wd = working directory + # out = path to output-file -> Parameter + # is_dir = boolean if wd is data_dir or just working directory + + # Get path to tempfile / outputfile and chain-file + if is_dir: - self.infile = os.path.join( wd + "/temp/" + org + ".gtf") + self.infile = os.path.join(wd + "/temp/" + org + ".gtf") else: self.infile = os.path.join(wd+"/data/temp/"+org+".gtf") - self.outfile = os.path.join(out+"/" + org + "_mapped.gtf") + self.outfile = os.path.join(out) self.chainfile = self.get_chain_file(org, wd, is_dir) - # Execute Crossmapper for gff/gtf files + + # Execute Crossmap for gff/gtf files + (mapTree, targetChromSizes, sourceChromSizes) = CrossMap.read_chain_file(self.chainfile) + + # Map results and save output to self.outfile + CrossMap.crossmap_gff_file(mapTree, self.infile, self.outfile) - def get_chain_file(self, org, wd, isdir): + def get_chain_file(self, org, wd, is_data_dir): # Defines the Chain files for different conversions. + # input_parameter: org = organism + # wd = working directory + # is_data_dir = is wd data_dir or not + + # return_value: Link to chain-file for conversion. + # Custom chain-files and chain-files for more organism can be specified in this section if org == "hg19": - if isdir: + if is_data_dir: file_link = os.path.join(wd+"temp/hg38tohg19.over.chain.gz") else: file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) @@ -39,7 +62,7 @@ def get_chain_file(self, org, wd, isdir): return file_link elif org == "mm9": - if isdir: + if is_data_dir: file_link = os.path.join(wd+"temp/mm10ToMm9.over.chain.gz") else: file_link = os.path.join(wd + "/data/temp/hg38tohg19.over.chain.gz" ) diff --git a/bin/Modules/Ensembl/ActivityCategorizer.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py similarity index 58% rename from bin/Modules/Ensembl/ActivityCategorizer.py rename to bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py index bd82a92..93b3558 100644 --- a/bin/Modules/Ensembl/ActivityCategorizer.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityCategorizer.py @@ -4,13 +4,27 @@ class ActivityCategorizer: + """ + + Class that categorizes activitydata based on json config and binary activitydata (table.bin). 
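+    The categories and their Ensembl celltype aliases are read from config/celltypes_<organism>.json.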
+ @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ + def __init__(self, release, organism, wd, data_dir): - # List of all Folders with Activity Tables + # Constructor for ActivityCategorizer + # input_parameter: organism = input organism + # release = current used Ensembl release + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + + # List of all folders with activity Tables self.folderlist = [] - # Dictionary from celltypes_organism.json mit key = Kategorie und Value = [Ordner] + # Dictionary from celltypes_organism.json mit key = category and Value = [directory] self.c_dict = self.read_config(organism, wd) @@ -27,10 +41,19 @@ def __init__(self, release, organism, wd, data_dir): print("Categorization finished !") def get_categorization(self): + + # Getter method to return the self.categorization variable + return self.categorization def read_config(self, organism, wd): + # Method to read the celltypes_organism.json config file + # input_parameter: organism = input organism + # wd = working directory to find the config files. + # return_value: Dictionary with ensembl aliases based on config + # -> Key = type (from config), value = list of ensembl aliases + c_dict = {} path_to_config = os.path.join(wd +"/config/celltypes_"+organism+".json") with open(path_to_config) as input_file: @@ -43,6 +66,13 @@ def read_config(self, organism, wd): def get_activity_data(self, release, organism, wd, data_dir): + # Method to read the binary table.bin file and return its content as bytearray + # input_parameter: organism = input organism + # release = current used Ensembl release + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + # return_value: bytearray with activitystatus + for folder in self.folderlist: # Generate path to binary File if data_dir: @@ -53,6 +83,9 @@ def get_activity_data(self, release, organism, wd, data_dir): self.activity[folder] = bytearray(tables.read()) def generate_categorized_activity(self): + + # Categorizes the activity by config defined categories. + category_activity = {} for category, aliases in self.c_dict.items(): @@ -80,10 +113,16 @@ def generate_categorized_activity(self): def activity_comparator(self, aliaslist): + # Method to determine the resulting activitystatus if the entry contains + # multiple differing activitystatus from aliases + # e.g. 
if one alias is ACTIVE and one INACTIVE the result will be ACTIVE -> see wiki for more detailed info + # input_parameter: aliaslist = list of aliases for activity_data + # return_value: Array of Activitystatus by category (type in config) + concatenated_array = bytearray([]) length = len(self.activity[aliaslist[0]]) - input_arrays = [self.activity[x] for x in aliaslist] + input_arrays = [self.activity[index] for index in aliaslist] for x in range(length): if any(y[x] == 0 for y in input_arrays): concatenated_array.append(0) @@ -103,4 +142,4 @@ def activity_comparator(self, aliaslist): # e = ActivityCategorizer("../../config/celltypes_human.json", "release-94", "homo_sapiens") # print(len(e.categorization)) # for x in e.categorization.values(): -# print(len(x)) \ No newline at end of file +# print(len(x)) diff --git a/bin/Modules/Ensembl/ActivityTable.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py similarity index 67% rename from bin/Modules/Ensembl/ActivityTable.py rename to bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py index 288bf7d..1354356 100644 --- a/bin/Modules/Ensembl/ActivityTable.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTable.py @@ -8,19 +8,34 @@ class ActivityTable: Class for checking activity_table and generating them. activityTable = byte Representation of activity status corresponding to the generator schema default: + 0, "activity=ACTIVE", 1, "activity=POISED", 2, "activity=REPRESSED", 3, "activity=INACTIVE", 4, "activity=NA" + + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, current_release, wd, data_dir): + + # Constructor for the ActivityTable-Class + # input_parameter: organism = input organism + # current_release = current used Ensembl release + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + if data_dir: self.link = os.path.join(data_dir + "/EnsemblData/", current_release, organism, "activity") else: self.link = os.path.join(wd + "/data/EnsemblData/", current_release, organism, "activity") self.folders = next(os.walk(self.link))[1] + + # List to represent Index with activitystatus for ATGenerator class + self.generator = ATGenerator(["activity=ACTIVE", "activity=POISED", "activity=REPRESSED", @@ -28,17 +43,29 @@ def __init__(self, organism, current_release, wd, data_dir): "activity=NA"]) def check_and_generate_activity_table(self): - # checks if file already exists and generates new one if missing + + # checks if file (table.bin) already exists for celltype -> generates new one if missing + for subfolder in self.folders: folder_link = os.path.join(self.link, subfolder) sf_link = os.path.join(folder_link, "table.bin") + + # If table.bin is missing: + if not os.path.isfile(sf_link): print("No ActivityTable for "+subfolder+" found, generating new one.") self.generate_table(folder_link) + + # Else: Do nothing + print("All ActivityTables found, proceeding") def generate_table(self, link): + # generates the table and saves it as table.bin file + # input_parameter: link = link to ensembl activity folder for specific celltype + # generates table.bin file in link folder + for root, dirs, files in os.walk(link): for file in files: if file.endswith(".gff.gz"): @@ -47,7 +74,8 @@ def generate_table(self, link): with open(file_path, "wb") as f: f.write(self.generator.read_table(originpath)) print("New ActivityTable generated in: " + root) -# Debug -#e = ActivityTable("homo_sapiens", 
"release-94") -#e.check_and_generate_activity_table() \ No newline at end of file + +# Debug +# e = ActivityTable("homo_sapiens", "release-94") +# e.check_and_generate_activity_table() diff --git a/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py new file mode 100644 index 0000000..aff787a --- /dev/null +++ b/bin/3.1_create_gtf/Modules/Ensembl/ActivityTableGenerator.py @@ -0,0 +1,37 @@ +import gzip + + +class ATGenerator: + + """ + + Reads gzip files for activitydata and generates a bytearray with activitystatus. + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ + + def __init__(self, repre): + + # Constructor with parameter representation: List of keywords with a corresponding index + # Only the index as byte will be appended to a bytearray. + + self.representation = repre + + def read_table(self, file): + + # Reads the activity-ensembl file ("/data/EnsemblData/release/organism/activity/*") + # and returns the activity as bytearray, based on the representation in self.representation. + # Only the index is saved -> see ActivityTable.py for current representation. + + activity_table = [] + with gzip.open(file, 'rb') as f: + for line in f: + for index, re in enumerate(self.representation): + if re in str(line): + activity_table.append(index) + break + return bytearray(activity_table) + + + diff --git a/bin/Modules/Ensembl/Ensembl.py b/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py similarity index 61% rename from bin/Modules/Ensembl/Ensembl.py rename to bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py index 88adeb1..8ab23ac 100644 --- a/bin/Modules/Ensembl/Ensembl.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/Ensembl.py @@ -7,23 +7,41 @@ class Ensembl: """ + Main class for handling Ensembl Regulatory data Checks for local files and downloads if files are missing + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, wd, data_dir): + + # Constructor and main method for Ensembl-GTF-Creation + # input_parameter: organism = input organism + # wd = working directory + # data_dir = use data_dir parameter if specified. + print("Starting Ensembl") + # Check and Update for Local Ensembl Release Data self.updater = FTPRetriever(organism, wd, data_dir) self.release = self.updater.get_release() + # Check for Activitytables (table.bin binary files) and generate if not existing self.acttable = ActivityTable(organism, self.release, wd, data_dir) self.acttable.check_and_generate_activity_table() + # Categorize the Activitytable by config defined categories (config: ./config/celltypes_organism.json) self.categorizer = ActivityCategorizer(self.release, organism, wd, data_dir) print("Generating GTF") + # Instatiate self.gtf_generator = GTFGen(organism, self.release, wd, data_dir) print("Ensembl Finished !") def get_gtf(self): + + # Getter Method for resulting GTF-Entries as List. + # return_value: list of gtf entries. + return self.gtf_generator.get_gtf(self.release, self.categorizer.get_categorization()) #e = Ensembl("homo_sapiens") diff --git a/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py new file mode 100644 index 0000000..531fd18 --- /dev/null +++ b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/FTPEntry.py @@ -0,0 +1,54 @@ +import ftplib + + +class FTPEntry: + + """ + + Class to determine if a ftp-path is file or directory. 
+ Assigns every filename a parameter ('d' = directory or 'f' = file) + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ + + def __init__(self, filename, ftpobj, startingdir=None): + + # Constructor + # input_parameter: filename = the files/directorys name + # ftpobj = Current Instance of FTPLib from URLRetrieve + # startingdir = optional parameter for ftp starting directory (to reduce browsing in ftp) + + self.filename = filename + if startingdir is None: + startingdir = ftpobj.pwd() + + # Try if "filename" is directory + + try: + ftpobj.cwd(filename) + self.filetype = 'd' + ftpobj.cwd(startingdir) + + # If error_perm occurs filename is file not directory + + except ftplib.error_perm: + self.filetype = 'f' + + def gettype(self): + + # Getter method for filetype + + return self.filetype + + def getfilename(self): + + # Getter method for filenname + + return self.filename + + def __repr__(self): + + # Change the represenation scheme for FTPEntry object. + + return self.filename, self.filetype diff --git a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/URLRetrieve.py similarity index 50% rename from bin/Modules/Ensembl/FTPHandling/URLRetrieve.py rename to bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/URLRetrieve.py index 6b45598..ee4ce83 100644 --- a/bin/Modules/Ensembl/FTPHandling/URLRetrieve.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/URLRetrieve.py @@ -5,30 +5,58 @@ class FTPHandler: """ - Class to browse through ftp folders and download entries to local file + Class to browse through ftp folders and download entries to local file + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, url, wd): + + # Constructor + # input_parameters: wd = woking directory + # url = Url where to browse (in this case ftp.ensembl.org) + self.ftp = ftplib.FTP(url) self.ftp.login() self.ftp.cwd(wd) - def change_dir(self, wd): - self.ftp.cwd(wd) + def change_dir(self, to_dir): + + # Change ftp current working directory to parameter to_dir + + self.ftp.cwd(to_dir) def get_all_entries(self): + + # Get all ftp entries at current working directory + return self.ftp.nlst() def get_all_entries_from_dir(self, dire): + + # Helper method to get all entries in directory (dire) + # input_parameter: dire = directory where to get all entries + # return value: list of all entries + self.change_dir(dire) return self.get_all_entries() def get_all_entries_as_FTPEntry(self): - # Get All Files + + # Get All Files as FTPEntry + # returns list of FTPEntry objects, these objects are helper objects for easier determination if a + # FTP-Entry is file or Folder + files = self.ftp.nlst() return [FTPEntry(item, self.ftp, self.ftp.pwd()) for item in files] def save_entries_to_file(self, origin, target): + + # Method to save targeted entries to file + # input_parameter: origin = Origin FTP-Entries + # target = directory, where to save the files. 
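+        # Usage sketch (illustrative, remote folder is a placeholder):
+        #   handler = FTPHandler("ftp.ensembl.org", "pub")
+        #   handler.save_entries_to_file("<remote activity folder>", "/local/target/folder")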
+ self.change_dir(origin) for file in self.get_all_entries_as_FTPEntry(): if file.gettype() == "f": diff --git a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/VersionChecker.py similarity index 68% rename from bin/Modules/Ensembl/FTPHandling/VersionChecker.py rename to bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/VersionChecker.py index d4c1ca6..69edef5 100644 --- a/bin/Modules/Ensembl/FTPHandling/VersionChecker.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/VersionChecker.py @@ -5,11 +5,20 @@ class EnsemblRegulationFTPRetriever: """ - Class for checking current version locally and remote on ftp. - And downloading newest version if necessary + Class for checking current version locally and remote on ftp. + And downloading newest version if necessary + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, wd, data_dir): + + # Constructor: + # input_parameter: organism = input organism + # wd = working dir (default working directory, data_dir is used if specified) + # data_dir = data directory (this is used as directory if specified) + self.site_ftp = FTPHandler("ftp.ensembl.org", "pub") self.remoteversion = self.get_current_ftp_version() self.localversion = self.get_current_local_version(wd, data_dir) @@ -19,9 +28,16 @@ def __init__(self, organism, wd, data_dir): print("Newest Version installed, no update needed.") def get_release(self): + + # Getter method for release version from FTP. + return self.remoteversion def get_current_ftp_version(self): + + # Gets the current ftp-version from ftp.ensembl.org + # return_value: string for current release on FTP + entries = self.site_ftp.get_all_entries() versionlist = [] for entry in entries: @@ -32,6 +48,11 @@ def get_current_ftp_version(self): return c_release def check_organism(self, organism, release, wd, data_dir): + + # Check if organism is locally existing + # input_parameter: as in __init__ + # return_value: Boolean if data locally exists or not + if data_dir: if organism in next(os.walk(os.path.join(data_dir+"/EnsemblData/"+release+"/")))[1]: return False @@ -46,6 +67,11 @@ def check_organism(self, organism, release, wd, data_dir): return True def get_current_local_version(self, wd, data_dir): + + # Method to check for the current local version + # input_parameters: wd, data_dir as in __init__() + # return_value: String for local release_version or if not existing None + if data_dir: directories = next(os.walk(os.path.join(data_dir + "/EnsemblData/")))[1] else: @@ -63,6 +89,10 @@ def get_current_local_version(self, wd, data_dir): def check_version_difference(self, organism, wd, data_dir): + # Method to check if local version is differing from remote version + # input_parameters: wd, data_dir, organism as in __init__() + # return_value: Boolean if the version is differing or not + local_version = self.localversion remote_version = self.remoteversion if local_version is None: @@ -81,6 +111,12 @@ def check_version_difference(self, organism, wd, data_dir): def download_currentversion_version(self, version, organism, wd, data_dir): + # Method to download current version from FTP if local version is not up-to-date + # input_parameters: version = version to download + # organism = input organism + # wd = working directory + # data_dir = data directory + # Download Base File if data_dir: targetfolder = os.path.join(data_dir + "/EnsemblData/", version, organism) @@ -93,13 +129,13 @@ def download_currentversion_version(self, 
version, organism, wd, data_dir): # Download Regulation Activity - activityfolder_local = os.path.join(targetfolder, "activity") # local Folder for Activity Data - activityfolder_remote = folder_url+"RegulatoryFeatureActivity/" # remote (ftp) folder for activity data + activityfolder_local = os.path.join(targetfolder, "activity") # local Folder for Activity Data + activityfolder_remote = folder_url+"RegulatoryFeatureActivity/" # remote (ftp) folder for activity data os.mkdir(activityfolder_local) # Create New Folder + # Get List for all entries in activity Folder + celltypes_list = self.site_ftp.get_all_entries_from_dir(activityfolder_remote) - celltypes_list = self.site_ftp.get_all_entries_from_dir(activityfolder_remote) # Get List for all entries in activity Folder - - # Iterate over Celltype List and Download in corresponding subfolder + # Iterate over celltype List and Download in corresponding subfolder for celltype in celltypes_list: link_local = os.path.join(activityfolder_local, celltype) @@ -108,4 +144,5 @@ def download_currentversion_version(self, version, organism, wd, data_dir): self.site_ftp.save_entries_to_file(link_origin, link_local) -#e = EnsemblRegulationFTPRetriever("mus_musculus") \ No newline at end of file +# Debug section +# e = EnsemblRegulationFTPRetriever("mus_musculus") diff --git a/bin/Modules/Ensembl/FTPHandling/__init__.py b/bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/__init__.py similarity index 100% rename from bin/Modules/Ensembl/FTPHandling/__init__.py rename to bin/3.1_create_gtf/Modules/Ensembl/FTPHandling/__init__.py diff --git a/bin/Modules/Ensembl/GTFGen.py b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py similarity index 59% rename from bin/Modules/Ensembl/GTFGen.py rename to bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py index ca57aa8..eaf88f6 100644 --- a/bin/Modules/Ensembl/GTFGen.py +++ b/bin/3.1_create_gtf/Modules/Ensembl/GTFGen.py @@ -1,24 +1,37 @@ import os import gzip -import csv class GTFGen: - """ Class to generate Ensembl GTF-data with activity + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, organism, release, wd, data_dir): + # Constructor for GTFGen + # input_parameter: organism = input organism + # release = used Ensembl release + # wd = working directory (default is "."), this is used if data_dir is not specified. + # data_dir = data directory (if specified this is used instead of wd) + self.gff_lines = self.get_organism_as_gff(organism, release, wd, data_dir) + + # Map to assign numbers from Activitytable-binary to activity status + self.value_map = {0: "ACTIVE", 1: "POISED", 2: "REPRESSED", 3: "INACTIVE", 4: "NA"} def get_organism_as_gff(self, organism, release, wd, data_dir): # reads the original gff file for organism + # input_parameter as in __init__ described. 
+ # return_value: list of gff-entries + if data_dir: directory = os.path.join(data_dir + "/EnsemblData/", release, organism) else: @@ -31,9 +44,12 @@ def get_organism_as_gff(self, organism, release, wd, data_dir): with gzip.open(inputfile) as original_file: return original_file.readlines() - def reformat_to_gff(self, activity, release): + def reformat_to_gtf(self, activity, release): # Reformats gff to gtf and appends activity-data for config specified celltype-categories + # input_parameter: activity = list of activity status for all genes + # release = current ensembl release + # return_value: List of gtf-formatted entries gtf_return = [] @@ -51,7 +67,7 @@ def reformat_to_gff(self, activity, release): # Add RegBuild_ + release templist.append("RegBuild_"+release) # Add Description from Description in last ; separated segment - templist.append(splitted_additional[4].split("=")[1].lower()) + templist.append(splitted_additional[4].split("=")[1].lower().replace(' ', '_')) # Add Start / End Data from original templist.extend(splitted[3:5]) # Add Score, Strand and Frame Data @@ -68,24 +84,39 @@ def reformat_to_gff(self, activity, release): @staticmethod def generate_additional_information(gene_id, activity): + # helper method to concat activity information to string and reformat from gff to gtf-style + # input_parameter: gene_id = gene_id formatted in gff format + # activity = List of activity-data for specified gene + # return_value: String for attributes (column 9) in gtf-format + if gene_id.startswith("ID=regulatory_region:"): - gene_id = 'ID "'+gene_id.split(':')[1]+'"' + gene_id = 'gene_id "'+gene_id.split(':')[1]+'"' + elif gene_id.startswith("ID=E"): + gene_id = 'gene_id "'+gene_id.split('=')[1]+'"' activity_string = 'activity "'+', '.join(activity)+'"' - # helper method to concat activity information to string - return gene_id+'; '+activity_string + return gene_id+'; '+activity_string+';' def generate_activity_list(self, activity, index): - # generates activity list + + # generates activity list for a specified index + # input_parameter: index = index for a specified gene + # activity = List of activity-data for all entries + # return_value: List of activity for gene at index + activity_list = [] for key, value in activity.items(): activity_list.append(key+">"+self.value_map[value[index]]) return activity_list def get_gtf(self, release, activity): - # returns the resulting gtf-formatted-list - return self.reformat_to_gff(activity, release) + + # getter function for the resulting gtf-formatted-list + # input_parameters: release, activity as in self.reformat_to_gtf() + # return_value: List of GTF-Entries + + return self.reformat_to_gtf(activity, release) diff --git a/bin/Modules/Ensembl/__init__.py b/bin/3.1_create_gtf/Modules/Ensembl/__init__.py similarity index 100% rename from bin/Modules/Ensembl/__init__.py rename to bin/3.1_create_gtf/Modules/Ensembl/__init__.py diff --git a/bin/3.1_create_gtf/Modules/SaveResults.py b/bin/3.1_create_gtf/Modules/SaveResults.py new file mode 100644 index 0000000..e0b39e1 --- /dev/null +++ b/bin/3.1_create_gtf/Modules/SaveResults.py @@ -0,0 +1,42 @@ +import os + + +class ResultSaver: + + """ + + Class to save the results. The output is saved to the temp directory in the data folder if crossmapping is necessary. 
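+    If no crossmapping is needed, the entries are written directly to the given output path.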
+ + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + + """ + + def __init__(self, results, organism, wd, mapped, is_data_dir, out): + + # Constructor and main method for result-saving + # input_parameter: results = finished list of gtf-entries + # organism = input_organism + # wd = working directory + # mapped = boolean if crossmapping is necessary + # is_data_dir = boolean if wd is a data_dir (true) or not (false) + + print("Save results to File !") + self.path = "" + + # If results has to be crossmapped -> Path is tempdirectory. + + if mapped: + if is_data_dir: + self.path = os.path.join(wd + "/temp/" + organism + ".gtf") + else: + self.path = os.path.join(wd + "/data/temp/" + organism + ".gtf") + + # if no crossmapping is needed: + else: + self.path = os.path.join(out) + + with open(self.path, "w") as output_file: + for line in results: + output_file.write("\t".join(line)+"\n") diff --git a/bin/Modules/Uniquifier.py b/bin/3.1_create_gtf/Modules/Uniquifier.py similarity index 52% rename from bin/Modules/Uniquifier.py rename to bin/3.1_create_gtf/Modules/Uniquifier.py index 16d5538..89890c8 100644 --- a/bin/Modules/Uniquifier.py +++ b/bin/3.1_create_gtf/Modules/Uniquifier.py @@ -1,23 +1,38 @@ class UniqueFilter: - """ Class to get unique GTF-results, filtered by specified cell-/tissuetypes + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + """ def __init__(self, ense, ucsc, org_filter=None): + + # Constructor + # input_parameter: ense = list of gtf-formatted entries from Ensembl data + # ucsc = list of gtf-formatted entries from UCSC data + # org_filter = filter for specific celltype + self.results = self.get_filtered_results(org_filter, ense, ucsc) def get_results(self): + + # Getter method for results variable + return self.results def get_filtered_results(self, org_filter, ense, ucsc): - # Apply Filter + # Method to concat ucsc and ensemble dataset without duplicates and filter by activitylist + # input_parameter: ense = list of gtf-formatted entries from Ensembl data + # ucsc = list of gtf-formatted entries from UCSC data + # org_filter = filter for specific celltype + # return_value: List of unique (filtered) results. - unfiltered_results = self.concat_without_duplicates(ense, ucsc) - if org_filter: + unfiltered_results = self.concat_without_duplicates(ense, ucsc) # First: Concat ucsc and ensembl data + if org_filter: # Second: apply filter if specified filterstrings = [x+">ACTIVE" for x in org_filter] return_list = [] for element in unfiltered_results: @@ -32,6 +47,8 @@ def get_filtered_results(self, org_filter, ense, ucsc): def concat_without_duplicates(ense, ucsc): # Concat UCSC and Ensembl data without duplicates + # input_parameter: ense = ensembl-gtf-data and ucsc = ucsc-gtf-data + # return_value: concatinated list of gtf-entries without duplicates results = ense+ucsc for ens in ense: diff --git a/bin/3.1_create_gtf/Modules/Validator.py b/bin/3.1_create_gtf/Modules/Validator.py new file mode 100644 index 0000000..ee34471 --- /dev/null +++ b/bin/3.1_create_gtf/Modules/Validator.py @@ -0,0 +1,35 @@ +class Validator: + + """ + Class to validate the gtf-output-file. 
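+    It checks whether the first line of the output file consists of nine tab-separated GTF columns.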
+ @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de + + """ + + def __init__(self, out_file): + + # Constructor + # input_parameter: out_file = path to output file + + self.out_file = out_file + self.test_read_file() + + def test_read_file(self): + + # Method to test the output file-format + + with open(self.out_file) as outfile: + line = outfile.readline() + + if line: + if len(line.split("\t")) == 9: + print("Validation complete no errors found !") + else: + print("Incorrect output Format: Try again after deleting data folder.") + else: + print("Outputfile not in correct Format: Try again after deleting data Folder.") + exit(1) + +# Debug +# v = Validator("/home/basti/Schreibtisch/test_hg38.gtf") diff --git a/bin/Modules/__init__.py b/bin/3.1_create_gtf/Modules/__init__.py similarity index 100% rename from bin/Modules/__init__.py rename to bin/3.1_create_gtf/Modules/__init__.py diff --git a/bin/Modules/ucsc/__init__.py b/bin/3.1_create_gtf/Modules/ucsc/__init__.py similarity index 100% rename from bin/Modules/ucsc/__init__.py rename to bin/3.1_create_gtf/Modules/ucsc/__init__.py diff --git a/bin/Modules/ucsc/bigBedToBed b/bin/3.1_create_gtf/Modules/ucsc/bigBedToBed similarity index 100% rename from bin/Modules/ucsc/bigBedToBed rename to bin/3.1_create_gtf/Modules/ucsc/bigBedToBed diff --git a/bin/Modules/ucsc/ucsc.py b/bin/3.1_create_gtf/Modules/ucsc/ucsc.py similarity index 52% rename from bin/Modules/ucsc/ucsc.py rename to bin/3.1_create_gtf/Modules/ucsc/ucsc.py index a554a4d..23aaf91 100644 --- a/bin/Modules/ucsc/ucsc.py +++ b/bin/3.1_create_gtf/Modules/ucsc/ucsc.py @@ -8,17 +8,32 @@ class UcscGtf: """ - Class to gather ucsc refSeq-FuncElem data. + + Class to gather data from UCSC Table Browsers, RefFuncGen Tracks. + @author: Sebastian Beyvers + @contact: sebastian.beyvers@med.uni-giessen.de """ def __init__(self, org, wd, data_dir): + + # Constructor for UcscGtf + # input parameter: org = organism + # wd = working directory + # data_dir = data_directory (if defined this is used to save and get data) + self.organism_id = self.get_organism_id(org) + + # FTPlink to UCSC bigbed File + self.link = "http://hgdownload.soe.ucsc.edu/gbdb/"+self.organism_id+"/ncbiRefSeq/refSeqFuncElems.bb" + + # Where to save the output file if data_dir: self.output = os.path.join(data_dir + "/UCSCData" + self.organism_id+".bed") else: self.output = os.path.join(wd + "/data/UCSCData/" + self.organism_id+".bed") + # Determine path to bigBedToBed binary. self.path_to_bin = os.path.join(wd + "/Modules/ucsc/bigBedToBed") print("Getting UCSC Data") print("Path to Bin: " + self.path_to_bin) @@ -28,44 +43,61 @@ def __init__(self, org, wd, data_dir): print("UCSC finished !") def generate_gff_file(self): - # Call bigBedToBed binary to get a Bed file in the UCSCData folder + + # Call bigBedToBed binary to get a BED-file in the UCSCData folder + callstring = [self.path_to_bin, self.link, self.output] subprocess.call(callstring) def read_gff_to_gtf(self): - # Reads Bed File and return a gtf-formatted list of elements. + + # Reads BED-file and return a GTF-formatted list of elements. 
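+        # Illustrative entry (one list of nine GTF fields per BED row, made-up values):
+        #   ["chr1", "UCSC", "enhancer", "11873", "14409", ".", "+", ".",
+        #    'gene_id "12345"; activity "...";']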
+ # return_value: GTF-formatted List of regulation entries from UCSC + gtf_lines = [] with open(self.output, 'r') as csvfile: tsvreader = csv.reader(csvfile, delimiter='\t') for row in tsvreader: - sequence = [] - sequence.append(row[0]) - sequence.append("UCSC") - sequence.append(row[3].lower()) - sequence.append(row[1]) - sequence.append(row[2]) - sequence.append(".") - sequence.append(row[5]) - sequence.append(".") - sequence.append('; '.join([self.find_ID(''.join(row[11:])), 'activity \"'+", ".join(self.get_activity(''.join(row[11:]))) + '"'])) - gtf_lines.append(sequence) + if row[9] not in ["region", "sequence_feature", + "CAAT_signal", "stem_loop", + "sequence_secondary_structure"]: + + sequence = [] + sequence.append(row[0]) + sequence.append("UCSC") + sequence.append(row[9].lower().replace(' ', '_')) + sequence.append(row[1]) + sequence.append(row[2]) + sequence.append(".") + sequence.append(row[5]) + sequence.append(".") + sequence.append('; '.join([self.find_ID(''.join(row[11:])), 'activity \"'+", ".join(self.get_activity(''.join(row[11:]))) + '"'])+";") + gtf_lines.append(sequence) return gtf_lines def find_ID(self, line): + # Find RefSeq ID in Line + # input_parameter: line = current line from BED-file + # return_value: string with gene_id in GTF-format + pattern = re.compile(r'ID:[0-9]{,9}|$') ref_id = re.search(pattern, line).group() splitted = ref_id.split(":") if len(splitted) == 2: - returnstring = str(splitted[0])+' "'+str(splitted[1])+'"' + returnstring = 'gene_id "'+str(splitted[1])+'"' else: - returnstring = 'ID '+'"NA"' + returnstring = 'gene_id "NA"' return returnstring def get_activity(self, line): - # Find activity categories in bed file + + # Find activity categories in BED-file + # input_parameter: line = current line from BED-file + # return_value: list with activity for specified line("keystatus") + key_status = [] for key, value in self.ucsc_categories.items(): if value: @@ -79,7 +111,11 @@ def get_activity(self, line): @staticmethod def get_organism_id(org): + # convert intern name e.g. "homo_sapiens" to ucsc name "hg38". + # input_parameter: org = organism parameter + # return_value: UCSC alias for this organism [ mm10 | hg38 ] + if org == "homo_sapiens": return "hg38" elif org == "mus_musculus": @@ -87,7 +123,12 @@ def get_organism_id(org): @staticmethod def get_activity_categories(organism, wd): + # Method to get ucsc-celltype categories from JSON config + # input_parameter: organism = organism parameter + # wd = working directory, to find config file + # return_value: List of categories from config. + path_to_config = os.path.join(wd+"/config/celltypes_" + organism + ".json") categories = {} with open(path_to_config) as input_file: @@ -98,4 +139,8 @@ def get_activity_categories(organism, wd): return categories def get_gtf(self): + + # Getter method for resulting gtf-lines + # return_value: List of gtf-formatted Strings (Lines) + return self.gtf_lines diff --git a/bin/RegGTFExtractor.py b/bin/3.1_create_gtf/RegGTFExtractor.py old mode 100755 new mode 100644 similarity index 82% rename from bin/RegGTFExtractor.py rename to bin/3.1_create_gtf/RegGTFExtractor.py index 355d4da..6bdd251 --- a/bin/RegGTFExtractor.py +++ b/bin/3.1_create_gtf/RegGTFExtractor.py @@ -1,7 +1,9 @@ +#!/usr/bin/env python + """ RegGTFExtractor.py extracts regulatory-data from Ensembl and UCSC databases -and converts output to GTF-formatted file. +and converts the output to a GTF-formatted file. 
@author: Sebastian Beyvers @contact: sebastian.beyvers@med.uni-giessen.de @@ -14,6 +16,7 @@ from Modules.Uniquifier import UniqueFilter from Modules.SaveResults import ResultSaver from Modules.CrossMapper import CrossMapper +from Modules.Validator import Validator import os import json @@ -21,6 +24,8 @@ def check_for_local_folder(wd): # Check if local folder exists and create if missing when no data_dir is specified + # input_parameter: wd = working directory + # return_value: None if not os.path.isdir(os.path.join(wd+"/data/")): @@ -40,6 +45,8 @@ def check_for_local_folder(wd): def check_for_data_dir(data_dir): # Check if local folder exists and create if missing when data_dir as parameter is specified + # input_parameter: data_dir = data directory + # return_value: None if not os.path.isdir(os.path.join(data_dir)): os.mkdir(os.path.join(data_dir)) @@ -57,6 +64,8 @@ def check_for_data_dir(data_dir): def check_filter(tissue_cmd, org, wd): # Checks if filter-celltype is in Json types for organism + # input_parameter: tissue_cmd = filtered tissue types; org = organism; wd = working directory + # return_value: boolean if selected filter is in config path_to_config = os.path.join(wd + "/config/celltypes_" + org + ".json") tissues_config = [] @@ -77,6 +86,8 @@ def check_filter(tissue_cmd, org, wd): def check_organism(org): # Checks the organism input and decides if chrossmapping is necessary + # input_parameter: org = input organism (parameter) + # return_value: tuple with values = (organism_alias (string), boolean if crossmapping is needed) if org == "hg38": return "homo_sapiens", False @@ -92,7 +103,12 @@ def check_organism(org): def main_script(organism, wd, data_dir, out, tissuetype=None): - # Main function + # main function + # input_parameter: all parameters from argparse + + # If no output parameter is given, the output file is "./organism.gtf" + if out == ".": + out = "./"+organism+".gtf" (org, x_mappable) = check_organism(organism) if not data_dir: @@ -113,25 +129,29 @@ def main_script(organism, wd, data_dir, out, tissuetype=None): print("Getting Unique Results") unique_filter = UniqueFilter(ense.get_gtf(), ucsc.get_gtf(), tissues) if data_dir: - ResultSaver(unique_filter.get_results(), organism, data_dir, x_mappable, True, tissues, out) + ResultSaver(unique_filter.get_results(), organism, data_dir, x_mappable, True, out) if x_mappable: CrossMapper(organism, data_dir, out, True) else: - ResultSaver(unique_filter.get_results(), organism, wd, x_mappable, False, tissues, out) + ResultSaver(unique_filter.get_results(), organism, wd, x_mappable, False, out) if x_mappable: CrossMapper(organism, wd, out, False) + # Validate the output file + + Validator(out) + if __name__ == '__main__': - # Argumentparser + # argument parser parser = argparse.ArgumentParser(description='GTF-Generator from UCSC Table Browser and Ensembl Regulatory Build' ) parser.add_argument('organism', help='Source organism [ hg19 | hg38 or mm9 | mm10 ]', action='store', nargs='?', type=str) parser.add_argument('--tissue', help='Tissue- or Celltype(s)', action='store', nargs='*', type=str) parser.add_argument('--wd', help='Working directory. default: "."', action='store', default=os.getcwd(), type=str) parser.add_argument('--dir', help='Data directory. 
default: "working_directory"', action='store', default="", type=str) - parser.add_argument('--out', help='Output directory: default: "."', action='store', default=".", type=str) + parser.add_argument('--out', help='Path to output file: default: "./organism.gtf"', action='store', default=".", type=str) args = vars(parser.parse_args()) # Check if organism exists diff --git a/bin/config/celltypes_homo_sapiens.json b/bin/3.1_create_gtf/config/celltypes_homo_sapiens.json similarity index 100% rename from bin/config/celltypes_homo_sapiens.json rename to bin/3.1_create_gtf/config/celltypes_homo_sapiens.json diff --git a/bin/config/celltypes_mus_musculus.json b/bin/3.1_create_gtf/config/celltypes_mus_musculus.json similarity index 100% rename from bin/config/celltypes_mus_musculus.json rename to bin/3.1_create_gtf/config/celltypes_mus_musculus.json diff --git a/bin/Modules/Ensembl/ActivityTableGenerator.py b/bin/Modules/Ensembl/ActivityTableGenerator.py deleted file mode 100644 index f110236..0000000 --- a/bin/Modules/Ensembl/ActivityTableGenerator.py +++ /dev/null @@ -1,25 +0,0 @@ -import gzip - - -class ATGenerator: - - """ - Reads saved activity binary files (table.bin) - """ - - def __init__(self, repre): - - self.representation = repre - - def read_table(self, file): - activity_table = [] - with gzip.open(file, 'rb') as f: - for line in f: - for index, re in enumerate(self.representation): - if re in str(line): - activity_table.append(index) - break - return bytearray(activity_table) - - - diff --git a/bin/Modules/Ensembl/FTPHandling/FTPEntry.py b/bin/Modules/Ensembl/FTPHandling/FTPEntry.py deleted file mode 100644 index 0d7bdc3..0000000 --- a/bin/Modules/Ensembl/FTPHandling/FTPEntry.py +++ /dev/null @@ -1,28 +0,0 @@ -import ftplib - - -class FTPEntry: - - """ - Class to determine if a ftp-path is file or directory. - """ - - def __init__(self, filename, ftpobj, startingdir=None): - self.filename = filename - if startingdir is None: - startingdir = ftpobj.pwd() - try: - ftpobj.cwd(filename) - self.filetype = 'd' - ftpobj.cwd(startingdir) - except ftplib.error_perm: - self.filetype = 'f' - - def gettype(self): - return self.filetype - - def getfilename(self): - return self.filename - - def __repr__(self): - return self.filename, self.filetype diff --git a/bin/Modules/Ensembl/checksums.py b/bin/Modules/Ensembl/checksums.py deleted file mode 100644 index 8f8b92a..0000000 --- a/bin/Modules/Ensembl/checksums.py +++ /dev/null @@ -1,23 +0,0 @@ -import hashlib - -# Python implementation of linux sum (BSD 16-bit Checksum) commandline tool. - -""" -Unused script with checksum implementations in Python -""" - -def bsdchecksum(infile): - with open(infile, 'rb') as f: - file_bytes = f.read() - c_sum = 0 - for char in file_bytes: - c_sum = (c_sum >> 1) + ((c_sum & 1) << 15) - c_sum += char - c_sum &= 0xffff - return c_sum - - -def md5_checksum(infile): - with open(infile, 'rb') as f: - file_bytes = f.read() - return hashlib.md5(file_bytes).hexdigest() diff --git a/bin/Modules/SaveResults.py b/bin/Modules/SaveResults.py deleted file mode 100644 index 66c2d97..0000000 --- a/bin/Modules/SaveResults.py +++ /dev/null @@ -1,29 +0,0 @@ -import os - - -class ResultSaver: - - """ - - class to save the results. Path is dependent on the data_dir, tissuetype and mapped = True or False. - The output is saved to the temp directory in the data folder if crossmapping is necessary. 
- - """ - - def __init__(self, results, organism, wd, mapped, is_data_dir, tissue, out): - - print("Save results to File !") - self.path = "" - if mapped: - if is_data_dir: - self.path = os.path.join(wd + "/temp/" + organism + ".gtf") - else: - self.path = os.path.join( wd + "/data/temp/" + organism + ".gtf" ) - elif tissue: - self.path = os.path.join(out+"/"+organism+"_filtered.gtf") - else: - self.path = os.path.join(out+"/"+organism+".gtf") - - with open(self.path, "w") as output_file: - for line in results: - output_file.write("\t".join(line)+"\n") diff --git a/bin/bed_to_fasta.R b/bin/bed_to_fasta.R deleted file mode 100644 index 84910f3..0000000 --- a/bin/bed_to_fasta.R +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env Rscript - -# Splitting BED-files depending on their cluster. -# The Sequences of each cluster are writen as an FASTA-file. -# @parameter bedInput BED-file with sequences and cluster-id as columns: Sequence: Column 7; ID:Column 8 -# @parameter prefix prefix for filenames -# @parameter min_seq min. number of sequences per cluster - -args = commandArgs(trailingOnly = TRUE) - -bedInput <- args[1] -prefix <- args[2] -min_seq <- args[3] - -bed <- data.table::fread(bedInput, header = FALSE, sep = "\t") - -clusters <- split(bed, bed$V11, sorted = TRUE, flatten = FALSE) # <---- Cluster column -discard <- lapply(1:length(clusters), function(i){ - clust <- as.data.frame(clusters[i]) - print(nrow(clust)) - if (nrow(clust) >= as.numeric(min_seq) ) { - sequences <- as.list(clust[[10]]) # <---- sequenze column - outfile <- paste0(prefix,"_cluster_",i,".FASTA") - seqinr::write.fasta(sequences = sequences, names = clust[[4]], file.out = outfile, as.string = TRUE) # <---- Name column - } else { - print(paste0("Cluster: ",i," is to small")) - } -}) diff --git a/bin/compareBed.sh b/bin/compareBed.sh deleted file mode 100644 index db58a2c..0000000 --- a/bin/compareBed.sh +++ /dev/null @@ -1,275 +0,0 @@ -#!/bin/bash -#data -#motifs -#workdir -#fasta -#min -#max -#output -wrong=() -da=false -mo=false -wo=false -fa=false -mi=false -ma=false -ou=false -he=false - -if [ $# -eq 0 ] -then - he=true -fi - -while [[ $# -gt 0 ]] -do -key="$1" - -case $key in - -d|--data) - data="$2" - da=true - shift - shift - ;; - -m|--motifs) - motifs="$2" - mo=true - shift - shift - ;; - -w|--workdir) - workdir="$2" - wo=true - shift - shift - ;; - -f|--fasta) - fasta="$2" - fa=true - shift - shift - ;; - -min|--min) - min=$2 - mi=true - shift - shift - ;; - -max|--max) - max=$2 - ma=true - shift - shift - ;; - -o|--output) - output="$2" - ou=true - shift - shift - ;; - -p|--path) - path="$2" - shift - shift - ;; - -h|--help) - help=true - he=true - shift - ;; - *) - wrong+=("$1") - shift - ;; -esac -done - -count=${#wrong[@]} -if [ $count -gt 0 ] -then - for i in ${wrong[@]} - do - echo wrong parameter $i - echo call script without parameters for help or call --help - echo exit - done -exit 1 -fi - -if [ $he == true ] -then - echo "This script utilies bedtools to select new footprints from data." - echo "Therefore the data is compared with known footprint motifs." - echo "The output is a new .bed file with added sequence information and a column with flags for better sequences (1)" - echo "Required arguments are data and motifs, both in bed-format, and the fasta genome sequences." - echo "If a parameter is chosen, a value must be provided or an error will occur." - echo "--------------------" - echo "usage: compareBed.sh --data --motifs --fasta [optional_parameter ] ..." 
- echo "--------------------" - echo " required parameter:" - echo " -d --data the path to the .bed file of the footprints" - echo " -m --motifs Either the path to the directory where the .bed files of the scanned motifs are stored" - echo " Or a comma seperated list of files, e.g.: path1/file1,path2/file2,path3/file3" - echo " -f --fasta the path to the .fasta file of genome" - echo " " - echo " optional parameter:" - echo " -w --workdir the path to directory where temporary files will be created" - echo " default is the current directory" - echo " -min --min minimum size of footprints; default is 10" - echo " -max --max maximum size of footprints; default is 80" - echo " -o --output output path/file ; default dir is workdir and filename is newMotifs.bed and newMotifs.bed.fasta" - echo " -h --help shows this help message" -exit 0 -fi - -echo selected parameters -echo ------------------- -echo data: $da \(required\) -echo motifs: $mo \(required\) -echo workdir: $wo -echo fasta: $fa \(required\) -echo min: $mi -echo max: $ma -echo output: $ou -echo help: $he - -if [ $da == false ] || [ $mo == false ] || [ $fa == false ] -then - echo required parameters not given. - echo required are: --data \ --motifs \ --fasta \ - exit 1 -fi - -if [ $wo == false ] -then - #if [ ! -d workdir1337 ] - #then - # mkdir workdir1337 - #fi - wo=true - workdir=$path -fi - -if [ $ou == false ] -then - output=${workdir}/"newMotifs.bed" - ou=true -fi - -if [ $mi == false ] -then - min=10 - mi=true -fi - -if [ $ma == false ] -then - max=80 - ma=true -fi - -if [ ! -d $workdir ] -then - mkdir $workdir -fi - -#1. first filter. no overlap vs. overlap -echo get sequences with no overlap -cat $data > "$workdir"/pass1Tr.bed -help=true - -if [ -d "$motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -else -declare -a motiffiles=(`echo $motifs | sed 's/,/ /g'`) -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools intersect -v -a "$workdir"/pass1Tr.bed -b $i > "$workdir"/pass1TrHelp.bed - else - help=true - bedtools intersect -v -a "$workdir"/pass1TrHelp.bed -b $i > "$workdir"/pass1Tr.bed - fi - echo $i -done -fi - -if [ $help == false ] -then - cat "$workdir"/pass1TrHelp.bed > "$workdir"/pass1Tr.bed -fi -echo get overlapping sequences -bedtools intersect -v -a $data -b "$workdir"/pass1Tr.bed > "$workdir"/pass1Fa.bed -cat "$workdir"/pass1Fa.bed | wc -l - -echo calc maxScore -#2. 
compute absolut maxscore position - -Rscript --vanilla $path/maxScore.R "$workdir"/pass1Fa.bed - -#3 subtract overlapping regions for additional motifs -#echo "also get subsequences with no overlap of overlapping sequences" -help=true - -if [ -d "$motifs" ] -then -for i in "$motifs"/*.bed -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -else -for i in ${motiffiles[@]} -do - if [ $help == true ] - then - help=false - bedtools subtract -a "$workdir"/pass1Fa.bed -b $i > "$workdir"/pass1FaHelp.bed - else - help=true - bedtools subtract -a "$workdir"/pass1FaHelp.bed -b $i > "$workdir"/pass1Fa.bed - fi - echo $i -done -fi - -if [ $help == false ] -then - cat "$workdir"/pass1FaHelp.bed > "$workdir"/pass2Tr.bed -else - cat "$workdir"/pass1Fa.bed > "$workdir"/pass2Tr.bed -fi - -#4. remove short/long motivs, make unique ids (relevant for some splitted tfbs from subtract) and handle maxScorePosition -# also creates a small output file with information about the comparison - -Rscript --vanilla $path/merge.R $min $max $workdir $data - -#5. add fasta sequences to bed and create fasta file -bedtools getfasta -fi $fasta -bed $workdir/merged.bed -bedOut > $output -bedtools getfasta -name -fi $fasta -bed "$output" -fo "$output".fasta - -#6 clean up -#rm "$workdir"/pass1Fa.bed "$workdir"/pass1Tr.bed "$workdir"/pass2Tr.bed "$workdir"/merged.bed "$workdir"/pass1FaHelp.bed "$workdir"/pass1TrHelp.bed diff --git a/bin/get_best_motif.py b/bin/get_best_motif.py deleted file mode 100644 index cc24949..0000000 --- a/bin/get_best_motif.py +++ /dev/null @@ -1,26 +0,0 @@ -# parses arguments using argparse -# @return args list of all parameters -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument("meme", help="Path to meme file") - parser.add_argument("output", help="Output file") - parser.add_argument("num", help="Number of motifs parsed from file") - args = parser.parse_args() - return args - -# write lines of file till certain line (MOTIF + [num]) -def main(): - args = parse_arguments() - out = open(args.output, "w+") - number = int(args.num) + 1 - motif = "MOTIF " + str(number) - with open(args.meme) as f: - for line in f: - if motif in line: - break - out.write(line) - - -if __name__ == "__main__": - import argparse - main() diff --git a/bin/maxScore.R b/bin/maxScore.R deleted file mode 100644 index 6b90984..0000000 --- a/bin/maxScore.R +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/Rscript -library(data.table) -args = commandArgs(TRUE) -file = args[1] - -tab = fread(file) -colnames(tab) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") - -tab$maxpos = tab$start + tab$maxpos - -fwrite(tab, file, row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge.R b/bin/merge.R deleted file mode 100644 index 129e41d..0000000 --- a/bin/merge.R +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/Rscript -library(data.table) -args=commandArgs(TRUE) -min=as.numeric(args[1]) -max=as.numeric(args[2]) -folder=args[3] - -splitted = fread(paste(folder, "/pass2Tr.bed", sep='')) -colnames(splitted) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") -p1 = fread(paste(folder, "/pass1Tr.bed", sep='')) -colnames(p1) = c("chromosome", "start", "stop", "id", "score", "length", "maxpos", "info") - -p1$maxpos = p1$start + p1$maxpos - -splitted=rbind(splitted, p1) 
- -splitted=splitted[which(splitted$stop - splitted$start >= min),] -splitted=splitted[which(splitted$stop - splitted$start <= max),] -splitted$id=make.unique(as.character(splitted$id)) -splitted$length=splitted$stop - splitted$start - -splitted=cbind(splitted, containsMaxpos=0) -splitted$containsMaxpos[intersect(which(splitted$start <= splitted$maxpos), which(splitted$stop > splitted$maxpos))] = 1 -splitted$maxpos = splitted$maxpos - splitted$start -data.table::fwrite(splitted, paste(folder, "/merged.bed", sep=''), row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t') - -before = fread(args[4], header=FALSE) - -sumb=sum(before$V3-before$V2) -suma=sum(splitted$length) -difference = formatC(sumb/suma, digits=4) -loss = formatC(1 - suma/sumb, digits=2) -lengthb = formatC(mean(before$V3-before$V2), digits=4) -lengtha = formatC(mean(splitted$length), digits=4) -stats=data.frame(sum_nt_input=sumb, sum_nt_filtered=suma, factor=difference, loss=loss, mean_length_input=lengthb, mean_length_filtered=lengtha, flag_1_ratio=length(which(splitted$containsMaxpos == 1))/dim(splitted)[1]) -write.table(stats, "../FilterMotifs.stats", row.names=FALSE, quote=FALSE, sep='\t') diff --git a/bin/merge_similar_clusters.R b/bin/merge_similar_clusters.R deleted file mode 100644 index 03b8cf1..0000000 --- a/bin/merge_similar_clusters.R +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env Rscript - -# Merging FASTA-files, which motifs are similar. -# -# @parameter tsv_in Path to TSV file generated by Tomtom. -# The input for Tomtom is a from all clusters merged meme-file. -# @parameter file_list Numerically sorted whitespace separated list of absolute fasta-file paths -# @parameter min_weight Minimum weight of edge allowed in graph clusters. - - -args = commandArgs(trailingOnly = TRUE) - -tsv_in <- args[1] -file_list <- args[2] -min_weight <- args[3] - -files <- unlist(as.list(strsplit(file_list, ","))) - -# split the string on the character "." in the first to columns and safe the last value each, to get the number of the cluster. 
-tsv <- data.table::fread(tsv_in, header = TRUE, sep = "\t",colClasses = 'character') -query_cluster <- unlist(lapply(strsplit(tsv[["Query_ID"]],"\\."), function(l){ - tail(l,n=1) -})) -target_cluster <- unlist(lapply(strsplit(tsv[["Target_ID"]],"\\."), function(l){ - tail(l,n=1) -})) - -# create data.table with only the cluster-numbers -sim_not_unique <- data.table::data.table(query_cluster,target_cluster) -# convert from character to numeric values -sim_not_unique[, query_cluster := as.numeric(query_cluster)] -sim_not_unique[, target_cluster := as.numeric(target_cluster)] - -# remove rows if column 1 is idential to column 2 -edgelist <- sim_not_unique[query_cluster != target_cluster] - -# create graph from data.frame -g <- igraph::graph_from_edgelist(as.matrix(edgelist)) -# converting graph to adjacency matrix -adj_matrix <- igraph::get.adjacency(g, names = T) -# generating weighted graph from adjacency matrix -g_adj <- igraph::graph_from_adjacency_matrix(adj_matrix, weighted = T) - -# get subgraphs from graph with edges of weight > min_weight -s1 <- igraph::subgraph.edges(g_adj, igraph::E(g_adj)[igraph::E(g_adj)$weight>min_weight], del=F) -png('motif_clusters.png') -plot(s1) -dev.off() -clust <- igraph::clusters(s1) -if (clust$no < 1){ - b <- lapply(files, function(f){ - system(paste("cat",f,">",basename(f))) - }) -} -# merge FASTA-files depending on the clustered graphs -a <- lapply(seq(from = 1, to = clust$no, by = 1), function(i){ - cl <- as.vector(which(clust$membership %in% c(i))) - fasta_list <- paste(files[cl], collapse = " ") - name <- paste0("Cluster_",i,".fasta") - system(paste("cat",fasta_list,">",name)) -}) diff --git a/bin/reduce_bed.R b/bin/reduce_bed.R deleted file mode 100644 index 2f8e8a2..0000000 --- a/bin/reduce_bed.R +++ /dev/null @@ -1,259 +0,0 @@ -#! /bin/Rscript -library("optparse") - -option_list <- list( - make_option(opt_str = c("-i", "--input"), default = NULL, help = "Input bed-file. Last column must be sequences.", metavar = "character"), - make_option(opt_str = c("-k", "--kmer"), default = 10, help = "Kmer length. Default = %default", metavar = "integer"), - make_option(opt_str = c("-m", "--motif"), default = 10, help = "Estimated motif length. Default = %default", metavar = "integer"), - make_option(opt_str = c("-o", "--output"), default = "reduced.bed", help = "Output file. Default = %default", metavar = "character"), - make_option(opt_str = c("-t", "--threads"), default = 1, help = "Number of threads to use. Use 0 for all available cores. Default = %default", metavar = "integer"), - make_option(opt_str = c("-c", "--clean"), default = TRUE, help = "Delete all temporary files. Default = %default", metavar = "logical"), - make_option(opt_str = c("-s", "--min_seq_length"), default = NULL, help = "Remove sequences below this length. Defaults to the maximum value of motif and kmer and can not be lower.", metavar = "integer", type = "integer"), - make_option(opt_str = c("-n", "--minoverlap_kmer"), default = NULL, help = "Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1", metavar = "integer", type = "integer"), - make_option(opt_str = c("-v", "--minoverlap_motif"), default = NULL, help = "Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. 
Default = ceiling(motif / 2)", metavar = "integer", type = "integer"), - make_option(opt_str = c("-f", "--motif_occurence"), default = 1, help = "Number of motifs per sequence any value above 0. Default = %default.", metavar = "double") -) - -opt_parser <- OptionParser(option_list = option_list, - description = "Reduce sequences to frequent regions.") - -opt <- parse_args(opt_parser) - -#' Reduce bed file to conserved regions -#' -#' @param input bed file -#' @param kmer Length of kmer. -#' @param motif Estimated motif length. -#' @param output Output file -#' @param threads Number of threads. Default = 1. 0 for all cores. -#' @param clean Delete all temporary files. -#' @param minoverlap_kmer Minimum required overlap between kmer to merge kmer. Used to create reduced sequence ranges. Can not be greater than kmer length. Default = kmer - 1 -#' @param minoverlap_motif Minimum required overlap between motif and kmer to consider kmer significant. Used for kmer cutoff calculation. Can not be greater than motif and kmer length. Default = ceiling(motif / 2) -#' @param min_seq_length Must be greater or equal to kmer and motif. Default = max(c(motif, kmer)). -#' @param motif_occurence Define how many motifs are expected per sequence. This value is used during kmer cutoff calculation. Default = 1 meaning that there should be approximately one motif per sequence. -#' -#' @return reduced bed -#' TODO check whether jellyfish is installed -reduce_bed <- function(input, kmer = 10, motif = 10, output = "reduced.bed", threads = NULL, clean = TRUE, minoverlap_kmer = kmer - 1, minoverlap_motif = ceiling(motif / 2), min_seq_length = max(c(motif, kmer)), motif_occurence = 1) { - # get number of available cores - if (threads == 0) { - threads <- parallel::detectCores() - } - - message("Loading bed...") - # load bed - # columns: chr, start, end, name, ..., sequence - bed_table <- data.table::fread(input = input, header = FALSE) - names(bed_table)[1:4] <- c("chr", "start", "end", "name") - names(bed_table)[ncol(bed_table)] <- "sequence" - # index - data.table::setkey(bed_table, name, physical = FALSE) - - # check for duplicated names - if (anyDuplicated(bed_table[, "name"])) { - warning("Found duplicated names. Making names unique.") - bed_table[, name := make.unique(name)] - } - - # remove sequences below minimum length - if (min_seq_length < max(c(kmer, motif))) { - stop("Minimum sequence length must be greater or equal to ", max(c(motif, kmer)), " (maximum value of kmer and motif).") - } - - total_rows <- nrow(bed_table) - bed_table <- bed_table[nchar(sequence) > min_seq_length] - if (total_rows > nrow(bed_table)) { - message("Removed ", total_rows - nrow(bed_table), " sequence(s) below minimum length of ", min_seq_length) - } - - # TODO forward fasta file as parameter so no bed -> fasta conversion is needed. 
- message("Writing fasta...") - # save as fasta - fasta_file <- paste0(basename(input), ".fasta") - seqinr::write.fasta(sequences = as.list(bed_table[[ncol(bed_table)]]), names = bed_table[[4]], as.string = TRUE, file.out = fasta_file) - - message("Counting kmer...") - # count k-mer - hashsize <- 4 ^ kmer - count_output_binary <- "mer_count_binary.jf" - input <- fasta_file - jellyfish_call <- paste("jellyfish count ", "-m", kmer, "-s", hashsize, "-o", count_output_binary, input) - - system(command = jellyfish_call, wait = TRUE) - - mer_count_table <- "mer_count_table.jf" - jellyfish_dump_call <- paste("jellyfish dump --column --tab --output", mer_count_table, count_output_binary) - - system(command = jellyfish_dump_call, wait = TRUE) - - message("Reduce kmer.") - # load mer table - # columns: kmer, count - kmer_counts <- data.table::fread(input = mer_count_table, header = FALSE) - # order kmer descending - data.table::setorder(kmer_counts, -V2) - - # compute number of hits to keep - keep_hits <- significant_kmer(bed_table, kmer = kmer, motif = motif, minoverlap = minoverlap_motif, motif_occurence = motif_occurence) - - # reduce kmer - reduced_kmer <- reduce_kmer(kmer = kmer_counts, significant = keep_hits) - - message("Find kmer in sequences.") - # find k-mer in sequences - # columns: name, start, end, width - ranges_table <- find_kmer_regions(bed = bed_table, kmer_counts = reduced_kmer, minoverlap = minoverlap_kmer, threads = threads) - names(ranges_table)[1:2] <- c("relative_start", "relative_end") - - # merge ranged_table with bed_table + keep column order - merged <- merge(x = bed_table, y = ranges_table, by = "name", sort = FALSE)[, union(names(bed_table), names(ranges_table)), with = FALSE] - - # delete sequences without hit - merged <- na.omit(merged, cols = c("relative_start", "relative_end")) - message("Removed ", nrow(bed_table) - nrow(merged), " sequence(s) without hit.") - - message("Reduce sequences.") - # create subsequences - merged[, sequence := stringr::str_sub(sequence, relative_start, relative_end)] - - # bed files count from 0 - merged[, `:=`(relative_start = relative_start - 1, relative_end = relative_end - 1)] - # change start end location - merged[, `:=`(start = start + relative_start, end = start + relative_end)] - - # clean table - merged[, `:=`(relative_start = NULL, relative_end = NULL, width = NULL)] - - if (clean) { - file.remove(fasta_file, count_output_binary, mer_count_table) - } - - data.table::fwrite(merged, file = output, sep = "\t", col.names = FALSE) -} - -#' Predict how many interesting kmer are possible for the given data. -#' -#' @param bed Bed table with sequences in last column -#' @param kmer Length of kmer -#' @param motif Length of motif -#' @param minoverlap Minimum number of bases overlapping between kmer and motif. Must be <= motif & <= kmer. Defaults to ceiling(motif / 2). -#' @param motif_occurence Define how many motifs are expected per sequence. Default = 1 -#' -#' @return Number of interesting kmer. 
-significant_kmer <- function(bed, kmer, motif, minoverlap = ceiling(motif / 2), motif_occurence = 1) { - if (minoverlap > kmer || minoverlap > motif) { - stop("Kmer & motif must be greater or equal to minoverlap!") - } - if (motif_occurence <= 0) { - stop("Motif_occurence must be a numeric value above 0!") - } - - # minimum sequence length to get all interesting overlaps - min_seq_length <- motif + 2 * (kmer - minoverlap) - - seq_lengths <- nchar(bed[[ncol(bed)]]) - - # reduce to max interesting length - seq_lengths <- ifelse(seq_lengths > min_seq_length, min_seq_length, seq_lengths) - - # calculate max possible kmer - topx <- sum(seq_lengths - kmer + 1) - - return(topx * motif_occurence) -} - -#' Orders kmer table after count descending and keeps all kmer with a cumulative sum below the given significance threshold. -#' -#' @param kmer Kmer data.table columns: kmer, count -#' @param significant Value from significant_kmer function. -#' -#' @return reduced data.table -reduce_kmer <- function(kmer, significant) { - data.table::setorderv(kmer, cols = names(kmer)[2], order = -1) - - kmer[, cumsum := cumsum(V2)] - - return(kmer[cumsum <= significant]) -} - -#' create list of significant ranges (one for each bed entry) -#' -#' @param bed Data.table of bed with sequence in last column -#' @param kmer_counts Data.table of counted kmer. Column1 = kmer, column2 = count. -#' @param minoverlap Minimum overlapping nucleotides between kmers to be merged. Positive integer. Must be smaller than kmer length. -#' @param threads Number of threads. -#' -#' @return Data.table with relative positions and width (start, end, width). -#' -#' TODO Include number of motifs per sequence (aka motif_occurence). Attempt to keep best 2 regions for occurence = 2? Probably high impact on performance. 
-find_kmer_regions <- function(bed, kmer_counts, minoverlap = 1 , threads = NULL) { - if (nchar(kmer_counts[1, 1]) <= minoverlap) { - stop("Minoverlap must be smaller than kmer length!") - } - - names(kmer_counts)[1:2] <- c("kmer", "count") - data.table::setorder(kmer_counts, -count) - - seq_ranges <- pbapply::pblapply(seq_len(nrow(bed)), cl = threads, function(x) { - seq <- bed[x][[ncol(bed)]] - name <- bed[x][[4]] - - #### locate ranges - ranges <- data.table::data.table(do.call(rbind, stringi::stri_locate_all_fixed(seq, pattern = kmer_counts[[1]]))) - - ranges <- na.omit(ranges, cols = c("start", "end")) - - if (nrow(ranges) < 1) { - return(data.table::data.table(start = NA, end = NA, width = NA, name = name)) - } - - # add kmer sequences - ranges[, sub_seq := stringr::str_sub(seq, start, end)] - # add kmer count - ranges[, count := kmer_counts[ranges[["sub_seq"]], "count", on = "kmer"]] - - #### reduce ranges - reduced_ranges <- IRanges::IRanges(start = ranges[["start"]], end = ranges[["end"]], names = ranges[["sub_seq"]]) - - # list of overlapping ranges - edge_list <- as.matrix(IRanges::findOverlaps(reduced_ranges, minoverlap = minoverlap, drop.self = FALSE, drop.redundant = TRUE)) - - # get components (groups of connected ranges) - graph <- igraph::graph_from_edgelist(edge_list, directed = FALSE) - # vector of node membership (indices correspond to ranges above) - member <- as.factor(igraph::components(graph)$membership) - - # list of membership vectors - node_membership <- lapply(levels(member), function(x) { - which(member == x) - }) - - # calculate component score (= sum of kmer count) - score <- vapply(node_membership, FUN.VALUE = numeric(1), function(x) { - sum(kmer_counts[x, "count"]) - }) - - selected_ranges <- node_membership[[which(score == max(score))[1]]] - - # reduce selected ranges - reduced_ranges <- IRanges::reduce(reduced_ranges[selected_ranges]) - - reduced_ranges <- data.table::as.data.table(reduced_ranges)[, name := name] - - return(reduced_ranges) - }) - - # create ranges table - conserved_regions_table <- data.table::rbindlist(seq_ranges) - - return(conserved_regions_table) -} - -# call function with given parameter if not in interactive context (e.g. 
run from shell) -if (!interactive()) { - # show apply progressbar - pbo <- pbapply::pboptions(type = "timer") - # remove last parameter (help param) - params <- opt[-length(opt)] - do.call(reduce_bed, args = params) -} diff --git a/masterenv.yml b/masterenv.yml index 211f4e1..751964e 100644 --- a/masterenv.yml +++ b/masterenv.yml @@ -1,6 +1,7 @@ name: masterenv dependencies: + - python >=3 - r-seqinr - numpy - pybigWig @@ -14,8 +15,6 @@ dependencies: - r-stringr - r-optparse - bioconductor-iranges - - snakemake - - meme - moods - biopython - pybedtools diff --git a/pipeline.nf b/pipeline.nf index a39616a..93190a3 100644 --- a/pipeline.nf +++ b/pipeline.nf @@ -9,6 +9,7 @@ params.tfbs_path="" params.create_known_tfbs_path = "./" params.help = 0 + params.gtf_path="" params.out = "./out/" //peak_calling @@ -21,7 +22,7 @@ params.max_size_fp=100 //clustering - //reduce_bed + //reduce_sequence params.kmer=10 params.aprox_motif_len=10 params.motif_occurence=1 @@ -55,10 +56,10 @@ params.best_motif = 3 // Top n motifs per cluster //creating_gtf - params.organism="hg38" + params.organism="" params.tissue="" -if (params.bigwig == "" || params.bed == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){ +if (params.bigwig == "" || params.bed == "" || params.organism == "" || params.genome_fasta == "" || params.motif_db == "" || params.config == "" || "${params.help}" != "0"){ log.info """ Usage: nextflow run pipeline.nf --bigwig [BigWig-file] --bed [BED-file] --genome_fasta [FASTA-file] --motif_db [MEME-file] --config [UROPA-config-file] @@ -68,14 +69,16 @@ Required arguments: --genome_fasta Path to genome in FASTA-format --motif_db Path to motif-database in MEME-format --config Path to UROPA configuration file - --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. - Path can be set as tfbs_path in next run. (Default: './') + --organism Input organism [hg38 | hg19 | mm9 | mm10] --out Output Directory (Default: './out/') Optional arguments: --help [0|1] 1 to show this help message. (Default: 0) --tfbs_path Path to directory with output from tfbsscan. If given tfbsscan will not be run. + --create_known_tfbs_path Path to directory where output from tfbsscan (known motifs) are stored. + Path can be set as tfbs_path in next run. (Default: './') + --gtf_path Path to gtf-file. If a path is set, the process which creates a gtf-file is skipped. Footprint extraction: --window_length INT This parameter sets the length of a sliding window. 
(Default: 200) @@ -115,7 +118,6 @@ Optional arguments: --motif_similarity_thresh FLOAT Threshold for motif similarity score (Default: 0.00001) Creating GTF: - --organism [hg38 | hg19 | mm9 | mm10] Input organism --tissues List/String List of one or more keywords for tissue-/category-activity, categories must be specified as in JSON config All arguments can be set in the configuration files @@ -190,8 +192,8 @@ process footprint_extraction { conda "${path_env}" tag{name} - publishDir "${params.out}/footprint_extraction/", mode: 'copy', pattern: '*.bed' - publishDir "${params.out}/footprint_extraction/log", mode: 'copy', pattern: '*.log' + publishDir "${params.out}/1.1_footprint_extraction/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/1.1_footprint_extraction/log", mode: 'copy', pattern: '*.log' input: set name, file (bigWig), file (bed) from footprint_in @@ -201,7 +203,7 @@ process footprint_extraction { script: """ - python ${path_bin}/footprints_extraction.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} + python ${path_bin}/1.1_footprint_extraction/footprints_extraction.py --bigwig ${bigWig} --bed ${bed} --output_file ${name}_called_peaks.bed --window_length ${params.window_length} --step ${params.step} --percentage ${params.percentage} """ } @@ -214,7 +216,7 @@ process extract_known_TFBS { conda "${path_env}" - publishDir "${params.out}/known_TFBS/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/1.2_filter_motifs/TFBSscan/", mode: 'copy', pattern: '*.bed' input: set file (fasta), file (db), file (bed) from for_tfbs @@ -227,7 +229,7 @@ process extract_known_TFBS { script: """ - python ${path_bin}/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} + python ${path_bin}/1.2_filter_motifs/tfbsscan.py --use moods --core ${params.threads} -m ${db} -g ${fasta} -o ${params.create_known_tfbs_path} -b ${bed} """ } @@ -249,29 +251,30 @@ if(params.tfbs_path == "") { */ process overlap_with_known_TFBS { conda "${path_env}" - - publishDir "${params.out}/unknown_overlap/", mode :'copy' + publishDir "${params.out}/1.2_filter_motifs/compareBed/", mode :'copy' input: set name, file (bed_footprints), val (bed_motifs), file (fasta) from for_overlap output: set name, file ("${name}_unknown.bed") into bed_for_reducing + file ('*.stats') script: motif_list = bed_motifs.toString().replaceAll(/\s|\[|\]/,"") """ - ${path_bin}/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} -p ${path_bin} + ${path_bin}/1.2_filter_motifs/compareBed.sh --data ${bed_footprints} --motifs ${motif_path} --fasta ${fasta} -o ${name}_unknown.bed -min ${params.min_size_fp} -max ${params.max_size_fp} """ } /* +Reduce each sequence to its most conserved region. 
*/ -process reduce_bed { +process reduce_sequence { conda "${path_env}" echo true - publishDir "${params.out}/cluster/reduced_bed/", mode: 'copy' + publishDir "${params.out}/2.1_clustering/reduced_bed/", mode: 'copy' input: set name, file (bed) from bed_for_reducing @@ -281,18 +284,19 @@ process reduce_bed { script: """ - Rscript ${path_bin}/reduce_bed.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} + Rscript ${path_bin}/2.1_clustering/reduce_sequence.R -i ${bed} -k ${params.kmer} -m ${params.aprox_motif_len} -o ${name}_reduced.bed -t ${params.threads} -f ${params.motif_occurence} -s ${params.min_seq_length} """ } /* +Cluster all sequences. Appends a column with cluster numbers to the bed-file. */ process clustering { conda "${path_env}" echo true - publishDir "${params.out}/cluster/", mode: 'copy', pattern: '*.bed' + publishDir "${params.out}/2.1_clustering/", mode: 'copy', pattern: '*.bed' input: set name, file (bed) from bed_for_clustering @@ -302,7 +306,7 @@ process clustering { script: """ - Rscript ${path_bin}/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} + Rscript ${path_bin}/2.1_clustering/cdhit_wrapper.R -i ${bed} -A ${params.sequence_coverage} -o ${name}_clusterd.bed -c ${params.identity} -G ${params.global} -M ${params.memory} -l ${params.throw_away_seq} -r ${params.strand} -T ${params.threads} """ } @@ -311,8 +315,9 @@ process clustering { Converting BED-File to one FASTA-File per cluster */ process bed_to_clustered_fasta { + conda "${path_env}" - publishDir "${params.out}/esimated_motifs/clustered_motifs/clustered_fasta/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/fasta/", mode: 'copy' tag{name} input: @@ -324,7 +329,7 @@ process bed_to_clustered_fasta { script: """ - Rscript ${path_bin}/bed_to_fasta.R ${bed} ${name} ${params.min_seq} + Rscript ${path_bin}/2.2_motif_estimation/bed_to_fasta.R -i ${bed} -p ${name} -m ${params.min_seq} """ } @@ -341,14 +346,16 @@ Generating Motifs through alignment and scoring best local matches. process glam2 { tag{name} - publishDir "${params.out}/esimated_motifs/clustered_motifs/${name}/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/glam2/", mode: 'copy' + conda "${path_env}" input: set name, file (fasta) from fasta_for_glam2 - output: + output: file("${name}/*.meme") into meme_to_merge set name, file("${name}/*.meme") into meme_for_tomtom + set name, file("${name}/*.meme") into meme_to_search_in_merged set name, file("${name}/*.meme") into meme_for_filter file ('*') @@ -360,11 +367,12 @@ process glam2 { /* Combining all MEME-files to one big MEME-file. -The paths are sorted numerically depending on the cluster number. +The paths are sorted numerically depending on the cluster ID. 
*/ process merge_meme { - publishDir "${params.out}/esimated_motifs/merged_meme/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_meme/", mode: 'copy' + conda "${path_env}" input: val (memelist) from meme_to_merge.toList() @@ -376,48 +384,104 @@ process merge_meme { params.cluster_motif == 1 script: + //sorting memes = memelist.collect{it.toString().replaceAll(/\/glam2.meme/,"") } meme_sorted = memes.sort(false) { it.toString().tokenize('_')[-1] as Integer } meme_sorted_full = meme_sorted.collect {it.toString() + "/glam2.meme"} + //create list for bash meme_list = meme_sorted_full.toString().replaceAll(/\,|\[|\]/,"") """ meme2meme ${meme_list} > merged_meme.meme """ } +to_find_similar_motifs = meme_to_search_in_merged.combine(merged_meme) + /* Running Tomtom on merged meme-files. Output table has the information which clusters are similar to each other. */ process find_similar_motifs { - publishDir "${params.out}/esimated_motifs/cluster_similarity/", mode: 'copy' + tag{name} + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/cluster_similarity/", mode: 'copy' + conda "${path_env}" + input: - file (merged_meme) from merged_meme + set name, file (meme) ,file (merged_meme) from to_find_similar_motifs output: - file ('all_to_all.tsv') into motif_similarity + set name, file ("${name}.tsv") into motif_similarity when: params.cluster_motif == 1 script: """ - tomtom ${merged_meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > all_to_all.tsv + tomtom ${meme} ${merged_meme} -thresh ${params.motif_similarity_thresh} -text --norc | sed '/^#/ d' | sed '/^\$/d' > ${name}.tsv """ } +/* +Label first column of TSV-file with Cluster ID +*/ +process label_cluster { + + tag{name} + conda "${path_env}" + + input: + set name, file (tsv) from motif_similarity + + output: + file ("${name}_labeled.tsv") into labeled_tsv + + when: + params.cluster_motif == 1 + + script: + """ + Rscript ${path_bin}/2.2_motif_estimation/label_cluster.R -i ${tsv} -o ${name}_labeled.tsv + """ +} + +/* +Merging tsv files_for_merge_fasta +*/ +process merge_labeled_tsv { + + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/", mode: 'copy' -files_for_merge_fasta = motif_similarity.combine(fasta_for_motif_cluster) + input: + val (tsvlist) from labeled_tsv.toSortedList { it.toString().tokenize('_')[-2] as Integer } + + output: + file ('*.tsv') into merged_labeled_tsv + + when: + params.cluster_motif == 1 + + script: + tsvs = tsvlist.toString().replaceAll(/\,|\[|\]/,"") + """ + echo -e 'Query_ID\tTarget_ID\tOptimal_offset\tp-value\tE-value\tq-value\tOverlap\tQuery_consensus\tTarget_consensus\tOrientation'> merged_labeled.tsv + for i in $tsvs; do + cat \$i >> merged_labeled.tsv + done + """ +} + +files_for_merge_fasta = merged_labeled_tsv.combine(fasta_for_motif_cluster) /* Merging FASTA-files of similar clusters */ process merge_fasta { + conda "${path_env}" - publishDir "${params.out}/esimated_motifs/merged_fasta/", mode: 'copy' - echo true + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/merged_fasta/", mode: 'copy' + input: set file (motiv_sim), val (fasta_list) from files_for_merge_fasta @@ -432,17 +496,21 @@ process merge_fasta { fa_sorted = fasta_list.sort(false) { it.getBaseName().tokenize('_')[-1] as Integer } fastalist = fa_sorted.toString().replaceAll(/\s|\[|\]/,"") """ - Rscript ${path_bin}/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} + Rscript 
${path_bin}/2.2_motif_estimation/merge_similar_clusters.R ${motiv_sim} ${fastalist} ${params.edge_weight} """ } motif_clustered_fasta_flat = motif_clustered_fasta_list.flatten() - +/* +Run GLAM2 on merged FASTA-files +*/ process clustered_glam2 { - publishDir "${params.out}/final_esimated_motifs/${name}/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/cluster_motifs/glam2/${name}/", mode: 'copy' + tag{name} + conda "${path_env}" input: file (fasta) from motif_clustered_fasta_flat @@ -462,6 +530,10 @@ process clustered_glam2 { """ } +/* +Forward files depending on the set parameter: +whether motif clustering is activated or not. +*/ if(params.cluster_motif == 1){ for_tomtom = clustered_meme_for_tomtom for_filter = clustered_meme_for_filter @@ -478,7 +550,8 @@ Tomtom searches motifs in databases. process tomtom { tag{name} - publishDir "${params.out}/esimated_motifs/tomtom/", mode: 'copy' + publishDir "${params.out}/2.2_motif_estimation/tomtom/", mode: 'copy' + conda "${path_env}" input: set name, file (meme) from for_tomtom @@ -523,9 +596,10 @@ process check_for_unknown_motifs { //Get the best(first) Motif from each MEME-file process get_best_motif { - conda "${path_env}" - publishDir "${params.out}/esimated_motifs/unknown_motifs/", mode: 'copy' + conda "${path_env}" + tag{name} + publishDir "${params.out}/2.2_motif_estimation/best_unknown_motifs/", mode: 'copy' input: set name, file(meme), file(tsv) from meme_for_scan @@ -535,7 +609,7 @@ process get_best_motif { script: """ - python ${path_bin}/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} + python ${path_bin}/2.2_motif_estimation/get_best_motif.py ${meme} ${name}_best.meme ${params.best_motif} """ } @@ -575,17 +649,26 @@ process cluster_quality { process create_GTF { conda "${path_env}" - publishDir "${params.out}/gtf/", mode: 'copy' + publishDir "${params.out}/3.1_create_gtf/", mode: 'copy' output: - file ('*.gtf') into gtf_for_uropa + file ('*.gtf') into gtf + + when: + params.gtf_path == "" script: """ - python ${path_bin}/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin} + python ${path_bin}/3.1_create_gtf/RegGTFExtractor.py ${params.organism} --tissue ${params.tissues} --wd ${path_bin}/3.1_create_gtf """ } +if (params.gtf_path == "") { + gtf_for_uropa = gtf +} else { + gtf_for_uropa = Channel.fromPath(params.gtf_path) +} + /* bed_for_final_filter.combine(gtf_for_uropa).set {uropa_in}