Merge branch 'dev' into estimation_motifs

loosolab · Jan 8, 2019 · 5b8bb7a · 5b8bb7a
2 parents ede935b + be868ef
commit 5b8bb7a
Show file tree

Hide file tree

Showing 42 changed files with 700 additions and 5,853 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,208 @@
+# Created by .ignore support plugin (hsz.mobi)
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+### R template
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+/*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
+rsconnect/
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+Würde bin/3.1_create_gtf/data/ löschen
+Würde data/ löschen
diff --git a/bin/1.1_footprint_extraction/footprints_extraction.py b/bin/1.1_footprint_extraction/footprints_extraction.py
@@ -43,6 +43,7 @@ def parse_args():
 	parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by defauld 200 bp.')
 	parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
 	parser.add_argument('--percentage',  default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
+	parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
 	parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
 	args = parser.parse_args()
 
@@ -154,6 +155,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 				old_start = peak_footprints[existing_footprint_name]['start']
 				old_end = peak_footprints[existing_footprint_name]['end']
 				old_score = peak_footprints[existing_footprint_name]['score']
+				old_length = peak_footprints[existing_footprint_name]['len']
 
 				if footprint_start >= old_start and footprint_start <= old_end: #the start of the new footprint is between the start and end of an old footprint
 					if footprint_end > old_end: #the new footprint is not completely inside the old one
@@ -163,19 +165,21 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 
 						peak_footprints[existing_footprint_name]['end'] = footprint_end
 						peak_footprints[existing_footprint_name]['score'] = footprint_score
+						peak_footprints[existing_footprint_name]['len'] = footprint_end - old_start
 						#we can not update the max_pos as we do not have the information about scores array of the existing footprint
 					#else: the new footprint is completely inside the old one, do nothing
 					save_current_footprint = False
 					break
 
 				elif footprint_end >= old_start and footprint_end <= old_end: #the end of the new footprint is between the start and end of an old footprint
-					if footprint_start < old_start: #the new footprint is not completely inside the old one
+					if footprint_start < old_start: #the new footprint is not completely inside the old one						
 						#update the information about the existing footprint
 						#find the average of both scores
 						footprint_score = (peak_footprints[existing_footprint_name]['score'] + footprint_score) / 2
 
 						peak_footprints[existing_footprint_name]['start'] = footprint_start
 						peak_footprints[existing_footprint_name]['score'] = footprint_score
+						peak_footprints[existing_footprint_name]['len'] = old_end - footprint_start
 					#else do nothing
 					save_current_footprint = False
 					break
@@ -191,7 +195,7 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
 
 		if save_current_footprint == True:
 			#make sure to save this footprint			
-			footprint_name = "footprint_" + str(footprint_count)
+			footprint_name = "f_" + str(footprint_count)
 
 			peak_footprints[footprint_name] = peak_footprints.get(footprint_name, {})
 			peak_footprints[footprint_name] = {'chromosom': chromosom, 'start': footprint_start, 'end': footprint_end, 'score': footprint_score, 'len': len(footprint_scores), 'bonus': bonus_info_from_bed, 'max_pos': max_pos}
@@ -270,11 +274,62 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
 
 	return peak_footprints, footprint_count
 
+#this function checks if there are footprints with a small amount of bp in between, and if so, merges them together
+#if at this point there are still overlaps of footprints, this function will delete them
+#the input parameter are: dictionary with footprints within one peak, and the max number of bp allowed to be in between the footprints
+#the output is the renewed dictionary containing only the best footprints for the output file
+def check_and_merge(peak_footprints, max_bp_between):
+	peak_footprints_new = {}
+
+	for footprint_to_check in peak_footprints.keys():
+		start_to_check = peak_footprints[footprint_to_check]['start']
+		end_to_check = peak_footprints[footprint_to_check]['end']
+
+		merge_footprints_left = None
+
+		for compared_footprint in peak_footprints.keys():
+
+			if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between:
+				#make compared_footprint longer
+				merge_footprints_left = False
+				break
+			elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between:
+				#make footprint_to_check longer
+				merge_footprints_left = True
+				break
+
+		if merge_footprints_left: #if the merging left is enabled
+			#check if this footprint can be merged with the compared_footprint
+			#if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now
+			if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys(): 
+				#update the start position
+				peak_footprints_new[compared_footprint]['start'] = start_to_check
+				#update the length
+				peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check
+				#update the score
+				peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
+
+		elif merge_footprints_left == False: #otherwise merge right
+			#check if the merging is possible
+			if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys():
+				#update the end position
+				peak_footprints_new[compared_footprint]['end'] = end_to_check
+				#update the length
+				peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end']
+				#update the score
+				peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
+
+		else: #save the current footprint, as it should not be merged
+			peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
+			peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]
+
+	return peak_footprints_new
+
 #this function uses the information provided from the .bed file to look for footprints within the peaks of interest
 #as input the information from the original bed file, as well as bigwig file is needed
 #the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
 #the output of this function is a dictionary contains all the found footprints ready to write to the output file
-def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage):
+def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between):
 
 	logger.info('Looking for footprints within peaks...')
 
@@ -295,9 +350,11 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage)
 			peak_end = int(positions[1])
 
 			scores_in_peak = np.nan_to_num(np.array(list(bw_open.values(chromosom, peak_start, peak_end)))) #save the scores to an array
-
 			peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)
 
+			#double check for overlaps and possibly merging of footprints having up to 5 bp in between
+			peak_footprints = check_and_merge(peak_footprints, max_bp_between)
+
 			for footprint_name in peak_footprints.keys():
 				all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
 				all_footprints[footprint_name] = peak_footprints[footprint_name]
@@ -381,7 +438,7 @@ def main():
 	logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))
 
 	bed_dictionary = make_bed_dictionary(args.bed)
-	all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage)
+	all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between)
 	write_to_bed_file(all_footprints, args.output_file)
 
 	logger.info("the number of peaks: " + str(len(bed_dictionary)))