
Merge branch 'dev' into estimation_motifs
renewiegandt committed Jan 18, 2019
2 parents ca5ff36 + 89bc089 commit c81a8d8
Showing 7 changed files with 151 additions and 46 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -75,7 +75,7 @@ Optional arguments:
--window_length INT This parameter sets the length of a sliding window. (Default: 200)
--step INT This parameter sets the number of positions to slide the window forward. (Default: 100)
--percentage INT Threshold in percent (Default: 0)
--max_bp_between INT If footprints are less than X bases appart the footprints will be merged (Default: 6)
--min_gap INT If footprints are less than X bases apart the footprints will be merged (Default: 6)
Filter motifs:
--min_size_fp INT Minimum sequence length threshold. Smaller sequences are discarded. (Default: 10)
158 changes: 126 additions & 32 deletions bin/1.1_footprint_extraction/footprints_extraction.py
@@ -44,7 +44,7 @@ def parse_args():
parser.add_argument('--window_length', default=200, type=int, help='Please enter the length for a window, by default 200 bp.')
parser.add_argument('--step', default=100, type=int, help='Please enter a step to move the window, by default 100 bp.')
parser.add_argument('--percentage', default=0, type=int, help='Please enter a percentage to be added to background while searching for footprints, by default 0%%.')
parser.add_argument('--max_bp_between', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
parser.add_argument('--min_gap', default=6, type=int, help='Please enter the number of bp allowed to be in between two footprints, by default 6 bp.')
parser.add_argument('--silent', action='store_true', help='While working with data write the information only into ./footprints_extraction.log.')
args = parser.parse_args()

@@ -118,7 +118,7 @@ def make_bed_dictionary(bed_file):
#an array containing scores (signals) from the bigwig file; the footprints already saved for the particular peak; information for the bed file:
#chromosom, start and end position, as well as additional information from the original bed file
#the function returns the count for footprints, as well as footprints for the current peak
def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom, footprint_start, footprint_end, bonus_info_from_bed):

save_current_footprint = False

@@ -143,6 +143,8 @@ def save_footprint(footprint_count, footprint_scores, peak_footprints, chromosom
else:
max_pos = first_max_pos

max_pos = max_pos + 1 #as the index of an array starts with 0

#calculate the score for the current footprint as mean of all scores from the bigwig file
footprint_score = np.mean(footprint_scores)

@@ -279,58 +281,139 @@ def search_in_window(peak_footprints, footprint_count, chromosom, peak_start, pe
#if at this point there are still overlaps of footprints, this function will delete them
#the input parameter are: dictionary with footprints within one peak, and the max number of bp allowed to be in between the footprints
#the output is the renewed dictionary containing only the best footprints for the output file
def check_and_merge(peak_footprints, max_bp_between):
def check_and_merge(peak_footprints, min_gap):
#to ensure the merging works well, sort the footprints first by start and end positions
#the sort can not be applied to a dictionary, so we make a list out of peak_footprints
peak_footprints_list = sorted(peak_footprints.items(), key = lambda x : (x[1]['start'], x[1]['end']), reverse = False)

peak_footprints_new = {}
merged_footprints = {}

for footprint_to_check in peak_footprints.keys():
#we need to check each footprint within this peak with the other footprints for possible merging
#for footprint_to_check in peak_footprints.keys():
for footprint in peak_footprints_list: #work with sorted footprints
footprint_to_check = footprint[0] #save the name of the footprint which we are working with now
start_to_check = peak_footprints[footprint_to_check]['start']
end_to_check = peak_footprints[footprint_to_check]['end']

merge_footprints_left = None

for compared_footprint in peak_footprints.keys():

if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < max_bp_between:
#make compared_footprint longer
if start_to_check > peak_footprints[compared_footprint]['start'] and start_to_check - peak_footprints[compared_footprint]['end'] < min_gap:
#make compared_footprint longer: compared_footprint + footprint_to_check
merge_footprints_left = False
break
elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < max_bp_between:
#make footprint_to_check longer
elif end_to_check < peak_footprints[compared_footprint]['end'] and peak_footprints[compared_footprint]['start'] - end_to_check < min_gap:
#make footprint_to_check longer: footprint_to_check + compared footprint
merge_footprints_left = True
break

if merge_footprints_left: #if the merging left is enabled
#check if this footprint can be merged with the compared_footprint
#if compared footprint is not in peak_footprint_new.keys(), the next loop will check for this footprint. There is no need for doulbe check now
if start_to_check < peak_footprints[compared_footprint]['start'] and compared_footprint in peak_footprints_new.keys():
#update the start position
peak_footprints_new[compared_footprint]['start'] = start_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = peak_footprints[compared_footprint]['end'] - start_to_check
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2

elif merge_footprints_left == False: #otherwise merge right
#check if the merging is possible
if end_to_check > peak_footprints[compared_footprint]['end'] and compared_footprint in peak_footprints_new.keys():
#update the end position
peak_footprints_new[compared_footprint]['end'] = end_to_check
#update the length
peak_footprints_new[compared_footprint]['len'] = end_to_check - peak_footprints[compared_footprint]['end']
#update the score
peak_footprints_new[compared_footprint]['score'] = (peak_footprints[footprint_to_check]['score'] + peak_footprints[compared_footprint]['score']) / 2
if merge_footprints_left: #the left merging is enabled, start and end of compared_footprint should be smaller than the start of the footprint_to_check
if start_to_check < peak_footprints[compared_footprint]['start']:
if footprint_to_check not in peak_footprints_new.keys():
if any(footprint_to_check in merged_footprints[x] for x in merged_footprints.keys()): #true if footprint_to_check was already merged with someone
for k, v in merged_footprints.items():
if footprint_to_check in v:
main_footprint = k
#make merging using the information from the merged_footprints and peak_footprints_new
#UPDATE
peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[compared_footprint]['start'], peak_footprints[main_footprint]['end'], peak_footprints[compared_footprint]['score'])
merged_array = merged_footprints[main_footprint]
merged_array.append(compared_footprint)
merged_footprints[main_footprint] = merged_array
#there are no merged footprints with the footprint_to_check yet, so make a new one
else:
#add the compared footprint and footprint_to_check to the merged_footprints
merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
merged_footprints[footprint_to_check] = [compared_footprint]

peak_footprints_new[footprint_to_check] = peak_footprints.get(footprint_to_check, {})
peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check] #<-- update
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
else: #the footprint_to_check is in peak_footprints_new already
#the footprint_to_check can only be the main part of merging before, check it
if footprint_to_check in merged_footprints.keys():
#footprint_to_check was as main for merging already
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
#add it to the merged_footprints as well
merged_array = merged_footprints[footprint_to_check]
merged_array.append(compared_footprint)
merged_footprints[footprint_to_check] = merged_array
else:
#the footprint_to_check was not merged with anything yet
merged_footprints[footprint_to_check] = merged_footprints.get(footprint_to_check, [])
merged_footprints[footprint_to_check] = [compared_footprint]
#UPDATE
peak_footprints_new[footprint_to_check] = footprint_update(peak_footprints_new[footprint_to_check], peak_footprints_new[footprint_to_check]['start'], peak_footprints[compared_footprint]['end'], peak_footprints[compared_footprint]['score'])
#the right merging is enabled, start and end of compared footprint should be bigger than the start of the footprint_to_check
elif merge_footprints_left == False:
if end_to_check > peak_footprints[compared_footprint]['end']:
if compared_footprint not in peak_footprints_new.keys():
if any(compared_footprint in merged_footprints[x] for x in merged_footprints.keys()):
for k, v in merged_footprints.items():
if compared_footprint in v:
main_footprint = k
#make merging using the information from the merged_footprints and peak_footprints_new
#UPDATE
peak_footprints_new[main_footprint] = footprint_update(peak_footprints_new[main_footprint], peak_footprints[main_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
#add to the merged_footprints
merged_array = merged_footprints[main_footprint]
merged_array.append(footprint_to_check)
merged_footprints[main_footprint] = merged_array
else:
#make normal update, using data from peak footprints
merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
merged_footprints[compared_footprint] = [footprint_to_check]

peak_footprints_new[compared_footprint] = peak_footprints.get(compared_footprint, {})
peak_footprints_new[compared_footprint] = peak_footprints[compared_footprint]
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])
else:
if compared_footprint in merged_footprints.keys():
#compared_footprint was as main for merging already
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])

merged_array = merged_footprints[compared_footprint]
merged_array.append(footprint_to_check)
merged_footprints[compared_footprint] = merged_array

else:
#merge now and add compared_footprint to the merged_footprints
merged_footprints[compared_footprint] = merged_footprints.get(compared_footprint, [])
merged_footprints[compared_footprint] = [compared_footprint]
#UPDATE
peak_footprints_new[compared_footprint] = footprint_update(peak_footprints_new[compared_footprint], peak_footprints_new[compared_footprint]['start'], peak_footprints[footprint_to_check]['end'], peak_footprints[footprint_to_check]['score'])

else: #save the current footprint, as it should not be merged
peak_footprints_new[footprint_to_check] = peak_footprints_new.get(footprint_to_check, [])
peak_footprints_new[footprint_to_check] = peak_footprints[footprint_to_check]

return peak_footprints_new
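
The merge decision in check_and_merge boils down to a gap test between footprints sorted by position: two footprints are merged when fewer than min_gap bases lie between them, which is what the renamed parameter expresses. A simplified, self-contained sketch of that criterion, assuming footprints are dictionaries with start, end and score keys as used throughout this file (the merged_footprints bookkeeping is omitted):

def should_merge(left, right, min_gap):
    # the two footprints are merged when the gap between them is smaller than min_gap
    return right['start'] - left['end'] < min_gap

left = {'start': 100, 'end': 120, 'score': 4.0}   # hypothetical footprints within one peak
right = {'start': 124, 'end': 140, 'score': 6.0}

print(should_merge(left, right, min_gap=6))  # True: only 4 bp apart, so they would be merged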

#this function is used to update a footprint that is to be merged with another one
#as input it takes the footprint that needs the update, as well as the new start, the new end and the score of the footprint it is merged with
#the output of this function is a dictionary containing the new information about the footprint
def footprint_update(footprint, start, end, score):
new_len = end - start
new_score = (footprint['score'] + score) / 2

footprint['start'] = start
footprint['end'] = end
footprint['score'] = new_score
footprint['len'] = new_len

return footprint
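
footprint_update is where the merged coordinates and score are actually computed: the length becomes the new span and the score becomes the mean of the two footprint scores. A small usage sketch, assuming footprint_update as defined above and a hypothetical footprint dictionary:

footprint = {'start': 100, 'end': 120, 'score': 4.0, 'len': 20}  # hypothetical footprint to be extended

# merge it with a neighbouring footprint that ends at position 140 and has a score of 6.0
merged = footprint_update(footprint, start=100, end=140, score=6.0)

print(merged)  # {'start': 100, 'end': 140, 'score': 5.0, 'len': 40}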

#this function uses the information provided from the .bed file to look for footprints within the peaks of interest
#as input the information from the original bed file, as well as bigwig file is needed
#the optional parameters window_length, step and percentage are needed as well to use the sliding window algorithm and work with the "background" score
#the output of this function is a dictionary contains all the found footprints ready to write to the output file
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, max_bp_between):
def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage, min_gap):

logger.info('Looking for footprints within peaks...')

@@ -354,7 +437,7 @@ def find_peaks_from_bw(bed_dictionary, bw_file, window_length, step, percentage,
peak_footprints, footprint_count = search_in_window(peak_footprints, footprint_count, chromosom, peak_start, peak_end, scores_in_peak, window_length, bed_dictionary[header], step, percentage)

#double check for overlaps and possibly merging of footprints having up to 5 bp in between
peak_footprints = check_and_merge(peak_footprints, max_bp_between)
peak_footprints = check_and_merge(peak_footprints, min_gap)

for footprint_name in peak_footprints.keys():
all_footprints[footprint_name] = all_footprints.get(footprint_name, {})
@@ -398,7 +481,18 @@ def write_to_bed_file(all_footprints, sorted_output_file_name):

#write each footprint line for line to the output file
for footprint in all_footprints:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')
#validation of the footprints, if there is a problem with some of them, write which one it is
#first check the start and end positions
if footprint[1]['start'] >= footprint[1]['end']:
logger.info("The problem occured with start and end positions. This footprint will not be printed to the output file:")
logger.info(footprint)
#then check the max_pos
elif footprint[1]['max_pos'] == 0:
logger.info("The problem occured with max_pos of the footprint. This footprint will not be printed to the output file:")
logger.info(footprint)
#otherwise everything is fine, write to the output
else:
output_file.write('\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']), footprint[0], str(round(footprint[1]['score'], 6)), '.', str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])]) + '\n')

output_file.close()
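
A footprint that passes both checks is written as one tab-separated line with nine columns: chromosome, start, end, name, score, a strand placeholder, length, max_pos and the joined bonus information from the bed file. A hedged sketch of that formatting, using a hypothetical entry shaped like the items of all_footprints:

footprint = ('footprint_1', {'chromosom': 'chr1', 'start': 100, 'end': 140, 'score': 5.0,
                             'len': 40, 'max_pos': 12, 'bonus': ['peak_info']})

line = '\t'.join([footprint[1]['chromosom'], str(footprint[1]['start']), str(footprint[1]['end']),
                  footprint[0], str(round(footprint[1]['score'], 6)), '.',
                  str(footprint[1]['len']), str(footprint[1]['max_pos']), ';'.join(footprint[1]['bonus'])])

print(line)  # chr1  100  140  footprint_1  5.0  .  40  12  peak_info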

@@ -439,7 +533,7 @@ def main():
logger.info("The script footprints_extraction.py was called using these parameters: " + str(vars(args)))

bed_dictionary = make_bed_dictionary(args.bed)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.max_bp_between)
all_footprints = find_peaks_from_bw(bed_dictionary, args.bigwig, args.window_length, args.step, args.percentage, args.min_gap)
write_to_bed_file(all_footprints, args.output_file)

logger.info("the number of peaks: " + str(len(bed_dictionary)))
13 changes: 6 additions & 7 deletions bin/1.2_filter_motifs/compareBed.sh
@@ -176,7 +176,7 @@ then
echo "directory $workdir does not exist. Please check parameter -w / --workdir"
exit 1
fi
if [[ ${output: -4: -1} != '.bed' ]]
# check if output path ends with .bed. If not, the filename is extended by ".bed"
if [[ ${output: -4} != '.bed' ]]
then
output=`echo $output | sed "s|$|.bed|g"`
fi
@@ -275,20 +276,18 @@ first_line=`sed -n 1p $data | sed "s/$/\tcontains_maxpos\tsequence/"`
if [[ ${first_line:0:1} == "#" ]]
then
echo "$first_line" > $output
# add some final values to the log file
# add initial number of footprints to the log file
fp_initial=`cat $data | wc -l`
fp_initial=`expr $fp_initial - 1`
fp_final=`cat "$workdir"/filtered.bed | wc -l`
fp_final=`expr $fp_final - 1`
echo $fp_initial | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
echo $fp_final | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
else
# output will be overwritten if it exists
rm -f $output
# add some final values to the log file
# add initial number of footprints to the log file
cat $data | wc -l | sed 's/^/initial number of footprints: /g' >> "$workdir"/compareBed.stats
cat "$workdir"/filtered.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats
fi
# add number of footprints after filtering to the log file
cat "$workdir"/filtered_flagged.bed | wc -l | sed 's/^/number of footprints after subtract: /g' >> "$workdir"/compareBed.stats

# add fasta sequences to bed and create fasta file
out_fasta=`echo $output | sed "s|.bed$|.fasta|g"`
5 changes: 5 additions & 0 deletions bin/2.1_clustering/cdhit_wrapper.R
@@ -199,6 +199,11 @@ cdhitest <- function(input, identity = 0.8, coverage = 8, output = "cluster.bed"
data.table::fwrite(x = cluster_table, file = summary, append = TRUE, sep = "\t", col.names = TRUE)
}


# cast start and end column to integer64 to prevent scientific notation e.g. 1e+10
# start and end are assumed to be at position 2 and 3
result[, c(2, 3) := lapply(.SD, bit64::as.integer64), .SDcols = c(2, 3)]

data.table::fwrite(x = result, file = output, sep = "\t", col.names = keep_col_names)
}

4 changes: 4 additions & 0 deletions bin/2.1_clustering/reduce_sequence.R
@@ -231,6 +231,10 @@ reduce_sequence <- function(input, kmer = 10, motif = 10, output = "reduced.bed"
names(merged) <- col_names
}

# cast start and end column to integer64 to prevent scientific notation e.g. 1e+10
# start and end are assumed to be at position 2 and 3
merged[, c(2, 3) := lapply(.SD, bit64::as.integer64), .SDcols = c(2, 3)]

data.table::fwrite(merged, file = output, sep = "\t", col.names = keep_col_names)
}

1 change: 1 addition & 0 deletions masterenv.yml
@@ -23,3 +23,4 @@ dependencies:
- matplotlib
- seaborn
- crossmap
- r-bit64