Merge branch 'master' into network

loosolab · Aug 21, 2019 · 7375885 · 7375885
2 parents 02c98f8 + 906e919
commit 7375885
Show file tree

Hide file tree

Showing 4 changed files with 24 additions and 7 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,3 +1,6 @@
+## 0.7.0 (2019-08-13)
+- Updated the way p-values are calculated in BINDetect. Included 100-fold subsampling of background to estimate confidence interval.
+
 ## 0.6.4 (2019-08-07)
 - Bug fix for error with regions very close to chromosome borders for ATACorrect and ScoreBigwig. 
 

diff --git a/tobias/__init__.py b/tobias/__init__.py
@@ -1 +1 @@
-__version__ = "0.6.4"
+__version__ = "0.7.0"
diff --git a/tobias/footprinting/bindetect.py b/tobias/footprinting/bindetect.py
@@ -20,7 +20,7 @@
 import itertools
 import pandas as pd
 
-#Machine learning
+#Machine learning and statistics
 import sklearn
 from sklearn import mixture
 import scipy
@@ -78,6 +78,8 @@ def add_bindetect_arguments(parser):
 	#optargs.add_argument('--naming', metavar="<type>", help="Naming convention for TFs ('id', 'name', 'name_id', 'id_name') (default: 'name_id')", choices=["id", "name", "name_id", "id_name"], default="name_id")
 	optargs.add_argument('--motif-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for motif scanning (default: 1e-4)", default=0.0001)
 	optargs.add_argument('--bound-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001)
+	#optargs.add_argument('--volcano-diff-thresh', metavar="<float>", help="", default=0.2)	#not yet implemented
+	#optargs.add_argument('--volcano-p-thresh', metavar="<float>", help="", default=0.05)	#not yet implemented
 	optargs.add_argument('--pseudo', type=float, metavar="<float>", help="Pseudocount for calculating log2fcs (default: estimated from data)", default=None)
 	optargs.add_argument('--time-series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) in order of input, and skip all-against-all comparison.")
 	optargs.add_argument('--skip-excel', action='store_true', help="Skip creation of excel files - for large datasets, this will speed up BINDetect considerably")

diff --git a/tobias/footprinting/bindetect_functions.py b/tobias/footprinting/bindetect_functions.py
@@ -393,23 +393,35 @@ def process_tfbs(TF_name, args, log2fc_params): 	#per tf
 		obs_mean, obs_std = obs_params
 		bg_mean, bg_std = bg_params
 		obs_no = np.min([len(observed_log2fcs), 50000])		#Set cap on obs_no to prevent super small p-values
+		n_obs = len(observed_log2fcs)
 
 		#If there was any change found at all (0 can happen if two bigwigs are the same)
 		if obs_mean != bg_mean: 
 			info_table.at[TF_name, base + "_change"] = (obs_mean - bg_mean) / np.mean([obs_std, bg_std])  #effect size
 			info_table.at[TF_name, base + "_change"] = np.round(info_table.at[TF_name, base + "_change"], 5)
 
 			#pval = scipy.stats.mannwhitneyu(observed_log2fcs, bg_log2fcs, alternative="two-sided")[1]
-			pval = scipy.stats.ttest_ind_from_stats(obs_mean, obs_std, obs_no, bg_mean, bg_std, obs_no, equal_var=False)[1] 	#pvalue is second in tup
-			info_table.at[TF_name, base + "_pvalue"] = pval
+			#pval = scipy.stats.ttest_ind_from_stats(obs_mean, obs_std, obs_no, bg_mean, bg_std, obs_no, equal_var=False)[1] 	#pvalue is second in tup
+			#info_table.at[TF_name, base + "_pvalue"] = pval
 
 		#Else not possible to compare groups
 		else:
 			info_table.at[TF_name, base + "_change"] = 0
 			info_table.at[TF_name, base + "_pvalue"] = 1
 
-		#stime_plot = datetime.now()
-
+		#Sample from background distribution
+		np.random.seed(n_obs)
+		sample_changes = []
+		for i in range(100):
+			sample = scipy.stats.norm.rvs(*log2fc_params[(cond1, cond2)], size=n_obs)	
+			sample_mean, sample_std = np.mean(sample), np.std(sample)
+			sample_change = (sample_mean - bg_mean) / np.mean([sample_std, bg_std])
+			sample_changes.append(sample_change)
+
+		#Estimate p-value by comparing sampling to observed mean
+		ttest = scipy.stats.ttest_1samp(sample_changes, info_table.at[TF_name, base + "_change"])
+		info_table.at[TF_name, base + "_pvalue"] = ttest[1]
+
 		#### Plot comparison ###
 		fig, ax = plt.subplots(1,1)
 		ax.hist(observed_log2fcs, bins='auto', label="Observed log2fcs", density=True)
@@ -616,7 +628,7 @@ def plot_bindetect(motifs, cluster_obj, conditions, args):
 		ax1.scatter(coord[0], coord[1], color=diff_scores[TF]["color"], s=4.5)
 
 		if diff_scores[TF]["show"] == True:
-			txts.append(ax1.text(coord[0], coord[1], diff_scores[TF]["volcano_label"], fontsize=8))
+			txts.append(ax1.text(coord[0], coord[1], diff_scores[TF]["volcano_label"], fontsize=9))
 
 	#Plot custom legend for colors
 	legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="Higher scores in {0}".format(conditions[0])),