diff --git a/setup.py b/setup.py
index 02a3e73..f0103ed 100644
--- a/setup.py
+++ b/setup.py
@@ -88,7 +88,7 @@ def readme():
 		'adjustText',
 		'pyBigWig',
 	],
-
+	scripts = ["tobias/utils/filter_important_factors.py"],
 	classifiers=[
 		'License :: OSI Approved :: MIT License',
 		'Intended Audience :: Science/Research',
diff --git a/tobias/footprinting/ATACorrect.py b/tobias/footprinting/ATACorrect.py
index 91ba3d2..3dafff8 100644
--- a/tobias/footprinting/ATACorrect.py
+++ b/tobias/footprinting/ATACorrect.py
@@ -262,7 +262,6 @@ def run_atacorrect(args):
 	#### Statistics about regions ####
 	genome_bp = sum([region.get_length() for region in regions_dict["genome"]])
 	for key in regions_dict:
-
 		total_bp = sum([region.get_length() for region in regions_dict[key]])
 		logger.stats("{0}: {1} regions | {2} bp | {3:.2f}% coverage".format(key, len(regions_dict[key]), total_bp, total_bp/genome_bp*100))
@@ -272,6 +271,11 @@ def run_atacorrect(args):
 	peak_regions = regions_dict["peak_regions"]
 	nonpeak_regions = regions_dict["nonpeak_regions"]
 
+	#Exit if no input/output regions were found
+	if len(input_regions) == 0 or len(output_regions) == 0 or len(peak_regions) == 0 or len(nonpeak_regions) == 0:
+		logger.error("No regions found - exiting!")
+		sys.exit()
+
 	#----------------------------------------------------------------------------------------------------#
 	# Estimate normalization factors
 	#----------------------------------------------------------------------------------------------------#
diff --git a/tobias/footprinting/BINDetect.py b/tobias/footprinting/BINDetect.py
index 4e4fba3..a121556 100644
--- a/tobias/footprinting/BINDetect.py
+++ b/tobias/footprinting/BINDetect.py
@@ -43,12 +43,11 @@
 from tobias.utils.motifs import *
 from tobias.utils.logger import *
 
-#np.seterr(divide = 'ignore')
-
 #For warnings from curve_fit
 import warnings
 from scipy.optimize import OptimizeWarning
 warnings.simplefilter("ignore", OptimizeWarning)
+warnings.simplefilter("ignore", RuntimeWarning)
 
 
 #--------------------------------------------------------------------------------------------------------------#
@@ -178,9 +177,9 @@ def run_bindetect(args):
 	#output and order
 	titles = []
 	if args.debug:
-		for cond in args.cond_names:
-			titles.append("Score distribution of {0} scores".format(cond))
-
+
+		titles.append("Raw score distributions")
+		titles.append("Normalized score distributions")
 	for (cond1, cond2) in comparisons:
 		titles.append("Background log2FCs ({0} / {1})".format(cond1, cond2))
@@ -216,10 +215,10 @@ def run_bindetect(args):
 		if len(args.peak_header_list) != peak_columns:
 			logger.error("Length of --peak_header ({0}) does not fit number of columns in --peaks ({1}).".format(len(args.peak_header_list), peak_columns))
 			sys.exit()
-
 	else:
-		args.peak_header_list = None
-
+		args.peak_header_list = ["peak_chr", "peak_start", "peak_end"] + ["additional_" + str(num + 1) for num in range(peak_columns-3)]
+	logger.debug("Peak header list: {0}".format(args.peak_header_list))
+
 	##### GC content for motif scanning ######
 	fasta_obj = pysam.FastaFile(args.genome)
 	fasta_chroms = fasta_obj.references
@@ -240,15 +239,11 @@ def run_bindetect(args):
 
 	################ Get motifs ################
-
 	logger.info("Reading motifs from file")
 	motif_content = open(args.motifs).read()
 	converted_content = convert_motif(motif_content, "pfm")
 	motif_list = pfm_to_motifs(converted_content) 	#List of OneMotif objects
-
-	#motif_list = [motif.set_prefix(args.naming) for motif in motif_list]
-	#motif_prefix_list = [motif.prefix for motif in motif_list]
 
 	no_pfms = len(motif_list)
 	logger.info("- Found {0} motifs in file".format(no_pfms))
@@ -289,10 +284,8 @@ def run_bindetect(args):
 	#-------------------------------------------------------------------------------------------------------------#
 
 	logger.comment("")
-
 	logger.start_logger_queue()		#start process for listening and handling through the main logger queue
 	args.log_q = logger.queue 		#queue for multiprocessing logging
-	manager = mp.Manager()
 
 	logger.info("Scanning for motifs and matching to signals...")
@@ -301,7 +294,9 @@ def run_bindetect(args):
 	qs_list = []
 	writer_qs = {}
 
-	#create_writer_queue(key2file, writer_cores)
+	#writer_queue = create_writer_queue(key2file, writer_cores)
+	#writer_queue.stop()	#wait until all are done
+	manager = mp.Manager()
 
 	TF_names_chunks = [motif_names[i::writer_cores] for i in range(writer_cores)]
 	for TF_names_sub in TF_names_chunks:
@@ -361,82 +356,83 @@ def run_bindetect(args):
 	for bigwig in args.cond_names:
 		background["signal"][bigwig] = np.array(background["signal"][bigwig])
 
-
 	###### Estimate score distribution to define bound/unbound threshold per condition ######
 	logger.comment("")
-	logger.info("Estimating score distributions per condition")
-	args.thresholds = {}
-	pseudos = []
-	figures = []	#save figures before saving to file to unify x-ranges
-	gmms = {}
-	all_log_params = {}
+	logger.info("Estimating score distribution per condition")
 
-	for bigwig in args.cond_names:
+	if args.debug == True:
+		fig = plot_score_distribution([background["signal"][bigwig] for bigwig in args.cond_names], labels=args.cond_names, title="Raw scores per condition")
+		figure_pdf.savefig(fig, bbox_inches='tight')
+		plt.close()
 
-		#Prepare scores (remove 0's etc.)
-		bg_values = np.copy(background["signal"][bigwig])
-		logger.debug("{0} scores for bigwig {1}".format(len(bg_values), bigwig))
+	logger.info("Normalizing scores")
+	list_of_vals = [background["signal"][bigwig] for bigwig in args.cond_names]
+	normed, norm_objects = quantile_normalization(list_of_vals)
 
-		bg_values = bg_values[np.logical_not(np.isclose(bg_values, 0.0))]	#only non-zero counts
-		x_max = np.percentile(bg_values, [99])
-		bg_values = bg_values[bg_values < x_max]
+	args.norm_objects = dict(zip(args.cond_names, norm_objects))
+	for bigwig in args.cond_names:
+		background["signal"][bigwig] = args.norm_objects[bigwig].normalize(background["signal"][bigwig])
+
+	if args.debug == True:
+		fig = plot_score_distribution([background["signal"][bigwig] for bigwig in args.cond_names], labels=args.cond_names, title="Normalized scores per condition")
+		figure_pdf.savefig(fig, bbox_inches='tight')
+		plt.close()
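
The normalization introduced here fits a sigmoid-shaped scaling factor per condition so that each array's upper quantiles land on the cross-condition mean (the `sigmoid`/`ArrayNorm`/`quantile_normalization` helpers are added to BINDetect_functions.py further down in this patch). A minimal self-contained sketch of the idea, on invented random arrays rather than TOBIAS data:

```python
import numpy as np
from scipy.optimize import curve_fit

def sigmoid(x, a, b, L, shift):
    """a is the x-value at the sigmoid midpoint"""
    return L / (1 + np.exp(-b * (x - a))) + shift

rng = np.random.default_rng(1)
cond1 = rng.lognormal(0.0, 1.0, 10000)   # stand-ins for per-condition background scores
cond2 = rng.lognormal(0.5, 1.2, 10000)

quantiles = np.linspace(0.2, 0.999, 500)
q = [np.quantile(arr[arr > 0], quantiles) for arr in (cond1, cond2)]
mean_q = np.mean(q, axis=0)              # target quantiles across conditions

# Fit a smooth scaling factor for cond2 so its quantiles match the mean
popt, _ = curve_fit(sigmoid, q[1], mean_q / q[1],
                    bounds=((0, -np.inf, 0, 0), (np.inf, np.inf, np.inf, np.inf)),
                    maxfev=10000)
cond2_normed = cond2 * sigmoid(cond2, *popt)   # this is what ArrayNorm.normalize() does
```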
+	###########################################################
+	logger.info("Estimating bound/unbound threshold")
+
+	#Prepare scores (remove 0's etc.)
+	bg_values = np.array(normed).flatten()
+	bg_values = bg_values[np.logical_not(np.isclose(bg_values, 0.0))]	#only non-zero counts
+	x_max = np.percentile(bg_values, [99])
+	bg_values = bg_values[bg_values < x_max]
 
-		#Fit mixture of normals
-		lowest_bic = np.inf
-		for n_components in [2]:	#2 components
-			gmm = sklearn.mixture.GaussianMixture(n_components=n_components, random_state=1)
-			gmm.fit(np.log(bg_values).reshape(-1, 1))
-
-			bic = gmm.bic(np.log(bg_values).reshape(-1,1))
-			logger.debug("n_compontents: {0} | bic: {1}".format(n_components, bic))
-			if bic < lowest_bic:
-				lowest_bic = bic
-				best_gmm = gmm
-		gmm = best_gmm
-		gmms[bigwig] = best_gmm
-
-		#Extract most-right gaussian
-		means = gmm.means_.flatten()
-		sds = np.sqrt(gmm.covariances_).flatten()
-		chosen_i = np.argmax(means) 	#Mixture with largest mean
-		log_params = scipy.stats.lognorm.fit(bg_values[bg_values < x_max], f0=sds[chosen_i], fscale=np.exp(means[chosen_i]))
-		all_log_params[bigwig] = log_params
-
-		#Plot mixture
-		#if args.debug:
-		#	plt.hist(np.log(bg_values), bins='auto', density=True)
-		#	xlim = plt.xlim()
-		#	x = np.linspace(xlim[0], xlim[1], 1000)
-		#	for i in range(2):
-		#		pdf = scipy.stats.norm.pdf(x, means[i], sds[i])
-		#		plt.plot(x, pdf)
-		#
-		#	logprob = gmm.score_samples(x.reshape(-1, 1))
-		#	df = np.exp(logprob)
-		#	plt.plot(x, df)
-		#	plt.show()
-
-		#Mode of distribution
-		mode = scipy.optimize.fmin(lambda x: -scipy.stats.lognorm.pdf(x, *log_params), 0, disp=False)[0]
-		logger.debug("- Mode for condition {0} estimated at: {1}".format(bigwig, mode))
-		pseudo = mode / 2.0		#pseudo is half the mode
-		pseudos.append(pseudo)
+	#Fit mixture of normals
+	lowest_bic = np.inf
+	for n_components in [2]:	#2 components
+		gmm = sklearn.mixture.GaussianMixture(n_components=n_components, random_state=1)
+		gmm.fit(np.log(bg_values).reshape(-1, 1))
 
-		# Estimate theoretical normal for threshold
-		leftside_x = np.linspace(scipy.stats.lognorm(*log_params).ppf([0.01]), mode, 100)
-		leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params)
-
-		#Flip over
-		mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x]).flatten()
-		mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]).flatten()
-		popt, cov = curve_fit(lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x, mirrored_pdf)
-		norm_params = (mode, popt[0])
-		logger.debug("Theoretical normal parameters: {0}".format(norm_params))
-
-		#Set threshold for bound/unbound
-		threshold = round(scipy.stats.norm.ppf(1-args.bound_pvalue, *norm_params), 5)
-		args.thresholds[bigwig] = threshold
-		logger.stats("- Threshold for condition {0} estimated at: {1}".format(bigwig, threshold))
+		bic = gmm.bic(np.log(bg_values).reshape(-1,1))
+		logger.debug("n_components: {0} | bic: {1}".format(n_components, bic))
+		if bic < lowest_bic:
+			lowest_bic = bic
+			best_gmm = gmm
+	gmm = best_gmm
+
+	#Extract most-right gaussian
+	means = gmm.means_.flatten()
+	sds = np.sqrt(gmm.covariances_).flatten()
+	chosen_i = np.argmax(means) 	#Mixture with largest mean
+
+	log_params = scipy.stats.lognorm.fit(bg_values[bg_values < x_max], f0=sds[chosen_i], fscale=np.exp(means[chosen_i]))
+	#all_log_params[bigwig] = log_params
+
+	#Mode of distribution
+	mode = scipy.optimize.fmin(lambda x: -scipy.stats.lognorm.pdf(x, *log_params), 0, disp=False)[0]
+	logger.debug("- Mode estimated at: {0}".format(mode))
+	pseudo = mode / 2.0		#pseudo is half the mode
+	args.pseudo = pseudo
+	logger.debug("Pseudocount estimated at: {0}".format(round(args.pseudo, 5)))
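
To make the mixture step above concrete: in log-space the background scores split into a noise and a signal component; the right-most gaussian seeds a log-normal fit whose mode yields the pseudocount. A self-contained sketch on synthetic data (array contents invented):

```python
import numpy as np
import scipy.stats
import scipy.optimize
import sklearn.mixture

rng = np.random.default_rng(1)
bg_values = np.concatenate([rng.lognormal(-1, 0.3, 5000),   # "noise" component
                            rng.lognormal(1, 0.5, 5000)])   # "signal" component

gmm = sklearn.mixture.GaussianMixture(n_components=2, random_state=1)
gmm.fit(np.log(bg_values).reshape(-1, 1))

means = gmm.means_.flatten()
sds = np.sqrt(gmm.covariances_).flatten()
chosen_i = np.argmax(means)          # right-most gaussian = signal scores

log_params = scipy.stats.lognorm.fit(bg_values, f0=sds[chosen_i],
                                     fscale=np.exp(means[chosen_i]))
mode = scipy.optimize.fmin(lambda x: -scipy.stats.lognorm.pdf(x, *log_params),
                           0, disp=False)[0]
pseudo = mode / 2.0                  # pseudocount as in the patch
```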
+
+	# Estimate theoretical normal for threshold
+	leftside_x = np.linspace(scipy.stats.lognorm(*log_params).ppf([0.01]), mode, 100)
+	leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params)
+
+	#Flip over
+	mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x]).flatten()
+	mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]).flatten()
+	popt, cov = curve_fit(lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x, mirrored_pdf)
+	norm_params = (mode, popt[0])
+	logger.debug("Theoretical normal parameters: {0}".format(norm_params))
+
+	#Set threshold for bound/unbound
+	threshold = round(scipy.stats.norm.ppf(1-args.bound_pvalue, *norm_params), 5)
+
+	args.thresholds = {bigwig: threshold for bigwig in args.cond_names}
+	logger.stats("- Threshold estimated at: {0}".format(threshold))
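
The threshold construction can be hard to parse from the diff alone: only the left flank of the fitted log-normal (up to its mode) is trusted as the unbound null, so it is mirrored past the mode to fake a symmetric distribution, a normal is fitted to that, and the threshold is its (1 - bound_pvalue) quantile. A self-contained sketch with assumed fit parameters (`log_params` and `bound_pvalue` are invented stand-ins):

```python
import numpy as np
import scipy.stats
from scipy.optimize import curve_fit

log_params = (0.5, 0.0, np.exp(1.0))              # (shape, loc, scale) - assumed lognorm fit
mode = log_params[2] * np.exp(-log_params[0]**2)  # analytic mode of a log-normal
bound_pvalue = 0.001                              # stand-in for args.bound_pvalue

leftside_x = np.linspace(scipy.stats.lognorm(*log_params).ppf(0.01), mode, 100)
leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params)

# Mirror the left flank to build a symmetric pseudo-null around the mode
mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x])
mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]])

popt, _ = curve_fit(lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std),
                    mirrored_x, mirrored_pdf)
threshold = scipy.stats.norm.ppf(1 - bound_pvalue, mode, popt[0])
```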
+
+	#Only plot if args.debug is True
+	if args.debug:
 
 		#Plot fit
 		fig, ax = plt.subplots(1, 1)
@@ -455,50 +451,20 @@ def run_bindetect(args):
 		ax.text(threshold, ymax, "\n {0:.3f}".format(threshold), va="top")
 
 		#Decorate plot
-		plt.title("Score distribution of \"{0}\" scores".format(bigwig))
+		plt.title("Score distribution")
 		plt.xlabel("Bigwig score")
 		plt.ylabel("Density")
 		plt.legend(fontsize=8)
 		plt.xlim((0,x_max))
 
-		figures.append((ax, fig))
-
-
+		figure_pdf.savefig(fig)
+		plt.close(fig)
 
-	#Quantile normalization
-	ref = args.cond_names[0]
-	for cond in args.cond_names[1:]:
-
-		ref_quantiles = scipy.stats.lognorm(5, *all_log_params[ref])
-		cond_quantiles = scipy.stats.lognorm(5, *all_log_params[cond])
-
-		print(ref_quantiles)
-		print(cond_quantiles)
-
-		plt.plot(ref_quantiles, cond_quantiles)
-		#all_log_params[bigwig]
-
-
-	#Only plot if args.debug is True
-	if args.debug:
-		#Set x-max of all plots equal
-		xlim = np.min([ax.get_xlim()[1] for ax, fig in figures])
-		for (ax, fig) in figures:
-			ax.set_xlim(0,xlim)
-			figure_pdf.savefig(fig)
-			plt.close(fig)
-
-	#Estimate pseudocount
-	if args.pseudo == None:
-		args.pseudo = np.mean(pseudo)
-		logger.debug("Pseudocount estimated at: {0}".format(round(args.pseudo, 5)))
-
-
 	############ Foldchanges between conditions ################
 	logger.comment("")
 	log2fc_params = {}
 	if len(args.signals) > 1:
-		logger.info("Calculating log2 fold changes between conditions")
+		logger.info("Calculating background log2 fold-changes between conditions")
 
 		for (bigwig1, bigwig2) in comparisons:	#cond1, cond2
 			logger.info("- {0} / {1}".format(bigwig1, bigwig2))
@@ -529,7 +495,7 @@ def run_bindetect(args):
 			xvals = np.linspace(plt.xlim()[0], plt.xlim()[1], 100)
 			pdf = scipy.stats.norm.pdf(xvals, *log2fc_params[(bigwig1, bigwig2)])
 			plt.plot(xvals, pdf, label="Normal distribution fit")
-			plt.title("Background log2FCs ({0} / {1})".format(cond1, cond2))
+			plt.title("Background log2FCs ({0} / {1})".format(bigwig1, bigwig2))
 			plt.xlabel("Log2 fold change")
 			plt.ylabel("Density")
@@ -545,7 +511,7 @@ def run_bindetect(args):
 	logger.comment("")
 	logger.info("Processing scanned TFBS individually")
-
+
 	#Getting bindetect table ready
 	info_columns = ["total_tfbs"]
 	info_columns.extend(["{0}_{1}".format(cond, metric) for (cond, metric) in itertools.product(args.cond_names, ["threshold", "bound"])])
@@ -595,7 +561,7 @@ def run_bindetect(args):
 	logger.comment("")
 	logger.info("Writing all_bindetect files")
-
+
 	#Add columns of name / motif_id / prefix
 	names = []
 	ids = []
@@ -604,14 +570,9 @@ def run_bindetect(args):
 	for motif in motif_list:
 		names.append(motif[0].name)
 		ids.append(motif[0].id)
 
-	info_table.insert(0, "name", names)
-	info_table.insert(1, "motif_id", ids)
-	info_table.insert(2, "output_prefix", info_table.index)
-
-	#Condition specific
-	info_table["total_tfbs"] = info_table["total_tfbs"].map(int)
-	for condition in args.cond_names:
-		info_table[condition + "_bound"] = info_table[condition + "_bound"].map(int)
+	info_table.insert(0, "output_prefix", info_table.index)
+	info_table.insert(1, "name", names)
+	info_table.insert(2, "motif_id", ids)
 
 	#Add cluster to info_table
 	cluster_names = []
 	for name in info_table.index:
 		cluster = [cluster for cluster in clustering.clusters if name in clustering.clusters[cluster]["member_names"]][0]
 		cluster_names.append(clustering.clusters[cluster]["cluster_name"])
 	info_table.insert(3, "cluster", cluster_names)
 
+	#Cluster table on motif clusters
+	info_table_clustered = info_table.groupby("cluster").mean()	#mean of each column
+	info_table_clustered.reset_index(inplace=True)
+
+	#Map correct type
+	info_table["total_tfbs"] = info_table["total_tfbs"].map(int)
+	for condition in args.cond_names:
+		info_table[condition + "_bound"] = info_table[condition + "_bound"].map(int)
+
 	#### Write excel ###
 	bindetect_excel = os.path.join(args.outdir, args.prefix + "_results.xlsx")
 	writer = pd.ExcelWriter(bindetect_excel, engine='xlsxwriter')
-	info_table.to_excel(writer, index=False)
-
-	worksheet = writer.sheets['Sheet1']
-	no_rows, no_cols = info_table.shape
-	worksheet.autofilter(0,0,no_rows,no_cols)
+
+	#Tables
+	info_table.to_excel(writer, index=False, sheet_name="Individual motifs")
+	info_table_clustered.to_excel(writer, index=False, sheet_name="Motif clusters")
+
+	for sheet in writer.sheets:
+		worksheet = writer.sheets[sheet]
+		n_rows = worksheet.dim_rowmax
+		n_cols = worksheet.dim_colmax
+		worksheet.autofilter(0,0,n_rows,n_cols)
+
 	writer.save()
 
 	#Format comparisons
diff --git a/tobias/footprinting/BINDetect_functions.py b/tobias/footprinting/BINDetect_functions.py
index 812985d..8f265f1 100644
--- a/tobias/footprinting/BINDetect_functions.py
+++ b/tobias/footprinting/BINDetect_functions.py
@@ -26,6 +26,7 @@
 from cycler import cycler
 from matplotlib.lines import Line2D
 from adjustText import adjust_text
+from scipy.optimize import curve_fit
 
 #Bio-specific packages
 import pyBigWig
@@ -42,11 +43,108 @@
 from tobias.utils.signals import *
 
 import warnings
+#np.seterr(all='raise')
 
 #------------------------------------------------------------------------------------------------------#
 #------------------------------------------------------------------------------------------------------#
 #------------------------------------------------------------------------------------------------------#
 
+def sigmoid(x, a, b, L, shift):
+	""" a is the xvalue at the sigmoid midpoint """
+
+	y = L / (1 + np.exp(-b*(x-a))) + shift
+	return y
+
+class ArrayNorm:
+
+	def __init__(self, popt):
+		self.popt = popt
+
+	def normalize(self, arr):
+		return(arr * sigmoid(arr, *self.popt))
+
+def dict_to_tab(dict_list, fname, chosen_columns, header=False):
+
+	#Establish header
+	if header == True:
+		out_str = "\t".join(chosen_columns) + "\n"
+	else:
+		out_str = ""
+
+	#Add lines
+	out_str += "\n".join(["\t".join([str(line_dict[column]) for column in chosen_columns]) for line_dict in dict_list]) + "\n"
+
+	#Write file
+	f = open(fname, "w")
+	f.write(out_str)
+	f.close()
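
A quick usage note for the new `dict_to_tab` helper above — it writes a list of per-site dicts straight to a tab-separated file, and the optional header lets the same call serve both headerless BED output and the overview table. A tiny sketch (values invented; assumes `dict_to_tab` from the hunk above is in scope):

```python
sites = [
    {"TFBS_chr": "chr1", "TFBS_start": 100, "TFBS_end": 110, "cond1_score": 1.25},
    {"TFBS_chr": "chr1", "TFBS_start": 500, "TFBS_end": 510, "cond1_score": 0.0},
]
columns = ["TFBS_chr", "TFBS_start", "TFBS_end", "cond1_score"]
dict_to_tab(sites, "sites_all.bed", columns)                     # headerless, BED-style
dict_to_tab(sites, "sites_overview.txt", columns, header=True)   # with header line
```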
+
+#Quantile normalization
+def quantile_normalization(list_of_arrays):	#lists paired values to normalize
+
+	n = len(list_of_arrays)	#number of arrays to normalize
+
+	#Calculate
+	quantiles = np.linspace(0.2,0.999,500)
+	array_quantiles = [np.quantile(arr[arr > 0], quantiles) for arr in list_of_arrays]
+	mean_array_quantiles = [np.mean([array_quantiles[i][j] for i in range(n)]) for j in range(len(quantiles))]
+
+	norm_objects = []
+	for i in range(n):
+
+		#Plot q-q
+		#f, ax = plt.subplots()
+		#plt.scatter(array_quantiles[i], mean_array_quantiles)
+		#ax.set_xlim(0,1)
+		#ax.set_ylim(0,1)
+		#ax.plot([0, 1], [0, 1], transform=ax.transAxes)
+		#plt.close()
+		#plt.show()
+
+		#Plot normalization factors
+		#f, ax = plt.subplots()
+		xdata = array_quantiles[i]
+		ydata = mean_array_quantiles/xdata
+		#plt.scatter(xdata, ydata)
+		#plt.close()
+
+		popt, pcov = curve_fit(sigmoid, xdata, ydata, bounds=((0,-np.inf, 0, 0), (np.inf, np.inf, np.inf, np.inf)))
+		norm_objects.append(ArrayNorm(popt))
+
+		y_est = sigmoid(xdata, *popt)
+		plt.plot(xdata, y_est)
+
+		plt.close()
+
+	#Normalize each array using means per rank
+	normed = []
+	for i in range(n):	#for each array
+		normed.append(norm_objects[i].normalize(list_of_arrays[i]))
+
+	return((normed, norm_objects))
+
+def plot_score_distribution(list_of_arr, labels=[], title="Score distribution"):
+
+	fig, ax = plt.subplots(1, 1)
+	xlim = []
+	for i in range(len(list_of_arr)):
+
+		values = np.array(list_of_arr[i])
+		x_max = np.percentile(values, [99])
+		values = values[values < x_max]
+		xlim.append(x_max)
+
+		plt.hist(values, bins=100, alpha=.4, density=True, label=labels[i])
+
+	ax.set_xlabel("Scores")
+	ax.set_ylabel("Density")
+	ax.set_xlim(0, min(xlim))
+	plt.legend()
+	plt.title(title)
+
+	return(fig)
+
+
 def get_gc_content(regions, fasta):
 	""" Get GC content from regions in fasta """
 	nuc_count = {"T":0, "t":0, "A":0, "a":0, "G":1, "g":1, "C":1, "c":1}
@@ -67,7 +165,6 @@ def get_gc_content(regions, fasta):
 
 #---------------------------------------------------------------------------------------------------------#
 #------------------------------------------- Main functions ----------------------------------------------#
 #---------------------------------------------------------------------------------------------------------#
-
 def scan_and_score(regions, motifs_obj, args, log_q, qs):
 	""" Scanning and scoring runs in parallel for subsets of regions """
@@ -172,90 +269,100 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs):
 def process_tfbs(TF_name, args, log2fc_params): 	#per tf
 	""" Processes single TFBS to split into bound/unbound and write out overview file """
 
-	begin_time = datetime.now()
+	#begin_time = datetime.now()
 
 	logger = TobiasLogger("", args.verbosity, args.log_q) 	#sending all logger calls to log_q
 
 	#Pre-scanned sites to read
 	bed_outdir = os.path.join(args.outdir, TF_name, "beds")
 	filename = os.path.join(bed_outdir, TF_name + ".tmp")
 	no_cond = len(args.cond_names)
 	comparisons = args.comparisons
 
-	#Get info table ready
-	info_columns = ["total_tfbs"]
-	info_columns.extend(["{0}_{1}".format(cond, metric) for (cond, metric) in itertools.product(args.cond_names, ["mean_score", "bound"])])
-	info_columns.extend(["{0}_{1}_{2}".format(comparison[0], comparison[1], metric) for (comparison, metric) in itertools.product(comparisons, ["change", "pvalue"])])
-	rows, cols = 1, len(info_columns)
-	info_table = pd.DataFrame(np.zeros((rows, cols)), columns=info_columns, index=[TF_name])
-
-	#Read file to pandas
-	arr = np.genfromtxt(filename, dtype=None, delimiter="\t", names=None, encoding="utf8", autostrip=True)	#Read using genfromtxt to get automatic type
-	bed_table = pd.DataFrame(arr, index=None, columns=None)
-	no_rows, no_cols = bed_table.shape
-
-	#no_rows, no_cols = overview_table.shape
-	info_table.at[TF_name, "total_tfbs"] = no_rows
+	#Read file to list of dicts
+	header = ["TFBS_chr", "TFBS_start", "TFBS_end", "TFBS_name", "TFBS_score", "TFBS_strand"] + args.peak_header_list + ["{0}_score".format(condition) for condition in args.cond_names]
+	with open(filename) as f:
+		bedlines = [dict(zip(header, line.rstrip().split("\t"))) for line in f.readlines()]
+	n_rows = len(bedlines)
"total_tfbs"] = no_rows + #Read file to list of dicts + header = ["TFBS_chr", "TFBS_start", "TFBS_end", "TFBS_name", "TFBS_score", "TFBS_strand"] + args.peak_header_list + ["{0}_score".format(condition) for condition in args.cond_names] + with open(filename) as f: + bedlines = [dict(zip(header, line.rstrip().split("\t"))) for line in f.readlines()] + n_rows = len(bedlines) - #Set header in overview - header = [""]*no_cols - header[:6] = ["TFBS_chr", "TFBS_start", "TFBS_end", "TFBS_name", "TFBS_score", "TFBS_strand"] + ############################## Local effects ############################### - if args.peak_header_list != None: - header[6:6+len(args.peak_header_list)] = args.peak_header_list - else: - no_peak_col = len(header[6:]) - header[6:6+no_peak_col] = ["peak_chr", "peak_start", "peak_end"] + ["additional_" + str(num + 1) for num in range(no_peak_col-3)] - - header[-no_cond:] = ["{0}_score".format(condition) for condition in args.cond_names] #signal scores - bed_table.columns = header + #Sort, scale and calculate log2fc + bedlines = sorted(bedlines, key=lambda line: (line["TFBS_chr"], int(line["TFBS_start"]), int(line["TFBS_end"]))) + for line in bedlines: + + #Condition specific + for condition in args.cond_names: + threshold = args.thresholds[condition] + line[condition + "_score"] = float(line[condition + "_score"]) + line[condition + "_score"] = np.round(args.norm_objects[condition].normalize(line[condition + "_score"] ), 5) + line[condition + "_bound"] = 1 if line[condition + "_score"] > threshold else 0 - #Sort and format - bed_table = bed_table.sort_values(["TFBS_chr", "TFBS_start", "TFBS_end"]) - for condition in args.cond_names: - bed_table[condition + "_score"] = bed_table[condition + "_score"].round(5) + #Comparison specific + for i, (cond1, cond2) in enumerate(comparisons): + base = "{0}_{1}".format(cond1, cond2) + line[base + "_log2fc"] = round(np.log2((line[cond1 + "_score"] + args.pseudo) / (line[cond2 + "_score"] + args.pseudo)), 5) #### Write _all file #### - chosen_columns = [col for col in header] outfile = os.path.join(bed_outdir, TF_name + "_all.bed") - bed_table.to_csv(outfile, sep="\t", index=False, header=False, columns=chosen_columns) + dict_to_tab(bedlines, outfile, header) - #### Estimate bound/unbound split #### + #### Write _bound/_unbound files #### for condition in args.cond_names: + chosen_columns = header[:-no_cond-1] + [condition + "_score"] + + #Subset bedlines per state + for state in ["bound", "unbound"]: + outfile = os.path.join(bed_outdir, "{0}_{1}_{2}.bed".format(TF_name, condition, state)) + chosen_bool = 1 if state == "bound" else 0 + bedlines_subset = [bedline for bedline in bedlines if bedline[condition + "_bound"] == chosen_bool] + bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True) + dict_to_tab(bedlines_subset, outfile, chosen_columns) - threshold = args.thresholds[condition] - bed_table[condition + "_bound"] = np.where(bed_table[condition + "_score"] > threshold, 1, 0).astype(int) + ##### Write overview with scores, bound and log2fcs #### + overview_columns = header + [condition + "_bound" for condition in args.cond_names] + ["{0}_{1}_log2fc".format(cond1, cond2) for (cond1, cond2) in comparisons] + overview_txt = os.path.join(args.outdir, TF_name, TF_name + "_overview.txt") + dict_to_tab(bedlines, overview_txt, overview_columns, header=True) + + #Write xlsx overview + bed_table = pd.DataFrame(bedlines) + try: + overview_excel = os.path.join(args.outdir, TF_name, TF_name + 
"_overview.xlsx") + writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter') + bed_table.to_excel(writer, index=False, columns=overview_columns) - info_table.at[TF_name, condition + "_mean_score"] = round(np.mean(bed_table[condition + "_score"]), 5) - info_table.at[TF_name, condition + "_bound"] = np.sum(bed_table[condition + "_bound"].values) #_bound contains bool 0/1 + worksheet = writer.sheets['Sheet1'] + no_rows, no_cols = bed_table.shape + worksheet.autofilter(0,0,no_rows, no_cols-1) + writer.save() - #Write bound/unbound - for (condition, state) in itertools.product(args.cond_names, ["bound", "unbound"]): + except: + print("Error writing excelfile for TF {0}".format(TF_name)) + sys.exit() - outfile = os.path.join(bed_outdir, "{0}_{1}_{2}.bed".format(TF_name, condition, state)) - #Subset bed table - chosen_bool = 1 if state == "bound" else 0 - bed_table_subset = bed_table.loc[bed_table[condition + "_bound"] == chosen_bool] - bed_table_subset = bed_table_subset.sort_values([condition + "_score"], ascending=False) + ############################## Global effects ############################## - #Write out subset with subset of columns - chosen_columns = header[:-no_cond-1] + [condition + "_score"] - bed_table_subset.to_csv(outfile, sep="\t", index=False, header=False, columns=chosen_columns) + #Get info table ready + info_columns = ["total_tfbs"] + info_columns.extend(["{0}_{1}".format(cond, metric) for (cond, metric) in itertools.product(args.cond_names, ["mean_score", "bound"])]) + info_columns.extend(["{0}_{1}_{2}".format(comparison[0], comparison[1], metric) for (comparison, metric) in itertools.product(comparisons, ["change", "pvalue"])]) + rows, cols = 1, len(info_columns) + info_table = pd.DataFrame(np.zeros((rows, cols)), columns=info_columns, index=[TF_name]) + #Fill in info table + info_table.at[TF_name, "total_tfbs"] = n_rows - #### Calculate statistical test in comparison to background #### + for condition in args.cond_names: + info_table.at[TF_name, condition + "_mean_score"] = round(np.mean(bed_table[condition + "_score"]), 5) + info_table.at[TF_name, condition + "_bound"] = np.sum(bed_table[condition + "_bound"].values) #_bound contains bool 0/1 + + #### Calculate statistical test for binding in comparison to background #### fig_out = os.path.abspath(os.path.join(args.outdir, TF_name, "plots", TF_name + "_log2fcs.pdf")) log2fc_pdf = PdfPages(fig_out, keep_empty=True) for i, (cond1, cond2) in enumerate(comparisons): base = "{0}_{1}".format(cond1, cond2) - #Calculate log2fcs of TFBS for this TF - cond1_values = bed_table[cond1 + "_score"].values - cond2_values = bed_table[cond2 + "_score"].values - bed_table[base + "_log2fc"] = np.log2(np.true_divide(cond1_values + args.pseudo, cond2_values + args.pseudo)) - bed_table[base + "_log2fc"] = bed_table[base + "_log2fc"].round(5) - # Compare log2fcs to background log2fcs included = np.logical_or(bed_table[cond1 + "_score"].values > 0, bed_table[cond2 + "_score"].values > 0) subset = bed_table[included].copy() #included subset @@ -317,29 +424,8 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf plt.close(fig) log2fc_pdf.close() - - - ########## Write overview with scores, bound and log2fcs ############## - chosen_columns = [col for col in bed_table.columns if col != "GC"] - overview_txt = os.path.join(args.outdir, TF_name, TF_name + "_overview.txt") - bed_table.to_csv(overview_txt, sep="\t", index=False, header=True, columns=chosen_columns) - - #Write xlsx overview - try: - overview_excel = os.path.join(args.outdir, 
diff --git a/tobias/motifs/tfbscan.py b/tobias/motifs/tfbscan.py
index 912b121..7170df2 100644
--- a/tobias/motifs/tfbscan.py
+++ b/tobias/motifs/tfbscan.py
@@ -174,7 +174,11 @@ def run_tfbscan(args):
 
 	#Clip regions at chromosome boundaries
 	regions = regions.apply_method(OneRegion.check_boundary, fasta_chrom_info, "cut")
+	if len(regions) == 0:
+		logger.error("No regions found.")
+		sys.exit()
 	logger.info("- Total of {0} regions (after splitting)".format(len(regions)))
+
 	#Background gc
 	if args.gc == None:
diff --git a/tobias/plotting/plot_aggregate.py b/tobias/plotting/plot_aggregate.py
index 4a1342d..15a7f2f 100644
--- a/tobias/plotting/plot_aggregate.py
+++ b/tobias/plotting/plot_aggregate.py
@@ -216,6 +216,8 @@ def run_aggregate(args):
 
 	pybw.close()
 
+
+
 	#########################################################################################
 	################################## Calculate aggregates #################################
@@ -234,11 +236,11 @@ def run_aggregate(args):
 
 		#Exclude outliers from each column
 		lower_limit, upper_limit = -np.inf, np.inf
-		#lower_limit, upper_limit = np.percentile(signalmat[signalmat != 0], [1,99])
-
-		logger.debug("Lower limits: {0}".format(lower_limit))
-		logger.debug("Upper limits: {0}".format(upper_limit))
-		signalmat[np.logical_or(signalmat < lower_limit, signalmat > upper_limit)] = np.nan
+		#lower_limit, upper_limit = np.percentile(signalmat[signalmat != 0], [0.5,99.5])
+		logical = np.logical_and(np.min(signalmat, axis=1) > lower_limit, np.max(signalmat, axis=1) < upper_limit)
+		#logger.debug("Lower limits: {0}".format(lower_limit))
+		#logger.debug("Upper limits: {0}".format(upper_limit))
+		signalmat = signalmat[logical]
 
 		if args.log_transform:
 			signalmat_abs = np.abs(signalmat)
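
The plot_aggregate change above swaps element-wise NaN-masking for row-wise filtering: a site is now dropped entirely if any of its positions fall outside the limits, which keeps the signal matrix rectangular for the aggregate. A small sketch (matrix and limits invented; the committed defaults of -inf/inf keep every row):

```python
import numpy as np

signalmat = np.array([[0.1, 0.2, 0.3],
                      [0.1, 9.0, 0.2],    # row with one outlier cell
                      [0.4, 0.5, 0.6]])
lower_limit, upper_limit = 0.0, 1.0

keep = np.logical_and(np.min(signalmat, axis=1) > lower_limit,
                      np.max(signalmat, axis=1) < upper_limit)
signalmat = signalmat[keep]               # drops the middle row entirely
```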
diff --git a/tobias/plotting/plot_changes.py b/tobias/plotting/plot_changes.py
index 69a8790..4f709f0 100644
--- a/tobias/plotting/plot_changes.py
+++ b/tobias/plotting/plot_changes.py
@@ -16,6 +16,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import itertools
+from matplotlib.backends.backend_pdf import PdfPages
 
 from tobias.utils.utilities import *
 from tobias.utils.logger import *
@@ -33,10 +34,10 @@ def add_plotchanges_arguments(parser):
 
 	required_arguments = parser.add_argument_group('Required arguments')
 	required_arguments.add_argument('--bindetect', metavar="", help='Bindetect_results.txt file from BINDetect run')
-	required_arguments.add_argument('--TFS', metavar="", help='Text file containing names of TFs to show in plot (one per line)')
-
+
 	#All other arguments are optional
 	optional_arguments = parser.add_argument_group('Optional arguments')
+	optional_arguments.add_argument('--TFS', metavar="", help='Text file containing names of TFs to show in plot (one per line)')
 	optional_arguments.add_argument('--output', metavar="", help='Output file for plot (default: bindetect_changes.pdf)', default="bindetect_changes.pdf")
 	optional_arguments.add_argument('--conditions', metavar="", help="Ordered list of conditions to show (default: conditions are ordered as within the bindetect file)", nargs="*")
 	optional_arguments = add_logger_args(optional_arguments)
@@ -51,37 +52,45 @@ def run_plotchanges(args):
 
 	logger = TobiasLogger("PlotChanges", args.verbosity)
 	logger.begin()
 
-	check_required(args, ["bindetect", "TFS"])
+	check_required(args, ["bindetect"])
 	check_files([args.bindetect, args.TFS], "r")
 	check_files([args.output], "w")
 
+
 	#------------------------------------ Read data ------------------------------------#
 
 	logger.info("Reading data from bindetect file")
 
 	# Read in bindetect file
-	table = pd.read_csv(args.bindetect, sep="\t", index_col=0)
-	all_TFS = list(table.index)
+	bindetect = pd.read_csv(args.bindetect, sep="\t")
+	bindetect.set_index("output_prefix", inplace=True, drop=False)
+
+	all_TFS = list(bindetect["output_prefix"])
 	logger.info("{0} TFS found in bindetect file".format(len(all_TFS)))
 
 	#Read in TF names from --TFS:
-	given_TFS = open(args.TFS, "r").read().split()
-	logger.info("TFS given in --TFS: {0}".format(given_TFS))
+	if args.TFS != None:
+
+		given_TFS = open(args.TFS, "r").read().split()
+		logger.info("TFs given in --TFS: {0}".format(given_TFS))
 
-	#Find matches between all and given
-	logger.info("Matching given TFs with bindetect file...")
-	lofl = [given_TFS, all_TFS]
-	matches = match_lists(lofl)
+		#Find matches between all and given
+		logger.info("Matching given TFs with bindetect file...")
+		lofl = [given_TFS, all_TFS]
+		matches = match_lists(lofl)
 
-	for i, TF in enumerate(given_TFS):
-		logger.info("- {0} matched with: {1}".format(TF, matches[0][i]))
-
-	#Get tfs
-	chosen_TFS = list(flatten_list(matches))
-	logger.info("Chosen TFS to view in plot: {0}".format(chosen_TFS))
+		for i, TF in enumerate(given_TFS):
+			logger.info("- {0} matched with: {1}".format(TF, matches[0][i]))
+
+		#Get tfs
+		chosen_TFS = list(set(flatten_list(matches)))
+		logger.info("Chosen TFs to view in plot: {0}".format(chosen_TFS))
+	else:
+		logger.info("Showing all TFs in plot. Please use --TFS to subset output.")
+		chosen_TFS = all_TFS
 
 	# Get order of conditions
-	header = list(table.columns.values)
+	header = list(bindetect.columns.values)
 	conditions_file = [element.replace("_bound", "") for element in header if "bound" in element]
 	if args.conditions == None:
 		args.conditions = conditions_file
@@ -90,23 +99,68 @@ def run_plotchanges(args):
 		logger.info("ERROR: --conditions {0} is not a subset of bindetect conditions ({1})".format(args.conditions, conditions_file))
 		sys.exit()
 
-	#condition_comparison = list(itertools.combinations(args.conditions, 2))
 	logger.info("Conditions in order: {0}".format(args.conditions))
 
-	#------------------------------------ Make plot --------------------------------#
+	#------------------------------------ Make plots --------------------------------#
 
 	logger.info("Plotting figure")
 
-	cmap = matplotlib.cm.get_cmap('rainbow')
-	colors = cmap(np.linspace(0,1,len(chosen_TFS)))
-	fig, ax1 = plt.subplots(figsize=(10,5))
+	fig_out = os.path.abspath(args.output)
+	figure_pdf = PdfPages(fig_out, keep_empty=True)
+
+	#Changes over time for different measures
+	for cluster_flag in [False, True]:
+		#logger.info("- Use clusters: {0}".format(cluster_flag))
+
+		#Choose whether to show individual TFs or clusters
+		if cluster_flag == True:
+			table = bindetect.loc[chosen_TFS,].groupby("cluster").mean()	#mean of each column
+		else:
+			table = bindetect.loc[chosen_TFS]
+
+		#Get colors ready
+		cmap = matplotlib.cm.get_cmap('rainbow')
+		colors = cmap(np.linspace(0,1,len(table)))
+
+		xvals = np.arange(0,len(args.conditions))
+		for measure in ["n_bound", "percent_bound", "mean_score"]:
+			#logger.info("-- {0}".format(measure))
+			fig, ax = plt.subplots(figsize=(10,5))
+			for i, TF in enumerate(table.index):
+
+				if measure == "n_bound":
+					yvals = np.array([table.at[TF, "{0}_bound".format(cond)] for cond in args.conditions])
+				elif measure == "percent_bound":
+					n_bound = np.array([table.at[TF, "{0}_bound".format(cond)] for cond in args.conditions])
+					yvals = n_bound / table.at[TF, "total_tfbs"] * 100.0	#percent bound
+				elif measure == "mean_score":
+					yvals = np.array([table.at[TF, "{0}_mean_score".format(cond)] for cond in args.conditions])
+
+				ax.plot(xvals, yvals, color=colors[i], marker="o", label=TF)
+				ax.annotate(TF, (xvals[0]-0.1, yvals[0]), color=colors[i], horizontalalignment="right", verticalalignment="center", fontsize=6)
+
+			#General
+			plt.title("Changes in TF binding across conditions")
+			plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=3, markerscale=0.5)
+			plt.xticks(xvals, args.conditions)
+			plt.xlabel("Conditions")
+
+			if measure == "n_bound":
+				plt.ylabel("Number of sites predicted bound", color="black")
+			elif measure == "percent_bound":
+				plt.ylabel("Percent of sites predicted bound", color="black")
+			elif measure == "mean_score":
+				plt.ylabel("Mean binding score", color="black")
+			ax.tick_params('y', colors='black')
+
+			plt.xlim(xvals[0]-2, xvals[-1]+0.5)
+			figure_pdf.savefig(fig, bbox_inches='tight')
+			plt.close()
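
The `cluster_flag` branch above collapses the bindetect table to one line per motif cluster before plotting; the mechanics are plain pandas. A toy sketch (table contents invented; non-numeric columns are dropped by the mean):

```python
import pandas as pd

bindetect = pd.DataFrame({
    "cluster":     ["C_AP1", "C_AP1", "C_CTCF"],
    "cond1_bound": [120, 80, 300],
    "total_tfbs":  [1000, 900, 2000],
}, index=["JUN", "FOS", "CTCF"])

table = bindetect.groupby("cluster").mean()
# cluster  cond1_bound  total_tfbs
# C_AP1          100.0       950.0
# C_CTCF         300.0      2000.0
```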
"{0}_{1}_change".format(cond1, cond2)] for (cond1, cond2) in condition_comparison] - - percent_bound = no_bound / table.at[TF, "total_tfbs"] * 100.0 - - #Number of bound sites - xvals = np.arange(0,len(args.conditions)) - ax1.plot(xvals, percent_bound, color=colors[i], marker="o", label=TF) - - #Annotate - ax1.annotate(TF, (xvals[0]-0.1, percent_bound[0]), color=colors[i], horizontalalignment="right", verticalalignment="center") - - ax1.set_ylabel("Percent of total sites predicted bound", color="black") - ax1.tick_params('y', colors='black') - - #Change between conditions - """ + ax2 = ax1.twinx() xvals_shift = np.arange(0.5,len(condition_comparison),1) @@ -140,19 +178,7 @@ def run_plotchanges(args): ax2.tick_params('y', colors='r') """ - #General - plt.title("Changes in TF binding across conditions") - plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') - plt.xticks(xvals, args.conditions) - plt.xlabel("Conditions") - - plt.xlim(xvals[0]-2, xvals[-1]+0.5) - - #plt.tight_layout() - plt.savefig(args.output, format="pdf", bbox_inches='tight') - #plt.show() - - + figure_pdf.close() logger.end() logger.info("Saved figure to {0}".format(args.output)) diff --git a/tobias/utils/regions.py b/tobias/utils/regions.py index 844c38e..ed34e53 100644 --- a/tobias/utils/regions.py +++ b/tobias/utils/regions.py @@ -379,14 +379,14 @@ def subtract(self, b_regions): self.loc_sort() b_regions.loc_sort() - a_len = self.count() - b_len = b_regions.count() + #a_len = self.count() + #b_len = b_regions.count() chromlist = sorted(list(set([region.chrom for region in self] + [region.chrom for region in b_regions]))) chrom_pos = {chrom:chromlist.index(chrom) for chrom in chromlist} a = self - b = b_regions + #b = b_regions a_i = 0 b_i = 0 @@ -577,33 +577,38 @@ def cluster(self, threshold=0.5, method="average"): """ Main function to cluster the overlap dictionary into clusters""" self.overlap_to_distance() - self.linkage_mat = linkage(squareform(self.distance_mat), method) - self.labels = fcluster(self.linkage_mat, threshold, criterion="distance") #ordering of the dendrogram - #Find clusters below threshold - self.linkage_clusters = dict(zip(range(self.n), [[num] for num in range(self.n)])) - for i, row in enumerate(self.linkage_mat): - ID1 = int(row[0]) - ID2 = int(row[1]) - new = self.n + i - dist = row[2] + if len(self.names) > 1: + self.linkage_mat = linkage(squareform(self.distance_mat), method) + self.labels = fcluster(self.linkage_mat, threshold, criterion="distance") #ordering of the dendrogram - if dist <= threshold: - self.linkage_clusters[new] = self.linkage_clusters[ID1] + self.linkage_clusters[ID2] + [new] - del self.linkage_clusters[ID1] - del self.linkage_clusters[ID2] + #Find clusters below threshold + self.linkage_clusters = dict(zip(range(self.n), [[num] for num in range(self.n)])) + for i, row in enumerate(self.linkage_mat): + ID1 = int(row[0]) + ID2 = int(row[1]) + new = self.n + i + dist = row[2] - #Add member-names to clusters - self.clusters = {} - for cluster in self.linkage_clusters: + if dist <= threshold: + self.linkage_clusters[new] = self.linkage_clusters[ID1] + self.linkage_clusters[ID2] + [new] + del self.linkage_clusters[ID1] + del self.linkage_clusters[ID2] - self.clusters[cluster] = {"member_idx": [idx for idx in self.linkage_clusters[cluster] if idx < self.n]} - self.clusters[cluster]["member_names"] = [self.names[idx] for idx in self.clusters[cluster]["member_idx"]] + #Add member-names to clusters + for cluster in self.linkage_clusters: + + self.clusters[cluster] = {"member_idx": 
diff --git a/tobias/utils/regions.py b/tobias/utils/regions.py
index 844c38e..ed34e53 100644
--- a/tobias/utils/regions.py
+++ b/tobias/utils/regions.py
@@ -379,14 +379,14 @@ def subtract(self, b_regions):
 
 		self.loc_sort()
 		b_regions.loc_sort()
-		a_len = self.count()
-		b_len = b_regions.count()
+		#a_len = self.count()
+		#b_len = b_regions.count()
 
 		chromlist = sorted(list(set([region.chrom for region in self] + [region.chrom for region in b_regions])))
 		chrom_pos = {chrom:chromlist.index(chrom) for chrom in chromlist}
 
 		a = self
-		b = b_regions
+		#b = b_regions
 
 		a_i = 0
 		b_i = 0
@@ -577,33 +577,38 @@ def cluster(self, threshold=0.5, method="average"):
 		""" Main function to cluster the overlap dictionary into clusters"""
 
 		self.overlap_to_distance()
-		self.linkage_mat = linkage(squareform(self.distance_mat), method)
-		self.labels = fcluster(self.linkage_mat, threshold, criterion="distance")	#ordering of the dendrogram
 
-		#Find clusters below threshold
-		self.linkage_clusters = dict(zip(range(self.n), [[num] for num in range(self.n)]))
-		for i, row in enumerate(self.linkage_mat):
-			ID1 = int(row[0])
-			ID2 = int(row[1])
-			new = self.n + i
-			dist = row[2]
+		if len(self.names) > 1:
+			self.linkage_mat = linkage(squareform(self.distance_mat), method)
+			self.labels = fcluster(self.linkage_mat, threshold, criterion="distance")	#ordering of the dendrogram
 
-			if dist <= threshold:
-				self.linkage_clusters[new] = self.linkage_clusters[ID1] + self.linkage_clusters[ID2] + [new]
-				del self.linkage_clusters[ID1]
-				del self.linkage_clusters[ID2]
+			#Find clusters below threshold
+			self.linkage_clusters = dict(zip(range(self.n), [[num] for num in range(self.n)]))
+			for i, row in enumerate(self.linkage_mat):
+				ID1 = int(row[0])
+				ID2 = int(row[1])
+				new = self.n + i
+				dist = row[2]
 
-		#Add member-names to clusters
-		self.clusters = {}
-		for cluster in self.linkage_clusters:
+				if dist <= threshold:
+					self.linkage_clusters[new] = self.linkage_clusters[ID1] + self.linkage_clusters[ID2] + [new]
+					del self.linkage_clusters[ID1]
+					del self.linkage_clusters[ID2]
 
-			self.clusters[cluster] = {"member_idx": [idx for idx in self.linkage_clusters[cluster] if idx < self.n]}
-			self.clusters[cluster]["member_names"] = [self.names[idx] for idx in self.clusters[cluster]["member_idx"]]
+			#Add member-names to clusters
+			for cluster in self.linkage_clusters:
+
+				self.clusters[cluster] = {"member_idx": [idx for idx in self.linkage_clusters[cluster] if idx < self.n]}
+				self.clusters[cluster]["member_names"] = [self.names[idx] for idx in self.clusters[cluster]["member_idx"]]
+
+		else:	#only one TF
+			self.linkage_clusters = {0:[0]}
+			self.linkage_mat = np.array([[0]])
+			self.clusters[0] = {"member_idx":[0]}
+			self.clusters[0]["member_names"] = [self.names[idx] for idx in self.clusters[0]["member_idx"]]
 
 		self.get_cluster_names()	#Set names of clusters
 		self.assign_colors()
-
-
 	def overlap_to_distance(self):
 		""" Convert overlap-dict from count_overlaps to distance matrix """
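
The `len(self.names) > 1` guard added to `cluster()` above exists because scipy's hierarchical clustering cannot handle a single observation: `squareform` of a 1x1 zero matrix yields an empty condensed distance matrix, and `linkage` raises on it. A sketch of the working two-motif path (distances invented):

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

distance_mat = np.array([[0.0, 0.3],
                         [0.3, 0.0]])              # two motifs: fine
linkage_mat = linkage(squareform(distance_mat), "average")
labels = fcluster(linkage_mat, 0.5, criterion="distance")   # both land in one cluster

# With a single motif, squareform(np.zeros((1, 1))) is empty and linkage()
# raises ValueError - hence the hard-coded one-cluster fallback in the patch.
```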