diff --git a/MANIFEST.in b/MANIFEST.in
index 04f196a..317846c 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,3 @@
 include README.md
 include LICENSE
+recursive-include tobias *.c
diff --git a/setup.py b/setup.py
index 6aa54b9..9efc9b9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,49 +1,84 @@
+import os
+import sys
+import re
 from setuptools import setup, Extension
-import numpy as np
 
-def readme():
-    with open('README.md') as f:
-        return f.read()
+#Test if numpy is installed
+try:
+    import numpy as np
+except ImportError:
+    sys.exit("ERROR: Numpy needed for TOBIAS installation. Numpy can be installed using the command \"pip install numpy\"")
+
+cmdclass = {}
+
+#Add cython modules depending on the availability of cython
+try:
+    from Cython.Distutils import build_ext
+except ImportError:
+    use_cython = False
+else:
+    use_cython = True
+
+if use_cython:
+    ext_modules = [Extension("tobias.utils.ngs", ["tobias/utils/ngs.pyx"], include_dirs=[np.get_include()]),
+                   Extension("tobias.utils.sequences", ["tobias/utils/sequences.pyx"], include_dirs=[np.get_include()]),
+                   Extension("tobias.utils.signals", ["tobias/utils/signals.pyx"], include_dirs=[np.get_include()])]
+    cmdclass.update({'build_ext': build_ext})
+else:
+    ext_modules = [Extension("tobias.utils.ngs", ["tobias/utils/ngs.c"], include_dirs=[np.get_include()]),
+                   Extension("tobias.utils.sequences", ["tobias/utils/sequences.c"], include_dirs=[np.get_include()]),
+                   Extension("tobias.utils.signals", ["tobias/utils/signals.c"], include_dirs=[np.get_include()])]
 
-ext_modules = [Extension("tobias.utils.ngs", ["tobias/utils/ngs.pyx"], include_dirs=[np.get_include()]),
-               Extension("tobias.utils.sequences", ["tobias/utils/sequences.pyx"], include_dirs=[np.get_include()]),
-               Extension("tobias.utils.signals", ["tobias/utils/signals.pyx"], include_dirs=[np.get_include()])]
+#Path of setup file to establish version
+setupdir = os.path.abspath(os.path.dirname(__file__))
+
+def find_version(init_file):
+    version_file = open(init_file).read()
+    version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
+    if version_match:
+        return version_match.group(1)
+    else:
+        raise RuntimeError("Unable to find version string.")
+
+def readme():
+    with open('README.md') as f:
+        return f.read()
 
 setup(name='tobias',
-      version='0.2',
-      description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal',
-      long_description=readme(),
-      url='https://github.molgen.mpg.de/loosolab/TOBIAS',
-      author='Mette Bentsen',
-      author_email='mette.bentsen@mpi-bn.mpg.de',
-      license='MIT',
-      packages=['tobias', 'tobias.footprinting', 'tobias.plotting', 'tobias.motifs', 'tobias.misc', 'tobias.utils'],
-      entry_points = {
-          'console_scripts': ['TOBIAS=tobias.TOBIAS:main']
-      },
-      install_requires=[
-          'setuptools_cython',
-          'numpy',
-          'scipy',
-          'pyBigWig',
-          'pysam',
-          'pybedtools',
-          'matplotlib>=2',
-          'scikit-learn',
-          'pandas',
-          'pypdf2',
-          'xlsxwriter',
-          'adjustText',
-      ],
-      #dependency_links=['https://github.com/jhkorhonen/MOODS/tarball/master'],
-      classifiers = [
-          'License :: OSI Approved :: MIT License',
-          'Intended Audience :: Science/Research',
-          'Topic :: Scientific/Engineering :: Bio-Informatics',
-          'Programming Language :: Python :: 3'
-      ],
-      zip_safe=False,
-      include_package_data=True,
-      ext_modules = ext_modules,
-      scripts=["tobias/utils/peak_annotation.sh"]
-      )
+      version=find_version(os.path.join(setupdir, "tobias", "__init__.py")),  #get version from __init__.py
+      description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal',
+      long_description=readme(),
+      url='https://github.molgen.mpg.de/loosolab/TOBIAS',
+      author='Mette Bentsen',
+      author_email='mette.bentsen@mpi-bn.mpg.de',
+      license='MIT',
+      packages=['tobias', 'tobias.footprinting', 'tobias.plotting', 'tobias.motifs', 'tobias.misc', 'tobias.utils'],
+      entry_points={
+          'console_scripts': ['TOBIAS=tobias.TOBIAS:main']
+      },
+      ext_modules=ext_modules,
+      cmdclass=cmdclass,
+      #dependency_links=['https://github.com/jhkorhonen/MOODS/releases/download/v1.9.3/MOODS-python-1.9.3.tar.gz#egg=MOODS-python-1.9.3'],
+      install_requires=[
+          'numpy',
+          'scipy',
+          'pysam',
+          'pybedtools',
+          'matplotlib>=2',
+          'scikit-learn',
+          'pandas',
+          'pypdf2',
+          'xlsxwriter',
+          'adjustText',
+          'pyBigWig',
+      ],
+      classifiers=[
+          'License :: OSI Approved :: MIT License',
+          'Intended Audience :: Science/Research',
+          'Topic :: Scientific/Engineering :: Bio-Informatics',
+          'Programming Language :: Python :: 3'
+      ],
+      zip_safe=True,
+      )
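Note: the find_version() helper above single-sources the package version from tobias/__init__.py instead of hard-coding it in setup.py. A minimal standalone sketch of the same pattern (the file path assumes you run it from the repository root):

    import re

    def find_version(init_file):
        """Extract __version__ from a package __init__.py without importing the package."""
        content = open(init_file).read()
        match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", content, re.M)
        if match:
            return match.group(1)
        raise RuntimeError("Unable to find version string.")

    #A file containing the line  __version__ = "0.3.0"  yields "0.3.0"
    print(find_version("tobias/__init__.py"))
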
diff --git a/tobias/TOBIAS.py b/tobias/TOBIAS.py
index ac7475e..7de7974 100644
--- a/tobias/TOBIAS.py
+++ b/tobias/TOBIAS.py
@@ -9,6 +9,18 @@
 """
 
 import sys
+
+#Import extra dependencies
+try:
+    import MOODS
+except ImportError:
+    sys.exit("ERROR: Package MOODS is not installed and is needed by TOBIAS. You can install it using:\n" +
+             "$ wget https://github.com/jhkorhonen/MOODS/releases/download/v1.9.3/MOODS-python-1.9.3.tar.gz\n" +
+             "$ tar xzvf MOODS-python-1.9.3.tar.gz\n" +
+             "$ cd MOODS-python-1.9.3\n" +
+             "$ python setup.py install")
+
+#Import general
 import argparse
 from argparse import SUPPRESS
 import textwrap
@@ -23,7 +35,7 @@
 from tobias.motifs.tfbscan import *
 from tobias.motifs.format_motifs import *
-from tobias.motifs.cluster_tfbs import *
+#from tobias.motifs.cluster_tfbs import *
 from tobias.motifs.score_bed import *
 
 from tobias.misc.subsample_bam import *
@@ -32,8 +44,7 @@
 #from tobias.misc.create_network import *
 from tobias.misc.log2table import *
 
-
-TOBIAS_VERSION = "0.2"  #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Change here :-)
+from tobias import __version__ as TOBIAS_VERSION
 
 def main():
@@ -43,14 +54,14 @@ def main():
     {
         "ATACorrect":{"help":"Correct reads with regards to Tn5 sequence bias", "add_arguments": add_atacorrect_arguments, "function":run_atacorrect},
         "FootprintScores":{"help":"Calculate footprint scores from cutsites", "add_arguments": add_footprint_arguments, "function":run_footprinting, "space":"\t"},
-        "BINDetect":{"help":"Detect TF binding from footprints", "add_arguments": add_bindetect_arguments, "function":run_bindetect},
+        "BINDetect":{"help":"Detect TF binding from footprints and motifs", "add_arguments": add_bindetect_arguments, "function":run_bindetect},
     },
 
     "Tools for working with motifs/TFBS":
     {
         "TFBScan": {"help":"Identify positions of TFBS given sequence and motifs", "add_arguments": add_tfbscan_arguments, "function": run_tfbscan},
         "FormatMotifs": {"help": "Utility to deal with motif files", "add_arguments": add_formatmotifs_arguments, "function": run_formatmotifs},
-        "ClusterTFBS": {"help": "Cluster TFs based on overlap of sites", "add_arguments": add_clustering_arguments, "function": run_clustering},
+        #"ClusterTFBS": {"help": "Cluster TFs based on overlap of sites", "add_arguments": add_clustering_arguments, "function": run_clustering},
         "ScoreBed": {"help":"Score .bed-file with signal from .bigwig-file(s)", "add_arguments": add_scorebed_arguments, "function": run_scorebed},
     },
diff --git a/tobias/__init__.py b/tobias/__init__.py
index e69de29..b77738f 100644
--- a/tobias/__init__.py
+++ b/tobias/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.3.0"
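Note: TOBIAS.py now fails fast with install instructions when the MOODS scanner is missing, instead of crashing later with an opaque ImportError. The guarded-import idiom as a self-contained sketch (the package name is just the example from this diff):

    import sys

    try:
        import MOODS
    except ImportError:
        sys.exit("ERROR: Package MOODS is not installed and is needed by TOBIAS. "
                 "See the installation instructions printed by TOBIAS or the README.")
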
diff --git a/tobias/footprinting/ATACorrect.py b/tobias/footprinting/ATACorrect.py
index 8a6ede7..91ba3d2 100644
--- a/tobias/footprinting/ATACorrect.py
+++ b/tobias/footprinting/ATACorrect.py
@@ -30,7 +30,6 @@
 from collections import OrderedDict
 import logging
 import itertools
 
-from scipy.optimize import curve_fit
 from matplotlib.backends.backend_pdf import PdfPages
 
 #Bio-specific packages
@@ -116,11 +115,6 @@ def run_atacorrect(args):
     if args.peaks == None:
         sys.exit("Error: No .peaks-file given")
 
-    #Adjust input files to full path
-    #args.bam = os.path.abspath(args.bam)
-    #args.genome = os.path.abspath(args.genome)
-    #args.peaks = os.path.abspath(args.peaks) if args.peaks != None else None
-
     #Adjust some parameters depending on input
     args.prefix = os.path.splitext(os.path.basename(args.bam))[0] if args.prefix == None else args.prefix
     args.outdir = os.path.abspath(args.outdir) if args.outdir != None else os.path.abspath(os.getcwd())
@@ -168,31 +162,12 @@ def run_atacorrect(args):
 
     logger.info("----- Processing input data -----")
 
-    #Todo: use TOBIAS functions
-
-    #Input test
     logger.debug("Testing input file availability")
-    file_list = [args.bam, args.genome, args.peaks]
-    file_list = [file for file in file_list if file != None]  #some files can be None depending on choice
-    for path in file_list:
-        if not os.path.exists(path):
-            logger.error("\nError: {0} does not exist.".format(path))
-            sys.exit(1)
+    check_files([args.bam, args.genome, args.peaks], "r")
 
     logger.debug("Testing output directory/file writeability")
     make_directory(args.outdir)
-    if not os.access(args.outdir, os.W_OK):
-        logger.error("Error: {0} does not exist or is not writeable.".format(args.outdir))
-        sys.exit(1)
-
-    #Output write test
-    for path in output_files[:-1]:  #Do not include log-file as this is managed by logger
-        if path == None:
-            continue
-        if os.path.exists(path):
-            if not os.access(path, os.W_OK):
-                logger.error("Error: {0} could not be opened for writing.".format(path))
-                sys.exit(1)
+    check_files(output_files, "w")
 
     #Open pdf for figures
     figure_pdf = PdfPages(figures_f, keep_empty=True)
@@ -368,10 +343,8 @@ def run_atacorrect(args):
     logger.info("Finalizing bias motif for scoring")
     for strand in strands:
         bias_obj.bias[strand].prepare_mat()
-        figure_pdf.savefig(plot_pssm(bias_obj.bias[strand].pssm, "Tn5 insertion bias of reads ({0})".format(strand)))
-
 
 #----------------------------------------------------------------------------------------------------#
 # Correct read bias and write to bigwig
 #----------------------------------------------------------------------------------------------------#
diff --git a/tobias/footprinting/ATACorrect_functions.py b/tobias/footprinting/ATACorrect_functions.py
index 19d9de4..e9c0d11 100644
--- a/tobias/footprinting/ATACorrect_functions.py
+++ b/tobias/footprinting/ATACorrect_functions.py
@@ -16,7 +16,6 @@
 import multiprocessing as mp
 import time
 from datetime import datetime
-
 import matplotlib.pyplot as plt
 import matplotlib.ticker as ticker
 from scipy.optimize import curve_fit
@@ -135,7 +134,7 @@ def bias_estimation(regions_list, params):
             for read in read_lst_strand[strand]:
                 if read.cigartuples is not None:
                     first_tuple = read.cigartuples[-1] if read.is_reverse else read.cigartuples[0]
-                    if first_tuple[0] == 0 and first_tuple[1] > params.k_flank + max(np.abs(params.read_shift)):
+                    if first_tuple[0] == 0 and first_tuple[1] > params.k_flank + max(np.abs(params.read_shift)):  #Only include non-clipped reads
                         read_per_pos[read.cutsite] = read_per_pos.get(read.cutsite, []) + [read]
 
     for cutsite in read_per_pos:
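Note: ATACorrect replaces its ad-hoc existence/writeability tests with two check_files() calls. The helper itself is not part of this diff; a minimal sketch of what such a function could look like, inferred purely from the call sites check_files([...], "r") and check_files(output_files, "w") — the signature and behavior here are assumptions, not the project's actual implementation:

    import os
    import sys

    def check_files(file_list, action="r"):
        """Exit with an error if any file is not readable ("r") or not writeable ("w")."""
        for path in file_list:
            if path is None:  #some files are optional and may be None
                continue
            if action == "r":
                if not os.path.exists(path):
                    sys.exit("ERROR: File {0} does not exist.".format(path))
            elif action == "w":
                if os.path.exists(path) and not os.access(path, os.W_OK):
                    sys.exit("ERROR: {0} could not be opened for writing.".format(path))
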
diff --git a/tobias/footprinting/BINDetect.py b/tobias/footprinting/BINDetect.py
index 368c07f..f0ca151 100644
--- a/tobias/footprinting/BINDetect.py
+++ b/tobias/footprinting/BINDetect.py
@@ -17,7 +17,6 @@
 import time
 from copy import deepcopy
 import logging
-import logging.handlers
 import itertools
 import pandas as pd
@@ -43,7 +42,6 @@
 from tobias.utils.sequences import *
 from tobias.utils.motifs import *
 from tobias.utils.logger import *
-#from tobias.plotting.plot_bindetect import *
 
 #np.seterr(divide = 'ignore')
@@ -123,7 +121,6 @@ def run_bindetect(args):
     outfiles.append(os.path.abspath(os.path.join(args.outdir, "*", "*_overview.xlsx")))
     outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_distances.txt")))
-    #outfiles.append(os.path.abspath(os.path.join(args.outdir, "TF_clusters.txt")))
     outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_results.txt")))
     outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_results.xlsx")))
     outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_figures.pdf")))
@@ -180,11 +177,12 @@ def run_bindetect(args):
     #output and order
     titles = []
-    for cond in args.cond_names:
-        titles.append("Score distribution of {0} scores".format(cond))
+    if args.debug:
+        for cond in args.cond_names:
+            titles.append("Score distribution of {0} scores".format(cond))
 
-    for (cond1, cond2) in comparisons:
-        titles.append("Background log2FCs ({0} / {1})".format(cond1, cond2))
+        for (cond1, cond2) in comparisons:
+            titles.append("Background log2FCs ({0} / {1})".format(cond1, cond2))
 
     for (cond1, cond2) in comparisons:
         titles.append("BINDetect plot ({0} / {1})".format(cond1, cond2))
@@ -195,18 +193,18 @@ def run_bindetect(args):
 
     ################# Peaks / GC in peaks ################
 
-    #Read peak and peak_header
     peaks = RegionList().from_bed(args.peaks)
     logger.info("- Found {0} regions in input peaks".format(len(peaks)))
     peaks = peaks.merge()  #merge overlapping peaks
     logger.info("- Merged to {0} regions".format(len(peaks)))
+
+    if len(peaks) == 0:
+        logger.error("Input --peaks file is empty!")
+        sys.exit()
+
     peak_chroms = peaks.get_chroms()
     peak_columns = len(peaks[0])  #number of columns
-
-    if args.debug:
-        logger.info("Debug on: Peaks reduced to 1000")
-        peaks = peaks.subset(10000)
 
     #Make chunks of regions for multiprocessing
     peak_chunks = peaks.chunks(args.split)
@@ -255,10 +253,6 @@ def run_bindetect(args):
     logger.info("- Found {0} motifs in file".format(no_pfms))
 
-    if args.debug:
-        logger.info("Debug on: motifs reduced to 50")
-        motif_list = MotifList(motif_list[:50])
-
     logger.debug("Getting motifs ready")
     motif_list.bg = bg
     motif_names = [motif.name for motif in motif_list]
@@ -338,7 +332,6 @@ def run_bindetect(args):
     results = [task.get() for task in task_list]
     logger.info("Done scanning for TFBS across regions!")
-    logger.stop_logger_queue()  #stop the listening process (wait until all was written)
 
     #--------------------------------------#
@@ -356,7 +349,6 @@ def run_bindetect(args):
     #Waits until all queues are closed
     writer_pool.join()
 
-
 #-------------------------------------------------------------------------------------------------------------#
 #---------------------------- Process information on background scores and overlaps --------------------------#
 #-------------------------------------------------------------------------------------------------------------#
@@ -411,18 +403,18 @@ def run_bindetect(args):
         all_log_params[bigwig] = log_params
 
         #Plot mixture
-        if args.debug:
-            plt.hist(np.log(bg_values), bins='auto', density=True)
-            xlim = plt.xlim()
-            x = np.linspace(xlim[0], xlim[1], 1000)
-            for i in range(2):
-                pdf = scipy.stats.norm.pdf(x, means[i], sds[i])
-                plt.plot(x, pdf)
-
-            logprob = gmm.score_samples(x.reshape(-1, 1))
-            df = np.exp(logprob)
-            plt.plot(x, df)
-            plt.show()
+        #if args.debug:
+        #    plt.hist(np.log(bg_values), bins='auto', density=True)
+        #    xlim = plt.xlim()
+        #    x = np.linspace(xlim[0], xlim[1], 1000)
+        #    for i in range(2):
+        #        pdf = scipy.stats.norm.pdf(x, means[i], sds[i])
+        #        plt.plot(x, pdf)
+        #
+        #    logprob = gmm.score_samples(x.reshape(-1, 1))
+        #    df = np.exp(logprob)
+        #    plt.plot(x, df)
+        #    plt.show()
 
         #Mode of distribution
         mode = scipy.optimize.fmin(lambda x: -scipy.stats.lognorm.pdf(x, *log_params), 0, disp=False)[0]
@@ -471,12 +463,14 @@ def run_bindetect(args):
         figures.append((ax, fig))
 
-    #Set x-max of all plots equal
-    xlim = np.min([ax.get_xlim()[1] for ax, fig in figures])
-    for (ax, fig) in figures:
-        ax.set_xlim(0,xlim)
-        figure_pdf.savefig(fig)
-        plt.close(fig)
+    #Only plot if args.debug is True
+    if args.debug:
+        #Set x-max of all plots equal
+        xlim = np.min([ax.get_xlim()[1] for ax, fig in figures])
+        for (ax, fig) in figures:
+            ax.set_xlim(0,xlim)
+            figure_pdf.savefig(fig)
+            plt.close(fig)
 
     #Estimate pseudocount
     if args.pseudo == None:
@@ -523,7 +517,8 @@ def run_bindetect(args):
         plt.xlabel("Log2 fold change")
         plt.ylabel("Density")
-        figure_pdf.savefig(fig, bbox_inches='tight')
+        if args.debug:
+            figure_pdf.savefig(fig, bbox_inches='tight')
         plt.close()
 
     background = None  #free up space
@@ -561,7 +556,6 @@ def run_bindetect(args):
     pool.terminate()
     pool.join()
 
-
 #-------------------------------------------------------------------------------------------------------------#
 #------------------------------------------------ Cluster TFBS -----------------------------------------------#
 #-------------------------------------------------------------------------------------------------------------#
@@ -569,6 +563,12 @@
     clustering = RegionCluster(TF_overlaps)
     clustering.cluster()
 
+    #Convert full ids to alt ids
+    convert = {motif.name:motif.alt_name for motif in motif_list}
+    for cluster in clustering.clusters:
+        for name in convert:
+            clustering.clusters[cluster]["cluster_name"] = clustering.clusters[cluster]["cluster_name"].replace(name, convert[name])
+
     #Write out distance matrix
     matrix_out = os.path.join(args.outdir, args.prefix + "_distances.txt")
     clustering.write_distance_mat(matrix_out)
@@ -586,7 +586,11 @@ def run_bindetect(args):
         info_table[condition + "_bound"] = info_table[condition + "_bound"].map(int)
 
     #Add cluster to info_table
-    cluster_names = [clustering.name2cluster[name] for name in info_table.index]
+    cluster_names = []
+    for name in info_table.index:
+        for cluster in clustering.clusters:
+            if name in clustering.clusters[cluster]["member_names"]:
+                cluster_names.append(clustering.clusters[cluster]["cluster_name"])
     info_table.insert(0,"cluster", cluster_names)
 
     #### Write excel ###
@@ -632,7 +636,7 @@ def run_bindetect(args):
         motif.change = float(info_table.at[name, base + "_change"])
         motif.pvalue = float(info_table.at[name, base + "_pvalue"])
 
-    #bindetect plot
+    #Bindetect plot
     fig = plot_bindetect(comparison_motifs, clustering, [cond1, cond2], args)
     figure_pdf.savefig(fig, bbox_inches='tight')
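Note: the retained context line above locates the mode of the fitted log-normal background by numerically minimizing the negative pdf. The same calculation in isolation — the (shape, loc, scale) parameters below are invented for the demo, not values BINDetect would produce:

    import scipy.stats
    import scipy.optimize

    log_params = (0.5, -1.0, 1.0)  #example (shape, loc, scale) of a fitted lognorm
    mode = scipy.optimize.fmin(lambda x: -scipy.stats.lognorm.pdf(x, *log_params), 0, disp=False)[0]
    print(mode)  #x-position of the density peak; here approx. -0.22 (= loc + scale*exp(-shape**2))
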
diff --git a/tobias/footprinting/BINDetect_functions.py b/tobias/footprinting/BINDetect_functions.py
index 0193e0d..51e7ad9 100644
--- a/tobias/footprinting/BINDetect_functions.py
+++ b/tobias/footprinting/BINDetect_functions.py
@@ -80,14 +80,10 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs):
     fasta_obj = pysam.FastaFile(args.genome)
     chrom_boundaries = dict(zip(fasta_obj.references, fasta_obj.lengths))
 
-    gc_window = 500
-    rand_window = 500
-    extend = int(np.ceil(gc_window / 2.0))
+    rand_window = 200
 
     background_signal = {"gc":[], "signal":{condition:[] for condition in args.cond_names}}
 
-    #TODO: Estimate number of background positions sampled to pre-allocate space
-
     ######## Scan for motifs in each region ######
     logger.debug("Scanning for motif occurrences")
     all_TFBS = {TF: RegionList() for TF in motifs_obj.names}  # Dict for saving sites before writing
@@ -95,6 +91,10 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs):
         logger.spam("Processing region: {0}".format(region.tup()))
 
         extra_columns = region
+
+        #Check whether region is within boundaries
+        if region.end >= chrom_boundaries[region.chrom]:
+            print("ERROR IN REGION: {0}".format(region))
 
         #Random positions for sampling
         reglen = region.get_length()
@@ -112,27 +112,16 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs):
                 logger.error("Error reading footprints from region: {0}".format(region))
                 continue
 
+            if len(footprints[condition]) == 0:
+                logger.error("ERROR IN REGION: {0}".format(region))
+
             #Read random positions for background
             for pos in rand_positions:
                 background_signal["signal"][condition].append(footprints[condition][pos])
 
         #Scan for motifs across sequence from fasta
-        extended_region = copy.copy(region).extend_reg(extend)  #extend to calculate gc
-        extended_region.check_boundary(chrom_boundaries, action="cut")
-
-        seq = fasta_obj.fetch(region.chrom, extended_region.start, extended_region.end)
-
-        #Calculate GC content for regions
-        num_sequence = nuc_to_num(seq)
-        Ns = num_sequence == 4
-        boolean = 1 * (num_sequence > 1)  # Convert to 0/1 gc
-        boolean[Ns] = 0.5  # replace Ns 0.5 - with neither GC nor AT
-        boolean = boolean.astype(np.float64)  # due to input of fast_rolling_math
-        gc = fast_rolling_math(boolean, gc_window, "mean")
-        gc = gc[extend:-extend]
-        background_signal["gc"].extend([gc[pos] for pos in rand_positions])
-
-        region_TFBS = motifs_obj.scan_sequence(seq[extend:-extend], region)  #RegionList of TFBS
+        seq = fasta_obj.fetch(region.chrom, region.start, region.end)
+        region_TFBS = motifs_obj.scan_sequence(seq, region)  #RegionList of TFBS
 
         #Extend all TFBS with extra columns from peaks and bigwigs
         extra_columns = region
@@ -141,7 +130,6 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs):
             pos = TFBS.start - region.start + int(motif_length/2.0)  #middle of TFBS
             TFBS.extend(extra_columns)
-            TFBS.append(gc[pos])
 
             #Assign scores from bigwig
             for bigwig in args.cond_names:
@@ -219,7 +207,6 @@ def process_tfbs(TF_name, args, log2fc_params):  #per tf
     header[6:6+no_peak_col] = ["peak_chr", "peak_start", "peak_end"] + ["additional_" + str(num + 1) for num in range(no_peak_col-3)]
     header[-no_cond:] = ["{0}_score".format(condition) for condition in args.cond_names]  #signal scores
-    header[-no_cond-1] = "GC"
     bed_table.columns = header
 
     #Sort and format
@@ -228,7 +215,7 @@ def process_tfbs(TF_name, args, log2fc_params):  #per tf
         bed_table[condition + "_score"] = bed_table[condition + "_score"].round(5)
 
     #### Write _all file ####
-    chosen_columns = [col for col in header if col != "GC"]
+    chosen_columns = [col for col in header]
     outfile = os.path.join(bed_outdir, TF_name + "_all.bed")
     bed_table.to_csv(outfile, sep="\t", index=False, header=False, columns=chosen_columns)
@@ -320,9 +307,9 @@ def process_tfbs(TF_name, args, log2fc_params):  #per tf
         #Decorate
         ax.legend()
-        plt.xlabel("Log2 fold change")
-        plt.ylabel("Density")
-        plt.title("Differential binding for TF \"{0}\"\nbetween ({1} / {2})".format(TF_name, cond1, cond2))
+        plt.xlabel("Log2 fold change", fontsize=8)
+        plt.ylabel("Density", fontsize=8)
+        plt.title("Differential binding for TF \"{0}\"\nbetween ({1} / {2})".format(TF_name, cond1, cond2), fontsize=10)
         ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
         plt.tight_layout()
@@ -366,15 +353,16 @@
 #------------------------------------------------------------------------------------------------------#
 #------------------------------------------------------------------------------------------------------#
 
-def plot_bindetect(motifs, clusters, conditions, args):
-    """ Conditions refer to the order of the fold_change divison, meaning condition1/condition2
+def plot_bindetect(motifs, cluster_obj, conditions, args):
+    """
+    Conditions refer to the order of the fold_change division, meaning condition1/condition2
 
     - Clusters is a RegionCluster object
     - conditions is a tup of condition names (cond1, cond2)
     """
 
     warnings.filterwarnings("ignore")
 
     cond1, cond2 = conditions
-    no_IDS = clusters.n
+    no_IDS = cluster_obj.n
 
     #Link information from motifs / clusters
     diff_scores = {}
@@ -401,12 +389,25 @@
         elif diff_scores[TF]["change"] > 0:
             diff_scores[TF]["color"] = "red"
         else:
-            diff_scores[TF]["show"] = False
+            diff_scores[TF]["show"] = False
             diff_scores[TF]["color"] = "black"
 
-    node_color = clusters.node_color
-    IDS = np.array(clusters.names)
+    node_color = cluster_obj.node_color
+    IDS = np.array(cluster_obj.names)
+
+    """
+    #Set cluster names
+    for motif_name in diff_scores:
+        for cluster in cluster_obj.clusters:
+
+            if motif_name in cluster_obj.clusters[cluster]["member_names"]:
+                diff_scores[motif_name]["cluster_name"] = cluster_obj.clusters[cluster]["cluster_name"]
+
+            if motif_name == cluster_obj.clusters[cluster]["representative"]:
+                diff_scores[TF]["show"] = True
+                diff_scores[motif_name]["representative"] = True
+    """
+
     #--------------------------------------- Figure --------------------------------#
 
     #Make figure
@@ -445,7 +446,8 @@
     ax1.set_ylabel("-log10(pvalue)")
 
     ########### Dendrogram over similarities of TFs #######
-    dendro_dat = dendrogram(clusters.linkage_mat, labels=IDS, no_labels=True, orientation="right", ax=ax3, above_threshold_color="black", link_color_func=lambda k: clusters.node_color[k])
+
+    dendro_dat = dendrogram(cluster_obj.linkage_mat, labels=IDS, no_labels=True, orientation="right", ax=ax3, above_threshold_color="black", link_color_func=lambda k: cluster_obj.node_color[k])
     labels = dendro_dat["ivl"]  #Now sorted for the order in dendrogram
     ax3.set_xlabel("Transcription factor similarities\n(Clusters below threshold are colored)")
@@ -509,9 +511,49 @@
 
     adjust_text(txts, ax=ax1, text_from_points=True, arrowprops=dict(arrowstyle='-', color='black', lw=0.5))  #, expand_text=(0.1,1.2), expand_objects=(0.1,0.1))
 
+    """
+    #Add arrows to other cluster members
+    print(txts[0].__dict__)
+    label_positions = {text._text:text for text in txts}
+    print(label_positions)
+    for TF in diff_scores:
+        if diff_scores[TF]["show"]:
+            cluster_name = diff_scores[TF]["cluster_name"]
+
+            if cluster_name in label_positions:
+                print(cluster_name)
+
+                point_x, point_y = diff_scores[TF]["change"], diff_scores[TF]["log10pvalue"]
+                text_x, text_y = label_positions[cluster_name]._x, label_positions[cluster_name]._y
+                len_x, len_y = text_x - point_x, text_y - point_y
+
+                ax1.arrow(point_x, point_y, len_x, len_y, linestyle="-", color="black", lw=0.5)
+    """
+
+    #print(txts)
+
     #Plot custom legend for colors
     legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="More bound in {0}".format(conditions[0])),
                        Line2D([0],[0], marker='o', color='w', markerfacecolor="blue", label="More bound in {0}".format(conditions[1]))]
     ax1.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left')
 
     return(fig)
+
+"""
+def multi_annotate(ax, s, xy_arr=[], *args, **kwargs):
+    ans = []
+    an = ax.annotate(s, xy_arr[0], *args, **kwargs)
+    ans.append(an)
+    d = {}
+    try:
+        d['xycoords'] = kwargs['xycoords']
+    except KeyError:
+        pass
+    try:
+        d['arrowprops'] = kwargs['arrowprops']
+    except KeyError:
+        pass
+    for xy in xy_arr[1:]:
+        an = ax.annotate(s, xy, alpha=0.0, xytext=(0,0), textcoords=an, **d)
+        ans.append(an)
+    return ans
+"""
\ No newline at end of file
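Note: process_tfbs() compares per-site footprint scores between two conditions and plots the distribution of log2 fold changes, using a pseudocount to guard against log2(0). A toy version of the core quantity — score arrays and pseudocount value are invented for the demo, while BINDetect estimates the pseudocount from data:

    import numpy as np

    cond1_scores = np.array([0.2, 1.5, 0.0, 3.1])  #example footprint scores per site
    cond2_scores = np.array([0.1, 0.4, 0.7, 2.9])
    pseudo = 0.1

    log2fc = np.log2((cond1_scores + pseudo) / (cond2_scores + pseudo))
    print(np.round(log2fc, 3))  #[ 0.585  1.678 -3.     0.093]
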
diff --git a/tobias/footprinting/footprint_scores.py b/tobias/footprinting/footprint_scores.py
index 16892ba..1041916 100644
--- a/tobias/footprinting/footprint_scores.py
+++ b/tobias/footprinting/footprint_scores.py
@@ -45,20 +45,24 @@ def add_footprint_arguments(parser):
     required = parser.add_argument_group('Required arguments')
     required.add_argument('-s', '--signal', metavar="", help="A .bw file of ATAC-seq cutsite signal")
     required.add_argument('-o', '--output', metavar="", help="Full path to output bigwig")
-    required.add_argument('-r', '--regions', metavar="", help="Genomic regions to run footprinting in")
+    required.add_argument('-r', '--regions', metavar="", help="Genomic regions to run footprinting within")
 
     optargs = parser.add_argument_group('Optional arguments')
+    optargs.add_argument('--score', metavar="", choices=["footprint", "sum"], help="Type of scoring to perform on cutsites (footprint/sum) (default: footprint)", default="footprint")
     optargs.add_argument('--extend', metavar="", type=int, help="Extend input regions with bp (default: 100)", default=100)
-    optargs.add_argument('--score', metavar="", choices=["tobias", "FOS", "sum"], help="Type of scoring to perform on cutsites (tobias/FOS/sum) (default: tobias)", default="tobias")
-    optargs.add_argument('--window', metavar="", type=int, help="For '--score tobias' this is the backround region for measuring effect of binding. For '--score sum' this is the window for calculation of sum (default: 100)", default=100)
-    optargs.add_argument('--fp_min', metavar="", type=int, help="Minimum footprint width (default: 20)", default=20)
-    optargs.add_argument('--fp_max', metavar="", type=int, help="Maximum footprint width (default: 50)", default=50)
-    optargs.add_argument('--flank_min', metavar="", type=int, help="For FOS; Minimum range of flanking regions (default: 10)", default=10)
-    optargs.add_argument('--flank_max', metavar="", type=int, help="For FOS; Maximum range of flanking regions (default: 30)", default=30)
-    optargs.add_argument('--smooth', metavar="", type=int, help="Smooth output signal by mean in windows (default: 5)", default=5)
+    optargs.add_argument('--smooth', metavar="", type=int, help="Smooth output signal by mean in windows (default: no smoothing)", default=1)
     optargs.add_argument('--min_limit', metavar="", type=float, help="Limit input bigwig score range (default: no lower limit)")  #default none
     optargs.add_argument('--max_limit', metavar="", type=float, help="Limit input bigwig score range (default: no upper limit)")  #default none
 
+    footprintargs = parser.add_argument_group('Parameters for score == footprint')
+    footprintargs.add_argument('--fp_min', metavar="", type=int, help="Minimum footprint width (default: 20)", default=20)
+    footprintargs.add_argument('--fp_max', metavar="", type=int, help="Maximum footprint width (default: 50)", default=50)
+    footprintargs.add_argument('--flank_min', metavar="", type=int, help="Minimum range of flanking regions (default: 10)", default=10)
+    footprintargs.add_argument('--flank_max', metavar="", type=int, help="Maximum range of flanking regions (default: 30)", default=30)
+
+    sumargs = parser.add_argument_group('Parameters for score == sum')
+    sumargs.add_argument('--window', metavar="", type=int, help="The window for calculation of sum (default: 100)", default=100)
+
     runargs = parser.add_argument_group('Run arguments')
     runargs.add_argument('--cores', metavar="", type=int, help="Number of cores to use for computation (default: 1)", default=1)
     runargs.add_argument('--split', metavar="", type=int, help="Split of multiprocessing jobs (default: 100)", default=100)
@@ -74,21 +78,13 @@ def calculate_scores(regions, args):
     chrom_lengths = {chrom:int(pybw_header[chrom]) for chrom in pybw_header}
 
     #Set flank to enable scoring in ends of regions
-    if args.score == "sum":
-        flank = int(args.window/2.0)
-    elif args.score == "tobias":
-        flank = int((args.window - args.fp_min)/2)
-    elif args.score == "FOS":
-        flank = args.flank_max
-    else:
-        flank = args.flank_max
+    flank = args.region_flank
 
     #Go through each region
     for i, region in enumerate(regions):
 
         #Extend region with necessary flank
         region.extend_reg(flank)
-        region.check_boundary(chrom_lengths, "cut")
         reg_key = (region.chrom, region.start+flank, region.end-flank)  #output region
 
         #Get bigwig signal in region
@@ -102,17 +98,14 @@ def calculate_scores(regions, args):
         #Calculate scores
         if args.score == "sum":
-            signal = np.abs(signal)
+            signal = np.abs(signal)
             scores = fast_rolling_math(signal, args.window, "sum")
 
-        elif args.score == "FOS":
-            scores = calc_FOS(signal, args.fp_min, args.fp_max, args.flank_min, args.flank_max)
-
-        elif args.score == "tobias":
-            scores = tobias_footprint_array(signal, args.window, args.fp_min, args.fp_max)  #numpy array
+        elif args.score == "footprint":
+            scores = tobias_footprint_array(signal, args.flank_min, args.flank_max, args.fp_min, args.fp_max)  #numpy array
 
         else:
-            sys.exit("{0} not found".format(args.score))
+            sys.exit("Scoring {0} not found".format(args.score))
 
         #Smooth signal with args.smooth bp
         if args.smooth > 1:
@@ -159,7 +152,7 @@ def run_footprinting(args):
     pybw_signal = pyBigWig.open(args.signal)
     pybw_header = pybw_signal.chroms()
     chrom_info = {chrom:int(pybw_header[chrom]) for chrom in pybw_header}
-    
+
     #Decide regions
     logger.info("- Getting output regions ready")
     if args.regions:
@@ -170,16 +163,24 @@ def run_footprinting(args):
     else:
         regions = RegionList().from_list([OneRegion([chrom, 0, chrom_info[chrom]]) for chrom in chrom_info])
 
-    #Todo: check boundaries in relation to flanking regions for calculation
+    #Set flank to enable scoring in ends of regions
+    if args.score == "sum":
+        args.region_flank = int(args.window/2.0)
+    elif args.score == "footprint":
+        args.region_flank = int(args.flank_max)
+
+    #Go through each region
+    for i, region in enumerate(regions):
+        region.extend_reg(args.region_flank)
+        region.check_boundary(chrom_info, "cut")
+        region.start = region.start + args.region_flank
+        region.end = region.end - args.region_flank
 
     #Information for output bigwig
     reference_chroms = sorted(list(chrom_info.keys()))
     header = [(chrom, chrom_info[chrom]) for chrom in reference_chroms]
     regions.loc_sort(reference_chroms)
-
-
     #---------------------------------------------------------------------------------------#
     #------------------------ Calculating footprints and writing out -----------------------#
     #---------------------------------------------------------------------------------------#
diff --git a/tobias/misc/log2table.py b/tobias/misc/log2table.py
index 81fd4c8..6153f5d 100644
--- a/tobias/misc/log2table.py
+++ b/tobias/misc/log2table.py
@@ -9,7 +9,6 @@
 """
 
-
 import os
 import sys
 import argparse
@@ -21,11 +20,10 @@
 
 from tobias.utils.logger import *
 
-
 def add_log2table_arguments(parser):
 
     parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90)
-    description = ""
+    description = "Log2Table creates tables of footprint depth (FPD) and aggregate correlations from the PlotAggregate logfiles."
     parser.description = format_help_description("Log2Table", description)
 
     parser._action_groups.pop()  #pop -h
@@ -33,7 +31,7 @@ def add_log2table_arguments(parser):
     #Required arguments
     required = parser.add_argument_group('Required arguments')
     required.add_argument('--logfiles', nargs="*", metavar="", help="Logfiles from PlotAggregate")
-    required.add_argument('--output', metavar="", help="Output directory for tables (default: current dir)", default=".")
+    required.add_argument('--outdir', metavar="", help="Output directory for tables (default: current dir)", default=".")
     required.add_argument('--prefix', metavar="", help="Prefix of output files", default="aggregate")
 
     return(parser)
@@ -42,12 +40,13 @@ def run_log2table(args):
     logger = TobiasLogger("Log2Table")
     logger.begin()
 
-    #Test input
+    #Test input / output
     check_required(args, ["logfiles"])
-
-    output_fpd = os.path.join(args.output, args.prefix + "_FPD.txt")
-    output_corr = os.path.join(args.output, args.prefix + "_CORRELATION.txt")
-
+
+    make_directory(args.outdir)
+    output_fpd = os.path.join(args.outdir, args.prefix + "_FPD.txt")
+    output_corr = os.path.join(args.outdir, args.prefix + "_CORRELATION.txt")
+
     FPD_data = []
     CORR_data = []
@@ -56,6 +55,7 @@ def run_log2table(args):
         logger.info("Reading: {0}".format(log_f))
         with open(log_f) as f:
             for line in f:
+                #Match FPD lines
                 #...... FPD (PWM_uncorrected,MA0073.1_all): 20 0.620 0.602 -0.018
                 m = re.match(".*FPD\s\((.+),(.+)\): (.+) (.+) (.+) (.+)", line.rstrip())
@@ -82,6 +82,8 @@ def run_log2table(args):
                     columns = [signal2, sites2, signal1, sites1, pearsonr]
                     CORR_data.append(columns)
 
+    logger.info("Writing tables")
+
     #All lines from all files read, write out tables
     df_fpd = pd.DataFrame(FPD_data)
     df_fpd.drop_duplicates(keep="first", inplace=True)
@@ -90,3 +92,5 @@ def run_log2table(args):
     df_corr = pd.DataFrame(CORR_data)
     df_corr.drop_duplicates(keep="first", inplace=True)
     df_corr.to_csv(output_corr, sep="\t", index=False, header=False)
+
+    logger.end()
diff --git a/tobias/misc/maxpos.py b/tobias/misc/maxpos.py
index ef2c498..2921142 100644
--- a/tobias/misc/maxpos.py
+++ b/tobias/misc/maxpos.py
@@ -46,7 +46,7 @@ def add_maxpos_arguments(parser):
 
     #Optional arguments
     optional = parser.add_argument_group('Optional arguments')
-    optional.add_argument('--output', metavar="", help="Path to output .bed-file (default: scored sites are written to stdout")
+    optional.add_argument('--output', metavar="", help="Path to output .bed-file (default: scored sites are written to stdout)")
    optional.add_argument('--invert', help="Find minimum position instead of maximum position", action='store_true', default=False)
 
     return(parser)
diff --git a/tobias/misc/subsample_bam.py b/tobias/misc/subsample_bam.py
index 13920ca..280af96 100644
--- a/tobias/misc/subsample_bam.py
+++ b/tobias/misc/subsample_bam.py
@@ -40,10 +40,10 @@ def add_subsample_arguments(parser):
     #Required arguments
     args = parser.add_argument_group('Input arguments')
     args.add_argument('--bam', metavar="", help="Path to .bam-file")
-    args.add_argument('--no_rand', metavar="", type=int, help="Number of randomizations (per step)", default=3)
+    args.add_argument('--no_rand', metavar="", type=int, help="Number of randomizations (per step) (default: 3)", default=3)
     args.add_argument('--start', metavar="", type=int, help="Start of percent subsample (default: 0)", default=0)
     args.add_argument('--end', metavar="", type=int, help="End of percent subsample (default: 100)", default=100)
-    args.add_argument('--step', metavar="", type=int, help="Step between --start and --end (default: 5)", default=5)
+    args.add_argument('--step', metavar="", type=int, help="Step between --start and --end (default: 10)", default=10)
     args.add_argument('--cores', metavar="", type=int, help="Cores for multiprocessing (default: 1)", default=1)
     args.add_argument('--outdir', metavar="", help="Output directory (default: current working directory)", default=".")
     args.add_argument('--prefix', metavar="", help="Prefix for output files (default: prefix of .bam)")
diff --git a/tobias/utils/logger.py b/tobias/utils/logger.py
index ff30833..cd32a65 100644
--- a/tobias/utils/logger.py
+++ b/tobias/utils/logger.py
@@ -13,10 +13,10 @@
 import os
 from datetime import datetime
 import logging
+import logging.handlers
 import multiprocessing as mp
 import time
 
-
 def add_logger_args(args):
     """ Function for adding TOBIAS-wide verbosity to command-line parsers """
     args.add_argument('--verbosity', metavar="", help="Level of output logging (0: silent, 1: errors/warnings, 2: info, 3: stats, 4: debug, 5: spam) (default: 3)", choices=[0,1,2,3,4,5], default=3, type=int)
@@ -24,7 +24,7 @@ def add_logger_args(args):
 
 class TobiasLogger(logging.Logger):
     """ TobiasLogger is an instance of a logging.Logger with special functions for formatting and creating automatic logging """
-    
+
     logger_levels = {
         0: 0,
         1: logging.WARNING,  #also includes errors
@@ -94,9 +94,10 @@ def __init__(self, tool_name="TOBIAS", level=3, queue=None):
 
     def begin(self):
         """ Begin logging by writing comments about the current run """
+        from tobias import __version__ as TOBIAS_VERSION
 
         #Print info on run
-        self.comment("# TOBIAS {0} (run started {1})".format(self.tool_name, self.begin_time))
+        self.comment("# TOBIAS {0} {1} (run started {2})".format(TOBIAS_VERSION, self.tool_name, self.begin_time))
         self.comment("# Working directory: {0}".format(os.getcwd()))
         self.comment("# Command line call: {0}\n".format(" ".join(sys.argv)))
@@ -186,7 +187,7 @@ def output_files(self, outfiles):
 
 class TOBIASFormatter(logging.Formatter):
-    
+    """ Formatter class used in TobiasLogger """
 
     default_fmt = logging.Formatter("%(asctime)s (%(process)d) [%(levelname)s]\t%(message)s", "%Y-%m-%d %H:%M:%S")
     comment_fmt = logging.Formatter("%(message)s")
diff --git a/tobias/utils/motifs.py b/tobias/utils/motifs.py
index 083c713..09e743b 100644
--- a/tobias/utils/motifs.py
+++ b/tobias/utils/motifs.py
@@ -70,6 +70,7 @@ def scan_sequence(self, seq, region):
 
         #Scan sequence
         results = self.moods_scanner.scan(seq)
 
+        #Convert results to RegionList
         sites = RegionList()
         for (matrix, name, strand, result) in zip(self.matrices, self.names, self.strands, results):
             motif_length = len(matrix[0])
@@ -95,7 +96,6 @@ def __init__(self, motifid, alt_name, counts):
 
         self.alt_name = alt_name  #does not have to be unique
         self.name = ""  #name set in set_name
-        #self.out_name
         self.counts = counts  #counts
 
         self.strand = "+"  #default strand is +
@@ -112,6 +112,7 @@ def __str__(self):
 
     def set_name(self, naming="name_id"):
         """ Set name to be used in 4th column """
+
         if naming == "name":
             prefix = self.alt_name
         elif naming == "id":
@@ -186,10 +187,9 @@ def plot_logo(self):
                     'T': 'darkgreen'}
 
     def add_letter(base, x, y, scale, ax):
-        """ """
+        """ Add letter to axis at positions x/y """
         text = LETTERS[base]
-
         t = mpl.transforms.Affine2D().scale(1*globscale, scale*globscale) + \
             mpl.transforms.Affine2D().translate(x,y) + ax.transData
         p = PathPatch(text, lw=0, fc=COLOR_SCHEME[base], transform=t)
@@ -395,6 +395,5 @@ def pfm_to_motifs(content):
             continue
 
     #todo: check correct format of pfms
-
     return(motiflist)
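Note: add_letter() in plot_logo() stretches a glyph with an Affine2D scale followed by a translate, the standard trick for drawing sequence-logo letters whose height encodes information content. A self-contained illustration of the same transform chain — the letter path, color, and scale below are stand-ins, not the module's LETTERS/globscale internals:

    import matplotlib as mpl
    import matplotlib.pyplot as plt
    from matplotlib.textpath import TextPath
    from matplotlib.patches import PathPatch

    fig, ax = plt.subplots()
    text = TextPath((0, 0), "A", size=1, prop=mpl.font_manager.FontProperties(weight="bold"))

    #Scale the glyph vertically, then move it to position (x, y) in data coordinates
    x, y, scale = 0.0, 0.0, 2.0
    t = mpl.transforms.Affine2D().scale(1.0, scale) + mpl.transforms.Affine2D().translate(x, y) + ax.transData
    ax.add_patch(PathPatch(text, lw=0, fc="green", transform=t))

    ax.set_xlim(-0.5, 1.5)
    ax.set_ylim(-0.5, 2.5)
    plt.savefig("letter_demo.png")
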
diff --git a/tobias/utils/peak_annotation.sh b/tobias/utils/peak_annotation.sh
deleted file mode 100644
index 8d07a0c..0000000
--- a/tobias/utils/peak_annotation.sh
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/bin/bash
-
-# author afust
-
-# capabilities:
-# peak annotation
-
-# requirements:
-# UROPA
-# py2
-# R
-
-usage() {
-    echo -e ""
-    echo -e "Usage: $0 [options]"
-    echo -e "Mandatory"
-    echo -e " -i|--bed [] input bed file for annotation"
-    echo -e " -o|--annotation [] peak annotation file"
-    echo -e " -g|--gtf [] gtf file for annotation, check filter attributes"
-    echo -e "Optional"
-    echo -e " -p|--header [bed_header.txt] peak annotation header file"
-    echo -e " -l|--log [input.log] log file"
-    echo -e " -c|--cores [1] number of cores"
-    #echo -e " -j|--universe [] gene universe, can be used for enrichment"
-    echo -e "For UROPA params check: https://uropa-manual.readthedocs.io/config.html"
-    echo -e " -t|--feature [gene] UROPA param"
-    echo -e " -r|--feature_anchor [start] UROPA param"
-    echo -e " -e|--distance [[10000,1000]] UROPA param"
-    echo -e " -a|--filter_attribute [] UROPA param"
-    echo -e " -w|--attribute_value [] UROPA param"
-    echo -e " -n|--show_attribute [gene_name,gene_id,gene_biotype] UROPA param"
-    exit 1
-}
-export LC_ALL=C
-
-# defaults
-cores=1
-feature="gene"
-feature_anchor="start"
-distance=[10000,1000]
-#filter_attribute="gene_biotype"
-#attribute_value="protein_coding"
-show_attribute='"gene_biotype", "gene_name", "gene_id"'
-
-# no option given
-if [ $# -eq 0 ]; then
-    usage
-    exit -1
-fi
-
-# Process command line options
-progname=${0##*/}
-shortopts="i:o:j:p:l:c:g:t:r:e:a:w:n:h"
-longopts="bed:,annotation:,universe:,header:,log:,cores:,gtf:,feature:,feature_anchor:,distance:,filter_attribute:,attribute_value:,show_attribute:"
-opts=$(getopt -s bash --options "$shortopts" --longoptions "$longopts" --name "$progname" -- "$@")
-
-eval set -- ${opts}
-while true; do
-    case "$1" in
-        ( -- ) break ;;
-        ( -h|--help ) usage ;;
-        ( -i|--bed ) bed=$2; shift ;;
-        ( -o|--annotation ) annotation=$2; shift ;;
-        ( -j|--universe ) universe=$2; shift ;;
-        ( -p|--header ) header=$2; shift ;;
-        ( -l|--log ) log=$2; shift ;;
-        ( -c|--cores ) cores=$2; shift ;;
-        ( -g|--gtf ) gtf=$2; shift ;;
-        ( -t|--feature ) feature=$2; shift ;;
-        ( -r|--feature_anchor ) feature_anchor=$2; shift ;;
-        ( -e|--distance ) distance=$2; shift ;;
-        ( -a|--filter_attribute ) filter_attribute=$2; shift ;;
-        ( -w|--attribute_value ) attribute_value=$2; shift ;;
-        ( -n|--show_attribute ) show_attribute=$2; shift ;;
-    esac
-    shift
-done
-
-# process params
-if [[ -z $bed ]] || [[ -z $annotation ]] || [[ -z $gtf ]]; then
-    echo -e "bed file or gtf for annotation or output missing: mandatory!"
-    usage
-    exit -1
-fi
-if [[ -z $header ]]; then
-    tmp=$(cut -f1 -d'.' $(basename ${annotation}))'_header.txt'
-    header=$(dirname ${annotation})'/'$tmp
-fi
-if [[ -z $log ]]; then
-    log=$(basename $bed .bed)'.log'
-fi
-
-if [[ ${show_attribute:0:1} != '"' ]]; then
-    show_attribute=$(echo $show_attribute | sed -r 's/[,]/","/g' | sed -e 's/.*/"&"/')
-fi
-
-prefix="$(dirname $annotation)/$(basename $annotation .bed)"
-json="${prefix}.json"
-
-
-echo -e "Started peak annotation:" 2>&1 | tee -a $log
-# create json config for uropa annotation
-if [[ ! -z $filter_attribute ]] && [[ ! -z $attribute_value ]]; then
-    cat > $json << EOL
-{
-    "queries":[
-        {"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "filter.attribute":"$filter_attribute", "attribute.value":"$attribute_value", "show.attributes":[$show_attribute]},
-        {"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "show.attributes":[$show_attribute]}
-    ],
-    "priority": "True",
-    "gtf": "$gtf",
-    "bed": "$bed"
-}
-EOL
-else
-    cat > $json << EOL
-{
-    "queries":[
-        {"feature":"$feature", "feature.anchor":"$feature_anchor", "distance":$distance, "show.attributes":[$show_attribute]}
-    ],
-    "gtf": "$gtf",
-    "bed": "$bed"
-}
-EOL
-fi
-
-# uropa annotation
-echo -e "uropa -i $json -l $log -t $cores -p $prefix -d -s" 2>&1 | tee -a $log
-uropa -i $json -l $log -t $cores -p $prefix -d -s 2>&1 | tee -a $log
-
-
-# build peaks with annotation
-# create final header of all merged annotated
-attributes=$(($(head -1 ${prefix}_finalhits.txt | awk '{print NF}')-1))
-echo -e "attributes: $attributes"
-`head -1 ${prefix}_finalhits.txt | awk 'BEGIN { OFS = "\n" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' - > $header`
-`head -1 ${prefix}_finalhits.txt | cut -f15-${attributes} - | sed -e 's/\t/\n/g' >> $header`
-# extract columns of interest
-`awk 'BEGIN { OFS = "\t" } ; { print $2,$3,$5,$1,$6,$7,$8,$9,$10,$11 }' ${prefix}_finalhits.txt | sed -e 1d | sed -e 's/\t\t/\t/g' > $annotation'.tmp'`
-cut -f15-${attributes} ${prefix}_finalhits.txt | sed -e 1d | paste -d"\t" $annotation'.tmp' - > $annotation
-sort -k1,1 -k2,2n $annotation > $annotation'.tmp'
-mv $annotation'.tmp' $annotation
-
-# add in case of go enrichment analysis
-#echo -e "universe"
-#col=$(head -1 ${prefix}_allhits.txt | tr -s '\t' '\n' | nl -nln | grep "gene_id" | cut -f1 | tail -1)
-#cut -f${col} ${prefix}_allhits.txt | grep -v "NA" | uniq > $universe
-
-echo -e "Finished peak annotation:" 2>&1 | tee -a $log
diff --git a/tobias/utils/regions.py b/tobias/utils/regions.py
index 83db832..13568da 100644
--- a/tobias/utils/regions.py
+++ b/tobias/utils/regions.py
@@ -569,9 +569,9 @@ class RegionCluster:
 
     def __init__(self, overlap_dict):
 
        self.overlaps = overlap_dict  #overlap dict is from RegionList.count_overlaps
-        
+
         #Added later
-        self.clusters = {}  # ID:{"member_idx":[], "member_names":[], "cluster_name":""}
+        self.clusters = {}  # ID:{"member_idx":[], "member_names":[], "cluster_name":"", "representative"}
 
     def cluster(self, threshold=0.5, method="average"):
         """ Main function to cluster the overlap dictionary into clusters"""
@@ -603,6 +603,8 @@ def cluster(self, threshold=0.5, method="average"):
         self.get_cluster_names()  #Set names of clusters
         self.assign_colors()
 
+    #
+
     def overlap_to_distance(self):
         """ Convert overlap-dict from count_overlaps to distance matrix """
@@ -638,33 +640,37 @@ def get_cluster_names(self):
         self.name2cluster = {}
 
         #Sort clusters by distance scores to other motifs in cluster
-        cluster_i = 1
+        #cluster_i = 1
         for cluster in self.clusters:
             members_idx = self.clusters[cluster]["member_idx"]
 
-            ### Code to assign a specific name to clusters
-            """
-            members_idx = self.clusters[cluster]["member_idx"]
-            members_names = self.clusters[cluster]["member_names"]
-            distances = {}
-            for member in members_idx:
-                distances[member] = np.sum(self.distance_mat[member,:])  #0 if member is one-member cluster
+            if len(members_idx) > 1:
 
-            #Sort cluster by distance; smallest values=most representative first
-            ordered_idx = sorted(distances.keys(), key=lambda member: distances[member])
+                ### Code to assign a specific name to clusters
+                members_idx = self.clusters[cluster]["member_idx"]
+                members_names = self.clusters[cluster]["member_names"]
+                distances = {}
+                for member in members_idx:
+                    distances[member] = np.sum(self.distance_mat[member,:])  #0 if member is one-member cluster
 
-            self.cluster_names[cluster] = "C_" + self.names[ordered_idx[0]]  #cluster is the idx for cluster
-            """
+                #Sort cluster by distance; smallest values=most representative first
+                ordered_idx = sorted(distances.keys(), key=lambda member: distances[member])
 
-            self.cluster_names[cluster] = "C_{0}".format(cluster_i)
-            cluster_i += 1
+                self.clusters[cluster]["representative"] = self.names[ordered_idx[0]]
+                self.cluster_names[cluster] = "C_" + self.names[ordered_idx[0]] + "_" + str(len(members_idx))  #cluster is the idx for cluster
+            else:
+                self.cluster_names[cluster] = self.clusters[cluster]["member_names"][0]
+                self.clusters[cluster]["representative"] = self.cluster_names[cluster]
+
+            self.clusters[cluster]["cluster_name"] = self.cluster_names[cluster]
 
             #Assign every member to its cluster
             for member in members_idx:
                 self.name2cluster[self.names[member]] = self.cluster_names[cluster]
 
+
     def write_distance_mat(self, out_f):
         """ Writes out distance matrix to out_f file """
         np.savetxt(out_f, self.distance_mat, delimiter="\t", header="\t".join(self.names), fmt="%.4f")
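Note: get_cluster_names() now names a multi-member cluster after its most representative member — the one with the smallest summed distance to all others. The selection step in isolation, with a toy distance matrix made up for the demo:

    import numpy as np

    names = ["TF_A", "TF_B", "TF_C"]
    distance_mat = np.array([[0.0, 0.2, 0.9],
                             [0.2, 0.0, 0.8],
                             [0.9, 0.8, 0.0]])

    member_idx = [0, 1, 2]  #members of one cluster
    distances = {m: np.sum(distance_mat[m, :]) for m in member_idx}
    representative = names[min(distances, key=distances.get)]
    cluster_name = "C_" + representative + "_" + str(len(member_idx))
    print(cluster_name)  #C_TF_B_3
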
diff --git a/tobias/utils/signals.pyx b/tobias/utils/signals.pyx
index a472bbf..b660d8f 100644
--- a/tobias/utils/signals.pyx
+++ b/tobias/utils/signals.pyx
@@ -47,7 +47,7 @@ def shuffle_array(np.ndarray[np.float64_t, ndim=1] arr, int no_rand, np.ndarray[
 
 
-#------------------------------------------------------------------------------------------------#
+#--------------------------------------------------------------------------------------------------#
 @cython.boundscheck(False)  #dont check boundaries
 @cython.cdivision(True)  #no check for zero division
 @cython.wraparound(False)  #dont deal with negative indices
@@ -129,7 +129,58 @@ def fast_rolling_math(np.ndarray[np.float64_t, ndim=1] arr, int w, operation):
 
     return roll_arr
 
-#------------------------------------------------------------------------------------------------#
+#--------------------------------------------------------------------------------------------------#
+@cython.boundscheck(False)  #dont check boundaries
+@cython.cdivision(True)  #no check for zero division
+@cython.wraparound(False)  #dont deal with negative indices
+def tobias_footprint_array(np.ndarray[np.float64_t, ndim=1] arr, int flank_min, int flank_max, int fp_min, int fp_max):
+
+    cdef int L = arr.shape[0]
+    cdef np.ndarray[np.float64_t, ndim=1] footprint_scores = np.zeros(L)
+    cdef int i, j, footprint_w, flank_w
+
+    cdef float fp_sum, fp_mean, fp_score, flank_mean
+    cdef float left_sum, right_sum
+
+    #Each position in array, starting at i, going until last possible start of region
+    for i in range(L-2*flank_max-fp_max):
+        for flank_w in range(flank_min, flank_max+1):
+            for footprint_w in range(fp_min, fp_max+1):
+
+                #Sum of left flank
+                left_sum = 0
+                for j in range(flank_w):
+                    if arr[i+j] > 0:
+                        left_sum += arr[i+j]
+
+                #Sum of footprint
+                fp_sum = 0
+                for j in range(footprint_w):
+                    if arr[i+flank_w+j] < 0:
+                        fp_sum += arr[i+flank_w+j]
+
+                #Sum of right flank
+                right_sum = 0
+                for j in range(flank_w):
+                    if arr[i+flank_w+footprint_w+j] > 0:
+                        right_sum += arr[i+flank_w+footprint_w+j]
+
+                #Calculate score
+                fp_mean = fp_sum/(1.0*footprint_w)  #will be minus
+                flank_mean = (right_sum + left_sum)/(2.0*flank_w)
+
+                fp_score = flank_mean - fp_mean  #-- = +
+
+                #Save score across footprint
+                for pos in range(i+flank_w, i+flank_w+footprint_w):
+                    if fp_score > footprint_scores[pos]:
+                        footprint_scores[pos] = fp_score
+
+    return(footprint_scores)
+
+
+"""
+#--------------------------------------------------------------------------------------------------#
 def calc_FOS(np.ndarray[np.float64_t, ndim=1] arr, int fp_min, int fp_max, int flank_min, int flank_max):
 
     cdef int L = arr.shape[0]
@@ -175,57 +226,4 @@ def calc_FOS(np.ndarray[np.float64_t, ndim=1] arr, int fp_min, int fp_max, int f
             footprint_scores[i+flank_w+j] = fos_score
 
     return(footprint_scores)
-
-
-
-#------------------------------------------------------------------------------------------------#
-@cython.boundscheck(False)  #dont check boundaries
-@cython.cdivision(True)  #no check for zero division
-@cython.wraparound(False)  #dont deal with negative indices
-def tobias_footprint_array(np.ndarray[np.float64_t, ndim=1] arr, int window, int fp_min, int fp_max):
-
-    cdef int L = arr.shape[0]
-    cdef np.ndarray[np.float64_t, ndim=1] footprint_scores = np.zeros(L)
-    cdef int i, j, footprint_w, flank_w, flank_left, flank_right
-
-    cdef float fp_sum, fp_mean, fp_score, flank_mean
-    cdef float left_sum, right_sum
-
-    cdef int flank_max = int((window - fp_min)/2)
-
-    #Each position in array, starting at i, going until last possible start of region
-    for i in range(L-2*flank_max-fp_max):
-        for footprint_w in range(fp_min, fp_max+1):
-            flank_left = int((window - footprint_w)/2)
-            flank_right = window - flank_left - footprint_w
-
-            #Sum of left flanking window
-            left_sum = 0
-            for j in range(flank_left):
-                if arr[i+j] > 0:
-                    left_sum += arr[i+j]
-
-            #Sum of footprint
-            fp_sum = 0
-            for j in range(footprint_w):
-                if arr[i+j] < 0:
-                    fp_sum += arr[i+flank_left+j]
-
-            #Sum of right flank
-            right_sum = 0
-            for j in range(flank_right):
-                if arr[i+j] > 0:
-                    right_sum += arr[i+flank_left+footprint_w+j]
-
-            #Calculate score
-            fp_mean = fp_sum/(1.0*footprint_w)  #will be minus
-            flank_mean = (right_sum + left_sum)/(flank_right + flank_left)
-
-            fp_score = flank_mean - fp_mean  #-- = +
-
-            #Save score across footprint
-            for pos in range(i+flank_left, i+flank_left+footprint_w):
-                if fp_score > footprint_scores[pos]:
-                    footprint_scores[pos] = fp_score
-
-    return(footprint_scores)
+"""
\ No newline at end of file
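Note: the rewritten tobias_footprint_array() scores a position as mean(flanks) - mean(footprint), maximizing over footprint and flank widths. A plain-Python check of the score for one fixed geometry — toy signal, single width pair, no maximization:

    import numpy as np

    signal = np.array([1.0, 2.0, -3.0, -4.0, -2.0, 2.0, 1.0])  #cutsites: high flanks, depleted middle
    flank_w, fp_w = 2, 3

    left = signal[0:flank_w]
    fp = signal[flank_w:flank_w+fp_w]
    right = signal[flank_w+fp_w:flank_w+fp_w+flank_w]

    flank_mean = (left[left > 0].sum() + right[right > 0].sum()) / (2.0 * flank_w)
    fp_mean = fp[fp < 0].sum() / float(fp_w)
    print(flank_mean - fp_mean)  #1.5 - (-3.0) = 4.5
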
diff --git a/tobias/utils/utilities.py b/tobias/utils/utilities.py
index 3b39130..8bd34d3 100644
--- a/tobias/utils/utilities.py
+++ b/tobias/utils/utilities.py
@@ -118,7 +118,6 @@ def bigwig_writer(q, key_file_dict, header, regions, args):
     #todo: check if args.log_q exists
     logger = TobiasLogger("", args.verbosity, args.log_q)  #separate core, needs mp logger through queue
     logger.debug("Opened bigwig writer process for {0}".format(key_file_dict))
-    logger.debug("Header: {0}".format(header))
 
     handles = {}
@@ -141,8 +140,6 @@ def bigwig_writer(q, key_file_dict, header, regions, args):
     sorted_region_tups = sorted(region_tups, key=lambda tup: (order_dict[tup[0]], tup[1]))  #sort to same order as bigwig header
     no_regions = len(region_tups)
 
-    #logger.debug("Order of regions: {0}".format(list(set([tup[0] for tup in sorted_region_tups]))))
-
     #Fetching content from queue
     logger.debug("Fetching content from queue")
@@ -164,6 +161,8 @@ def bigwig_writer(q, key_file_dict, header, regions, args):
             #Save key:region:signal to ready_to_write
             ready_to_write[key][region] = signal
 
+            writing_progress = Progress(no_regions, logger, prefix="Writing progress", round=0)
+
             #Check if next-to-write region was done
             for key in handles:
@@ -196,11 +195,9 @@ def bigwig_writer(q, key_file_dict, header, regions, args):
                     i_to_write[key] += 1
                     next_region = sorted_region_tups[i_to_write[key]]  #this is the region to be written next for this key
 
-            #Write out progress (and only once per step)
-
-            #Write out progress
-            #write out progress only after writing something
-            #check if i_to_write[key] is a certain percentage of len(regions)
+            #Writing progress
+            #progress = sum([i_to_write[key] for key in handles])
+            #writing_progress.write(progress)
 
     except Exception:
         import sys, traceback
@@ -211,7 +208,6 @@ def bigwig_writer(q, key_file_dict, header, regions, args):
 
     return(1)
 
-
 def monitor_progress(task_list, logger, prefix="Progress"):
 
     prev_done = 0
@@ -233,11 +229,11 @@ def monitor_progress(task_list, logger, prefix="Progress"):
 
     return()  #doesn't return until the while loop exits
 
-
 #-------------------------------------------------------------------------------------------#
 #------------------------------------- Argparser -------------------------------------------#
 #-------------------------------------------------------------------------------------------#
 
+
 def restricted_float(f, f_min, f_max):
     f = float(f)
     if f < f_min or f > f_max:
@@ -249,7 +245,6 @@
 def restricted_int(integer, i_min, i_max):
     if integer < i_min or integer > i_max:
         raise
 
-
 def format_help_description(name, description, width=90):
     """ Format description of command line tool --help description """
@@ -272,16 +267,42 @@ def format_help_description(name, description, width=90):
 
     return(formatted)
 
-#-------------------------------------------------------------------------------------------#
-#---------------------------------------- Misc ---------------------------------------------#
-#-------------------------------------------------------------------------------------------#
-
 def check_required(args, required):
+    """ Checks required keys in input args """
     for arg in required:
         if getattr(args, arg) == None:
             sys.exit("ERROR: Missing argument --{0}".format(arg))
 
+#-------------------------------------------------------------------------------------------#
+#---------------------------------------- Misc ---------------------------------------------#
+#-------------------------------------------------------------------------------------------#
+
+
+class Progress:
+    """ Class for writing out progress of processes such as multiprocessing """
+
+    def __init__(self, total_elements, logger, prefix="Progress", round=0):
+
+        self.total_elements = total_elements
+        self.logger = logger
+        self.prefix = prefix
+        self.round = round
+        self.last_written = -1
+
+    def write(self, progress):
+        """ Write out progress if it was not already written """
+
+        percent_to_write = progress / self.total_elements * 100
+        if self.round == 0:
+            percent_to_write = round(percent_to_write)
+        else:
+            percent_to_write = round(percent_to_write, self.round)
+
+        #Only write if this level has not already been written
+        if percent_to_write != self.last_written:
+            self.logger.info("{0}: {1}%".format(self.prefix, percent_to_write))
+            self.last_written = percent_to_write
+
 
 def flatten_list(lst):
@@ -344,9 +365,9 @@ def merger(dct, dct_to_add):
 
     return(out_dict)
 
+def filafy(astring):
+    """ Make string into accepted filename """
 
-def filafy(astring):  #Make name into accepted filename
-
     valid_chars = "-_.%s%s" % (string.ascii_letters, string.digits)
     filename = ''.join(char for char in astring if char in valid_chars)
     return(filename)
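Note: the new Progress helper only logs when the rounded percentage changes, so a tight loop does not flood the log. Hypothetical usage — the TobiasLogger setup mirrors the module's own pattern, and the element count is invented:

    from tobias.utils.logger import TobiasLogger
    from tobias.utils.utilities import Progress

    logger = TobiasLogger("Demo")
    progress = Progress(total_elements=2000, logger=logger, prefix="Writing progress", round=0)

    for i in range(2000):
        progress.write(i + 1)  #logs "Writing progress: N%" roughly once per percent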