diff --git a/setup.py b/setup.py index 9adc8cc..6aa54b9 100644 --- a/setup.py +++ b/setup.py @@ -10,14 +10,14 @@ def readme(): Extension("tobias.utils.signals", ["tobias/utils/signals.pyx"], include_dirs=[np.get_include()])] setup(name='tobias', - version='0.1', + version='0.2', description='Transcription factor Occupancy prediction By Investigation of ATAC-seq Signal', long_description=readme(), url='https://github.molgen.mpg.de/loosolab/TOBIAS', author='Mette Bentsen', author_email='mette.bentsen@mpi-bn.mpg.de', license='MIT', - packages=['tobias', 'tobias.footprinting', 'tobias.utils', 'tobias.plotting', 'tobias.motifs'], + packages=['tobias', 'tobias.footprinting', 'tobias.plotting', 'tobias.motifs', 'tobias.misc', 'tobias.utils'], entry_points = { 'console_scripts': ['TOBIAS=tobias.TOBIAS:main'] }, diff --git a/snakemake_pipeline/TOBIAS.snake b/snakemake_pipeline/TOBIAS.snake index 6219e01..e7eef8e 100644 --- a/snakemake_pipeline/TOBIAS.snake +++ b/snakemake_pipeline/TOBIAS.snake @@ -130,8 +130,8 @@ if len(CONDITION_IDS) > 1: output_files.append(expand(os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS)) -output_files.append(os.path.join(OUTPUTDIR, "TFBS", "bindetect_results.txt")) -output_files.append(os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt")) +#output_files.append(os.path.join(OUTPUTDIR, "TFBS", "bindetect_results.txt")) +#output_files.append(os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt")) #Visualization output_files.extend(expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_{plotname}.pdf"), TF=TF_IDS, plotname=PLOTNAMES)) diff --git a/snakemake_pipeline/TOBIAS_example.config b/snakemake_pipeline/TOBIAS_example.config index 6045606..9f394a5 100644 --- a/snakemake_pipeline/TOBIAS_example.config +++ b/snakemake_pipeline/TOBIAS_example.config @@ -28,5 +28,5 @@ uropa: "--feature gene --feature_anchor start --distance [10000,1000] --filter_a atacorrect: "" footprinting: "" -bindetect: "" +bindetect: "--prefix immune" plotting: "" diff --git a/snakemake_pipeline/environments/tobias.yaml b/snakemake_pipeline/environments/tobias.yaml index 30e55ee..8775ad2 100644 --- a/snakemake_pipeline/environments/tobias.yaml +++ b/snakemake_pipeline/environments/tobias.yaml @@ -5,6 +5,7 @@ channels: - conda-forge dependencies: + - python=3 - pysam - pybigwig - moods @@ -22,6 +23,5 @@ dependencies: - openjdk - xlsxwriter - cloudpickle=0.5.6 - - seaborn - pip: - adjustText \ No newline at end of file diff --git a/snakemake_pipeline/snakefiles/footprinting.snake b/snakemake_pipeline/snakefiles/footprinting.snake index 6676bbc..2fde401 100644 --- a/snakemake_pipeline/snakefiles/footprinting.snake +++ b/snakemake_pipeline/snakefiles/footprinting.snake @@ -73,9 +73,8 @@ rule bindetect: output: bedfiles = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_{suffix}.bed"), TF=TF_IDS, suffix=["all"] + expand("{condition}_{state}", condition=CONDITION_IDS, state=["bound", "unbound"])), overview = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "{TF}_overview.txt"), TF=TF_IDS), - figures = os.path.join(OUTPUTDIR, "TFBS", "bindetect_figures.pdf"), - local = [os.path.join(OUTPUTDIR, "TFBS", fil) for fil in ["bindetect_results.txt", "bindetect_results.xlsx"]], - + #figures = os.path.join(OUTPUTDIR, "TFBS", "bindetect_figures.pdf"), + #local = [os.path.join(OUTPUTDIR, "TFBS", fil) for fil in ["bindetect_results.txt", "bindetect_results.xlsx"]], threads: 99 priority: 2 log: @@ -88,7 +87,11 @@ rule bindetect: "Running 
BINDetect" shell: "TOBIAS BINDetect --motifs {input.motifs} --signals {input.footprints} --genome {input.genome} --peaks {input.peaks} --peak_header {input.peak_header} --cores {threads} {params} &> {log}; " - + "mkdir -p " + os.path.join(OUTPUTDIR, "overview") + ";" + "mv " + os.path.join(OUTPUTDIR, "TFBS", "*.txt") + " " + os.path.join(OUTPUTDIR, "overview") + ";" #move files to overview + "mv " + os.path.join(OUTPUTDIR, "TFBS", "*.xlsx") + " " + os.path.join(OUTPUTDIR, "overview") + ";" + "mv " + os.path.join(OUTPUTDIR, "TFBS", "*.pdf") + " " + os.path.join(OUTPUTDIR, "overview") + ";" +""" rule copy_to_overview: input: [os.path.join(OUTPUTDIR, "TFBS", fil) for fil in ["bindetect_results.txt", "bindetect_results.xlsx"]], @@ -96,4 +99,5 @@ rule copy_to_overview: [os.path.join(OUTPUTDIR, "overview", fil) for fil in ["bindetect_results.txt", "bindetect_results.xlsx"]] priority: 2 shell: - "cp " + os.path.join(OUTPUTDIR, "TFBS", "bindetect_*") + " " + os.path.join(OUTPUTDIR, "overview") + "mv " + os.path.join(OUTPUTDIR, "TFBS", "*.*") + " " + os.path.join(OUTPUTDIR, "overview") #move files to overview +""" \ No newline at end of file diff --git a/snakemake_pipeline/snakefiles/visualization.snake b/snakemake_pipeline/snakefiles/visualization.snake index 2d2c38a..3acf6c6 100644 --- a/snakemake_pipeline/snakefiles/visualization.snake +++ b/snakemake_pipeline/snakefiles/visualization.snake @@ -47,13 +47,15 @@ rule plot_aggregate_within: signals = [os.path.join(OUTPUTDIR, "bias_correction", "{condition}_" + state + ".bw") for state in ["uncorrected", "expected", "corrected"]], output: os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_{condition}_aggregate.pdf") + log: + os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "logs", "{TF}_{condition}_aggregate.log") message: "Plotting split between bound/unbound around TFBS for TF \"{wildcards.TF}\" in condition \"{wildcards.condition}\"" params: "--title 'Bias correction and split for {TF} in condition {condition}'", - "--share_y rows", + "--share_y sites", "--plot_boundaries", shell: - "TOBIAS PlotAggregate --TFBS {input.TFBS} --signals {input.signals} --output {output} {params} >/dev/null " + "TOBIAS PlotAggregate --TFBS {input.TFBS} --signals {input.signals} --output {output} {params} > {log} " #Aggregates across conditions for all and for bound subsets @@ -64,7 +66,9 @@ rule plot_aggregate_across: signals = expand(os.path.join(OUTPUTDIR, "bias_correction", "{condition}_corrected.bw"), condition=CONDITION_IDS), output: all_compare = os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_aggregate_comparison_all.pdf"), - bound_compare = os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_aggregate_comparison_bound.pdf") + bound_compare = os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "{TF}_aggregate_comparison_bound.pdf"), + all_log = os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "logs", "{TF}_aggregate_comparison_all.log"), + bound_log = os.path.join(OUTPUTDIR, "TFBS", "{TF}", "plots", "logs", "{TF}_aggregate_comparison_bound.log") priority: 2 params: "--title {0}".format("{TF}"), @@ -72,8 +76,8 @@ rule plot_aggregate_across: "--share_y both", message: "Plotting comparison of cutsite signals for \"{wildcards.TF}\" between conditions" shell: - "TOBIAS PlotAggregate --TFBS {input.TFBS_all} --signals {input.signals} --output {output.all_compare} {params} >/dev/null; " - "TOBIAS PlotAggregate --TFBS {input.TFBS_bound} --signals {input.signals} --output {output.bound_compare} {params} >/dev/null;" + "TOBIAS PlotAggregate --TFBS 
{input.TFBS_all} --signals {input.signals} --output {output.all_compare} {params} > {output.all_log}; " + "TOBIAS PlotAggregate --TFBS {input.TFBS_bound} --signals {input.signals} --output {output.bound_compare} {params} > {output.bound_log};" #----------------------------------------------------------------# diff --git a/tobias/TOBIAS.py b/tobias/TOBIAS.py index 0dab503..b04a693 100644 --- a/tobias/TOBIAS.py +++ b/tobias/TOBIAS.py @@ -19,17 +19,19 @@ from tobias.plotting.plot_aggregate import * from tobias.plotting.plot_heatmap import * -from tobias.plotting.plot_bindetect import * from tobias.plotting.plot_changes import * from tobias.motifs.tfbscan import * from tobias.motifs.format_motifs import * +from tobias.motifs.cluster_tfbs import * +from tobias.motifs.score_bed import * -from tobias.utils.subsample_bam import * -from tobias.utils.merge_pdfs import * -from tobias.utils.score_bed import * +from tobias.misc.subsample_bam import * +from tobias.misc.merge_pdfs import * +from tobias.misc.maxpos import * -TOBIAS_VERSION = "0.1" + +TOBIAS_VERSION = "0.2" #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Change here :-) def main(): parser = argparse.ArgumentParser("TOBIAS", usage=SUPPRESS) @@ -98,14 +100,14 @@ def main(): formatmotifs_parser.set_defaults(func=run_formatmotifs) all_tool_parsers[name.lower()] = formatmotifs_parser - """ - name, hlp = "ClusterTF", "Cluster TFs based on overlap of sites" + + name, hlp = "ClusterTFBS", "Cluster TFs based on overlap of sites" parser.description += " {0}\t\t{1}\n".format(name, hlp) clustering_parser = subparsers.add_parser(name, usage=SUPPRESS) clustering_parser = add_clustering_arguments(clustering_parser) clustering_parser.set_defaults(func=run_clustering) - all_tool_parsers[name] = clustering_parser - """ + all_tool_parsers[name.lower()] = clustering_parser + name, hlp = "ScoreBed", "Score .bed-file with signal from .bigwig-file(s)" parser.description += " {0}\t\t{1}\n".format(name, hlp) @@ -136,15 +138,6 @@ def main(): heatmap_parser.set_defaults(func=run_heatmap) all_tool_parsers[name.lower()] = heatmap_parser - - name, hlp = "PlotBINDetect", "Plotting function from BINDetect (to re-plot output)" - parser.description += " {0}\t{1}\n".format(name, hlp) - diffplot_parser = subparsers.add_parser(name, usage=SUPPRESS) - diffplot_parser = add_diffplot_arguments(diffplot_parser) - diffplot_parser.set_defaults(func=run_diffplot) - all_tool_parsers[name.lower()] = diffplot_parser - - name, hlp = "PlotChanges", "Plot changes in TF binding across multiple conditions (from BINDetect output)" parser.description += " {0}\t\t{1}\n".format(name, hlp) changeplot_parser = subparsers.add_parser(name, usage=SUPPRESS) @@ -167,6 +160,13 @@ def main(): mergepdf_parser.set_defaults(func=run_mergepdf) all_tool_parsers[name.lower()] = mergepdf_parser + name, hlp = "MaxPos", "Get .bed-positions of highest bigwig signal within .bed-regions" + parser.description += " {0}\t\t{1}\n".format(name, hlp) + maxpos_parser = subparsers.add_parser(name, usage=SUPPRESS) + maxpos_parser = add_maxpos_arguments(maxpos_parser) + maxpos_parser.set_defaults(func=run_maxpos) + all_tool_parsers[name.lower()] = maxpos_parser + name, hlp = "SubsampleBam", "Subsample a .bam-file using samtools" parser.description += " {0}\t\t{1}\n".format(name, hlp) subsample_parser = subparsers.add_parser(name, usage=SUPPRESS) diff --git a/tobias/footprinting/ATACorrect.py b/tobias/footprinting/ATACorrect.py index cea5582..8a6ede7 100644 --- a/tobias/footprinting/ATACorrect.py +++ 
b/tobias/footprinting/ATACorrect.py @@ -43,6 +43,7 @@ from tobias.utils.regions import * from tobias.utils.sequences import * from tobias.utils.ngs import * +from tobias.utils.logger import * #np.seterr(divide='raise', invalid='raise') @@ -75,7 +76,7 @@ def add_atacorrect_arguments(parser): optargs.add_argument('--regions_out', metavar="", help="Output regions (default: peaks.bed)") optargs.add_argument('--blacklist', metavar="", help="Blacklisted regions in .bed-format (default: None)") #file containing blacklisted regions to be excluded from analysis") optargs.add_argument('--extend', metavar="", type=int, help="Extend output regions with basepairs upstream/downstream (default: 100)", default=100) - optargs.add_argument('--split_strands', help="Calculate and correct bias separately per strand", action="store_true") + optargs.add_argument('--split_strands', help="Write out tracks per strand", action="store_true") optargs.add_argument('--norm_off', help="Switches off normalization based on number of reads", action='store_true') optargs.add_argument('--track_off', metavar="", help="Switch off writing of individual .bigwig-tracks (uncorrected/bias/expected/corrected)", nargs="*", choices=["uncorrected", "bias", "expected", "corrected"], default=[]) @@ -85,18 +86,14 @@ def add_atacorrect_arguments(parser): optargs.add_argument('--bg_shift', metavar="", type=int, help="Read shift for estimation of background frequencies (default: 100)", default=100) optargs.add_argument('--window', metavar="", help="Window size for calculating expected signal (default: 100)", type=int, default=100) optargs.add_argument('--score_mat', metavar="", help="Type of matrix to use for bias estimation (PWM/DWM) (default: DWM)", choices=["PWM", "DWM"], default="DWM") - #optargs.add_argument('-m', '--method', metavar="", help="Method for correcting cutsites (subtract/log2fc) (default: subtract)", choices=["log2fc", "subtract"], default="subtract") runargs = parser.add_argument_group('Run arguments') runargs.add_argument('--prefix', metavar="", help="Prefix for output files (default: same as .bam file)") runargs.add_argument('--outdir', metavar="", help="Output directory for files (default: current working directory)", default="") - - ##shared across TOBIAS runargs.add_argument('--cores', metavar="", type=int, help="Number of cores to use for computation (default: 1)", default=1) runargs.add_argument('--split', metavar="", type=int, help="Split of multiprocessing jobs (default: 100)", default=100) - runargs.add_argument('--verbosity', metavar="", help="Level of output logging (1 (sparse) / 2 (normal) / 3 (debug)) (default: 2)", choices=[1,2,3], default=2, type=int) - runargs.add_argument('--log', metavar="", help="Full path of logfile (default: log is printed to stdout)") - runargs.add_argument('--debug', help=argparse.SUPPRESS, action='store_true') + + runargs = add_logger_args(runargs) return(parser) @@ -107,12 +104,10 @@ def add_atacorrect_arguments(parser): def run_atacorrect(args): """ - Function for bias correction + Function for bias correction of input .bam files Calls functions in ATACorrect_functions and several internal classes """ - begin_time = datetime.now() - #Test if required arguments were given: if args.bam == None: sys.exit("Error: No .bam-file given") @@ -122,14 +117,13 @@ def run_atacorrect(args): sys.exit("Error: No .peaks-file given") #Adjust input files to full path - args.bam = os.path.abspath(args.bam) - args.genome = os.path.abspath(args.genome) - args.peaks = os.path.abspath(args.peaks) if 
args.peaks != None else None + #args.bam = os.path.abspath(args.bam) + #args.genome = os.path.abspath(args.genome) + #args.peaks = os.path.abspath(args.peaks) if args.peaks != None else None #Adjust some parameters depending on input args.prefix = os.path.splitext(os.path.basename(args.bam))[0] if args.prefix == None else args.prefix args.outdir = os.path.abspath(args.outdir) if args.outdir != None else os.path.abspath(os.getcwd()) - args.log = os.path.abspath(args.log) if args.log != None else None #Set output bigwigs based on input tracks = ["uncorrected", "bias", "expected", "corrected"] @@ -143,58 +137,52 @@ def run_atacorrect(args): output_bws = {} for track in tracks: output_bws[track] = {} - for strand in strands + ["both"]: + for strand in strands: elements = [args.prefix, track] if strand == "both" else [args.prefix, track, strand] output_bws[track][strand] = {"fn": os.path.join(args.outdir, "{0}.bw".format("_".join(elements)))} #Set all output files bam_out = os.path.join(args.outdir, args.prefix + "_atacorrect.bam") - bigwigs = [output_bws[track][strand]["fn"] for (track, strand) in itertools.product(tracks, strands + ["both"])] + bigwigs = [output_bws[track][strand]["fn"] for (track, strand) in itertools.product(tracks, strands)] figures_f = os.path.join(args.outdir, "{0}_atacorrect.pdf".format(args.prefix)) - log_f = args.log - output_files = bigwigs + [figures_f, log_f] + output_files = bigwigs + [figures_f] output_files = list(OrderedDict.fromkeys(output_files)) #remove duplicates due to "both" option + strands = ["forward", "reverse"] #----------------------------------------------------------------------------------------------------# # Print info on run #----------------------------------------------------------------------------------------------------# - logger = create_logger(args.verbosity, args.log) - - logger.comment("#TOBIAS ATACorrect (run started {0})\n".format(begin_time)) - logger.comment("#Command line call: {0}\n".format(" ".join(sys.argv))) + logger = TobiasLogger("ATACorrect", args.verbosity) + logger.begin() parser = add_atacorrect_arguments(argparse.ArgumentParser()) - logger.comment(arguments_overview(parser, args)) - - logger.comment("# ----- Output files -----") - for outf in output_files: - if outf != None: - logger.comment("# {0}".format(outf)) - logger.comment("\n\n") - + logger.arguments_overview(parser, args) + logger.output_files(output_files) #----------------------------------------------------------------------------------------------------# # Test input file availability for reading #----------------------------------------------------------------------------------------------------# - logger.critical("----- Processing input data -----") + logger.info("----- Processing input data -----") + + #Todo: use TOBIAS functions #Input test - logger.info("Testing input file availability") + logger.debug("Testing input file availability") file_list = [args.bam, args.genome, args.peaks] file_list = [file for file in file_list if file != None] #some files can be None depending on choice for path in file_list: if not os.path.exists(path): - logger.info("\nError: {0} does not exist.".format(path)) + logger.error("\nError: {0} does not exist.".format(path)) sys.exit(1) - logger.info("Testing output directory/file writeability") + logger.debug("Testing output directory/file writeability") make_directory(args.outdir) if not os.access(args.outdir, os.W_OK): - logger.info("Error: {0} does not exist or is not writeable.".format(args.outdir)) + 
logger.error("Error: {0} does not exist or is not writeable.".format(args.outdir)) sys.exit(1) #Output write test @@ -203,13 +191,12 @@ def run_atacorrect(args): continue if os.path.exists(path): if not os.access(path, os.W_OK): - logger.info("Error: {0} could not be opened for writing.".format(path)) + logger.error("Error: {0} could not be opened for writing.".format(path)) sys.exit(1) #Open pdf for figures figure_pdf = PdfPages(figures_f, keep_empty=True) - #----------------------------------------------------------------------------------------------------# # Read information in bam/fasta #----------------------------------------------------------------------------------------------------# @@ -217,8 +204,8 @@ def run_atacorrect(args): logger.info("Reading info from .bam file") bamfile = pysam.AlignmentFile(args.bam, "rb") if bamfile.has_index() == False: - logger.info("ERROR: No index found for bamfile") - sys.exit() + logger.warning("No index found for bamfile - creating one via pysam.") + pysam.index(args.bam) bam_references = bamfile.references #chromosomes in correct order bam_chrom_info = dict(zip(bamfile.references, bamfile.lengths)) @@ -235,8 +222,8 @@ def run_atacorrect(args): bamlen = bam_chrom_info[chrom] fastalen = fasta_chrom_info[chrom] if bamlen != fastalen: - logger.critical("(Fastafile)\t{0} has length {1}".format(chrom, fasta_chrom_info[chrom])) - logger.critical("(Bamfile)\t{0} has length {1}".format(chrom, bam_chrom_info[chrom])) + logger.warning("(Fastafile)\t{0} has length {1}".format(chrom, fasta_chrom_info[chrom])) + logger.warning("(Bamfile)\t{0} has length {1}".format(chrom, bam_chrom_info[chrom])) sys.exit("Error: .bam and .fasta have different chromosome lengths. Please make sure the genome file is similar to the one used in mapping.") @@ -278,8 +265,8 @@ def run_atacorrect(args): #Remove blacklisted regions and chromosomes not in common blacklist_regions = RegionList().from_bed(args.blacklist) if args.blacklist != None else RegionList([]) #fill in with regions from args.blacklist - regions_dict = {"input_regions":input_regions, "output_regions":output_regions, "peak_regions":peak_regions, "nonpeak_regions":nonpeak_regions} - for sub in regions_dict: + regions_dict = {"genome": genome_regions, "input_regions":input_regions, "output_regions":output_regions, "peak_regions":peak_regions, "nonpeak_regions":nonpeak_regions, "blacklist_regions": blacklist_regions} + for sub in ["input_regions", "output_regions", "peak_regions", "nonpeak_regions"]: regions_sub = regions_dict[sub] regions_sub.subtract(blacklist_regions) regions_sub = regions_sub.apply_method(OneRegion.split_region, 50000) @@ -287,12 +274,6 @@ def run_atacorrect(args): regions_sub.keep_chroms(chrom_in_common) regions_dict[sub] = regions_sub - input_regions = regions_dict["input_regions"] - output_regions = regions_dict["output_regions"] - peak_regions = regions_dict["peak_regions"] - nonpeak_regions = regions_dict["nonpeak_regions"] - - #write beds to look at in igv #input_regions.write_bed(os.path.join(args.outdir, "input_regions.bed")) #output_regions.write_bed(os.path.join(args.outdir, "output_regions.bed")) @@ -304,19 +285,17 @@ def run_atacorrect(args): chrom_order = {bam_references[i]:i for i in range(len(bam_references))} #for use later when sorting output #### Statistics about regions #### - genome_bp = sum([region.get_length() for region in genome_regions]) - blacklist_bp = sum([region.get_length() for region in blacklist_regions]) - peak_bp = sum([region.get_length() for region in 
peak_regions])
-	nonpeak_bp = sum([region.get_length() for region in nonpeak_regions])
-	input_bp = sum([region.get_length() for region in input_regions])
-	output_bp = sum([region.get_length() for region in output_regions])
-
-	logger.info("INFO\tGENOME\t{0} ({1:.2f}%)".format(genome_bp, genome_bp/genome_bp*100))
-	logger.info("INFO\tBLACKLIST_REGIONS\t{0} ({1:.2f}%)".format(blacklist_bp, blacklist_bp/genome_bp*100))
-	logger.info("INFO\tPEAK_REGIONS\t{0} ({1:.2f}%)".format(peak_bp, peak_bp/genome_bp*100))
-	logger.info("INFO\tNONPEAK_REGIONS\t{0} ({1:.2f}%)".format(nonpeak_bp, nonpeak_bp/genome_bp*100))
-	logger.info("INFO\tINPUT_REGIONS\t{0} ({1:.2f}%)".format(input_bp, input_bp/genome_bp*100))
-	logger.info("INFO\tOUTPUT_REGIONS\t{0} ({1:.2f}%)".format(output_bp, output_bp/genome_bp*100))
+	genome_bp = sum([region.get_length() for region in regions_dict["genome"]])
+	for key in regions_dict:
+
+		total_bp = sum([region.get_length() for region in regions_dict[key]])
+		logger.stats("{0}: {1} regions | {2} bp | {3:.2f}% coverage".format(key, len(regions_dict[key]), total_bp, total_bp/genome_bp*100))
+
+	#Establish variables for regions to be used
+	input_regions = regions_dict["input_regions"]
+	output_regions = regions_dict["output_regions"]
+	peak_regions = regions_dict["peak_regions"]
+	nonpeak_regions = regions_dict["nonpeak_regions"]

	#----------------------------------------------------------------------------------------------------#
	# Estimate normalization factors
@@ -324,55 +303,51 @@

	#Setup logger queue
	logger.debug("Setting up listener for log")
-	log_q = mp.Manager().Queue()
-	args.log_q = log_q
-	listener = mp.Process(target=main_logger_process, args=(log_q, logger))
-	listener.start()
+	logger.start_logger_queue()
+	args.log_q = logger.queue

	#----------------------------------------------------------------------------------------------------#
	logger.comment("")
-	logger.critical("----- Estimating normalization factors -----")
+	logger.info("----- Estimating normalization factors -----")

	#If normalization is to be calculated
	if not args.norm_off:

		#Reads in peaks/nonpeaks
-		logger.info("Counting reads in peak regions...")
+		logger.info("Counting reads in peak regions")
		peak_region_chunks = peak_regions.chunks(args.split)
		reads_peaks = sum(run_parallel(count_reads, peak_region_chunks, [args], args.cores, logger))
		logger.comment("")

-		logger.info("Counting reads in nonpeak regions...")
+		logger.info("Counting reads in nonpeak regions")
		nonpeak_region_chunks = nonpeak_regions.chunks(args.split)
		reads_nonpeaks = sum(run_parallel(count_reads, nonpeak_region_chunks, [args], args.cores, logger))

-		total_reads = reads_peaks + reads_nonpeaks
-
-		logger.comment("")
-		logger.info("INFO\tTOTAL_READS\t{0}".format(total_reads))
-		logger.info("INFO\tPEAK_READS\t{0}".format(reads_peaks))
-		logger.info("INFO\tNONPEAK_READS\t{0}".format(reads_nonpeaks))
+		reads_total = reads_peaks + reads_nonpeaks

-		lib_norm = 10000000/total_reads
-		noise_norm = reads_nonpeaks/reads_peaks
-		correct_factor = lib_norm*noise_norm
+		logger.stats("TOTAL_READS\t{0}".format(reads_total))
+		logger.stats("PEAK_READS\t{0}".format(reads_peaks))
+		logger.stats("NONPEAK_READS\t{0}".format(reads_nonpeaks))

-		logger.info("INFO\tLIB_NORM\t{0:.5f}".format(lib_norm))
-		logger.info("INFO\tNOISE_NORM\t{0:.5f}".format(noise_norm))
+		lib_norm = 10000000/reads_total
+		frip = reads_peaks/reads_total
+		correct_factor = lib_norm*(1/frip)
+		logger.stats("LIB_NORM\t{0:.5f}".format(lib_norm))
+
logger.stats("FRiP\t{0:.5f}".format(frip)) else: logger.info("Normalization was switched off") correct_factor = 1.0 - logger.info("INFO\tCORRECTION_FACTOR:\t{0:.5f}".format(correct_factor)) + logger.stats("CORRECTION_FACTOR:\t{0:.5f}".format(correct_factor)) #----------------------------------------------------------------------------------------------------# # Estimate sequence bias #----------------------------------------------------------------------------------------------------# logger.comment("") - logger.critical("Started estimation of sequence bias...") + logger.info("Started estimation of sequence bias...") input_region_chunks = input_regions.chunks(args.split) #split to 100 chunks (also decides the step of output) out_lst = run_parallel(bias_estimation, input_region_chunks, [args], args.cores, logger) #Output is list of AtacBias objects @@ -402,120 +377,90 @@ def run_atacorrect(args): #----------------------------------------------------------------------------------------------------# logger.comment("") - logger.critical("Correcting reads from .bam within output regions") - - L = 2 * args.k_flank + 1 + logger.info("----- Correcting reads from .bam within output regions -----") - #Getting bigwig files ready - header = [(chrom, bam_chrom_info[chrom]) for chrom in bam_references] + output_regions.loc_sort(bam_references) #sort in order of references + output_regions_chunks = output_regions.chunks(args.split) + no_tasks = float(len(output_regions_chunks)) + chunk_sizes = [len(chunk) for chunk in output_regions_chunks] + logger.debug("All regions chunked: {0} ({1})".format(len(output_regions), chunk_sizes)) + ### Create key-file linking for bigwigs + key2file = {} for track in output_bws: for strand in output_bws[track]: filename = output_bws[track][strand]["fn"] - output_bws[track][strand]["pybw"] = pyBigWig.open(filename, "w") - output_bws[track][strand]["pybw"].addHeader(header) + key = "{}:{}".format(track, strand) + key2file[key] = filename - pre_bias = {} - post_bias = {} - for direction in strands: - pre_bias[direction] = SequenceMatrix.create(L, "PWM") - post_bias[direction] = SequenceMatrix.create(L, "PWM") + #Start correction/write cores + n_bigwig = len(key2file.values()) + writer_cores = min(n_bigwig, max(1,int(args.cores*0.1))) #at most one core per bigwig or 10% of cores (or 1) + worker_cores = max(1, args.cores - writer_cores) + logger.debug("Worker cores: {0}".format(worker_cores)) + logger.debug("Writer cores: {0}".format(writer_cores)) - output_regions_chunks = output_regions.chunks(args.split) - no_tasks = float(len(output_regions_chunks)) - - #Start correction - pool = mp.Pool(processes=args.cores) - task_list = [pool.apply_async(bias_correction, args=[chunk, args, bias_obj]) for chunk in output_regions_chunks] - pool.close() - - #Process results as they come in - write_idx = 0 #index in task_list - prev_progress = (-1, -1) - while write_idx < len(task_list): - - if task_list[write_idx].ready() == True: - - result = task_list[write_idx].get() - signals = result[0] - pre_bias_chunk = result[1][0] - post_bias_chunk = result[1][1] - - for direction in strands: - pre_bias[direction].add_counts(pre_bias_chunk[direction]) - post_bias[direction].add_counts(post_bias_chunk[direction]) - - #Write tracks to bigwig file - for region in sorted(signals.keys(), key=lambda tup: (chrom_order[tup[0]], tup[1], tup[2])): #Ensure that positions are written to bigwig in correct order - - chrom, reg_start, reg_end = region - positions = np.arange(reg_start, reg_end) #genomic 0-based 
positions - - for track in tracks: # only prints chosen tracks - - #Join signals if forward/reverse split - if args.split_strands: - signals[region][track]["both"] = signals[region][track]["forward"] + signals[region][track]["reverse"] - - strands_found = signals[region][track] - for strand in strands_found: - - signal = np.copy(signals[region][track][strand]) #Numpy array of signal - - signal[np.isclose(signal, 0)] = 0 #adjust for weird numpy floating point - included = signal.nonzero()[0] - pos = positions[included] - val = signal[included] - - if len(pos) > 0: - try: - output_bws[track][strand]["pybw"].addEntries(chrom, pos, values=val, span=1) - except: + worker_pool = mp.Pool(processes=worker_cores) + writer_pool = mp.Pool(processes=writer_cores) + manager = mp.Manager() - logger.info("Error writing region {0}.".format(region)) - print("TRACK: {0}".format(track)) - print("STRAND: {0}".format(strand)) - print("SIGNAL: {0}".format(signals[region][track][strand])) - sys.exit() - - #Clear memory - del result - del signal - gc.collect() - - #Wait for next chunk to print - write_idx += 1 #This is also the same as the number of prints done - - tasks_done = sum([task.ready() for task in task_list]) - if tasks_done != prev_progress[0] or write_idx != prev_progress[1]: - logger.info("Correction progress: {0:.0f}% | Writing progress: {1:.0f}%".format(tasks_done/no_tasks*100, write_idx/no_tasks*100)) - - prev_progress = (tasks_done, write_idx) - - #Done computing - pool.join() - gc.collect() + #Start bigwig file writers + header = [(chrom, bam_chrom_info[chrom]) for chrom in bam_references] + key_chunks = [list(key2file.keys())[i::writer_cores] for i in range(writer_cores)] + qs_list = [] + qs = {} + for chunk in key_chunks: + logger.debug("Creating writer queue for {0}".format(chunk)) - #Done writing to files - logger.info("Closing bigwig files (this can take a while)") - for track in output_bws: - for strand in output_bws[track]: - output_bws[track][strand]["pybw"].close() + q = manager.Queue() + qs_list.append(q) + files = [key2file[key] for key in chunk] + writer_pool.apply_async(bigwig_writer, args=(q, dict(zip(chunk, files)), header, output_regions, args)) #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x))) + for key in chunk: + qs[key] = q + args.qs = qs + writer_pool.close() #no more jobs applied to writer_pool - #---------------------------------------------------# - - logger.debug("Waiting for listener to finish") - log_q.put(None) - while listener.exitcode != 0: - logger.debug("Listener exitcode is: {0}".format(listener.exitcode)) - time.sleep(1) - - logger.debug("Joining listener") - listener.join() + #Start correction + logger.debug("Starting correction") + task_list = [worker_pool.apply_async(bias_correction, args=[chunk, args, bias_obj]) for chunk in output_regions_chunks] + worker_pool.close() + monitor_progress(task_list, logger, "Correction progress:") #does not exit until tasks in task_list finished + results = [task.get() for task in task_list] + + #Get all results + pre_bias = results[0][0] #initialize with first result + post_bias = results[0][1] #initialize with first result + for result in results[1:]: + pre_bias_chunk = result[0] + post_bias_chunk = result[1] + + for direction in strands: + pre_bias[direction].add_counts(pre_bias_chunk[direction]) + post_bias[direction].add_counts(post_bias_chunk[direction]) + + #Stop all queues for writing + logger.debug("Stop all queues by inserting None") + for q in qs_list: + q.put((None, None, None)) + + 
logger.debug("Joining bigwig_writer queues") + qsum = sum([q.qsize() for q in qs_list]) + while qsum != 0: + qsum = sum([q.qsize() for q in qs_list]) + logger.spam("- Queue sizes {0}".format([(key, qs[key].qsize()) for key in qs])) + time.sleep(0.5) + + #Waits until all queues are closed + writer_pool.join() + worker_pool.terminate() + worker_pool.join() + #Stop multiprocessing logger + logger.stop_logger_queue() #----------------------------------------------------------------------------------------------------# # Information and verification of corrected read frequencies @@ -527,9 +472,11 @@ def run_atacorrect(args): #Calculating variance per base for strand in strands: - #Join negative/positive counts from post-correction bias + #Invert negative counts abssum = np.abs(np.sum(post_bias[strand].neg_counts, axis=0)) - post_bias[strand].neg_counts = abssum + post_bias[strand].neg_counts + post_bias[strand].neg_counts = post_bias[strand].neg_counts + abssum + + #Join negative/positive counts post_bias[strand].counts += post_bias[strand].neg_counts #now pos pre_bias[strand].prepare_mat() @@ -537,8 +484,8 @@ def run_atacorrect(args): pre_var = np.mean(np.var(pre_bias[strand].bias_pwm, axis=1)[:4]) #mean of variance per nucleotide post_var = np.mean(np.var(post_bias[strand].bias_pwm, axis=1)[:4]) - logger.info("INFO\tpre-bias variance {0}:\t{1:.7f}".format(strand, pre_var)) - logger.info("INFO\tpost-bias variance {0}:\t{1:.7f}".format(strand, post_var)) + logger.stats("BIAS\tpre-bias variance {0}:\t{1:.7f}".format(strand, pre_var)) + logger.stats("BIAS\tpost-bias variance {0}:\t{1:.7f}".format(strand, post_var)) #Plot figure fig_title = "Nucleotide frequencies in corrected reads\n({0} strand)".format(strand) @@ -550,10 +497,7 @@ def run_atacorrect(args): #----------------------------------------------------------------------------------------------------# figure_pdf.close() - end_time = datetime.now() - logger.comment("") - logger.critical("Finished ATACorrect run (total time of {0})".format(end_time - begin_time)) - + logger.end() #--------------------------------------------------------------------------------------------------------# if __name__ == '__main__': diff --git a/tobias/footprinting/ATACorrect_functions.py b/tobias/footprinting/ATACorrect_functions.py index e5ebd99..5f7bb3f 100644 --- a/tobias/footprinting/ATACorrect_functions.py +++ b/tobias/footprinting/ATACorrect_functions.py @@ -29,6 +29,7 @@ from tobias.utils.signals import * from tobias.utils.ngs import * from tobias.utils.utilities import * +from tobias.utils.logger import * #Catch warnings from curve_fit import warnings @@ -56,53 +57,8 @@ def join(self, obj): self.no_reads += obj.no_reads -#################################################################################################### -#################################################################################################### - -def run_parallel(FUNC, input_chunks, arguments, n_cores, logger): - """ - #FUNC is the function to run - #input_chunks is the input to loop over - #arguments are arguments to func - #logger is a Logging.Logger object - """ - - no_chunks = len(input_chunks) - - if n_cores > 1: - - #Send jobs to pool - pool = mp.Pool(processes=n_cores) - task_list = [] - for input_chunk in input_chunks: - task_list.append(pool.apply_async(FUNC, args=[input_chunk] + arguments)) - pool.close() #done sending jobs to pool - - #Wait for tasks to finish - count = -1 - finished = sum([task.ready() for task in task_list]) - while finished < no_chunks: - finished 
= sum([task.ready() for task in task_list]) - if count != finished: - logger.info("Progress: {0:.0f}%".format(finished/float(no_chunks)*100)) - count = finished - else: - time.sleep(0.5) - pool.join() - - #Get results from processes - output_list = [task.get() for task in task_list] - - else: - - output_list = [] - for count, input_chunk in enumerate(input_chunks): - logger.info("Progress: {0:.0f}%".format(count/float(no_chunks)*100)) - output_list.append(FUNC(input_chunk, *arguments)) - - return(output_list) - #--------------------------------------------------------------------------------------------------# + def count_reads(regions_list, params): """ Count reads from bam within regions (counts position of cutsite to prevent double-counting) """ @@ -111,11 +67,11 @@ def count_reads(regions_list, params): bam_obj = pysam.AlignmentFile(bam_f, "rb") log_q = params.log_q - logger = create_mp_logger(params.verbosity, log_q) #sending all logger calls to log_q + logger = TobiasLogger("", params.verbosity, log_q) #sending all logger calls to log_q #Count per region read_count = 0 - logger.debug("Started counting region_chunk ({0} -> {1})".format("_".join([str(element) for element in regions_list[0]]), "_".join([str(element) for element in regions_list[-1]]))) + logger.spam("Started counting region_chunk ({0} -> {1})".format("_".join([str(element) for element in regions_list[0]]), "_".join([str(element) for element in regions_list[-1]]))) for region in regions_list: read_lst = ReadList().from_bam(bam_obj, region) @@ -124,7 +80,7 @@ def count_reads(regions_list, params): if read.cutsite > region.start and read.cutsite < region.end: #only reads within borders read_count += 1 - logger.debug("Finished counting region_chunk ({0} -> {1})".format("_".join([str(element) for element in regions_list[0]]), "_".join([str(element) for element in regions_list[-1]]))) + logger.spam("Finished counting region_chunk ({0} -> {1})".format("_".join([str(element) for element in regions_list[0]]), "_".join([str(element) for element in regions_list[-1]]))) bam_obj.close() return(read_count) @@ -141,8 +97,8 @@ def bias_estimation(regions_list, params): bg_shift = params.bg_shift read_shift = params.read_shift L = 2 * k_flank + 1 - log_q = params.log_q - logger = create_mp_logger(params.verbosity, log_q) #sending all logger calls to log_q + + logger = TobiasLogger("", params.verbosity, params.log_q) #sending all logger calls to log_q #Open objects for reading bam_obj = pysam.AlignmentFile(bam_f, "rb") @@ -151,12 +107,12 @@ def bias_estimation(regions_list, params): bias_obj = AtacBias(L, params.score_mat) - strands = ["forward", "reverse"] if params.split_strands else ["both"] + strands = ["forward", "reverse"] #Estimate bias at each region for region in regions_list: - read_lst = ReadList().from_bam(bam_obj, region) #fragment_lst.to_read_list() #Fragments to single read list + read_lst = ReadList().from_bam(bam_obj, region) for read in read_lst: read.get_cutsite(read_shift) @@ -170,7 +126,7 @@ def bias_estimation(regions_list, params): #Split reads forward/reverse for_lst, rev_lst = read_lst.split_strands() - read_lst_strand = {"both": read_lst, "forward": for_lst, "reverse": rev_lst} + read_lst_strand = {"forward": for_lst, "reverse": rev_lst} for strand in strands: @@ -179,11 +135,6 @@ def bias_estimation(regions_list, params): for read in read_lst_strand[strand]: read_per_pos[read.cutsite] = read_per_pos.get(read.cutsite, []) + [read] - #Set all reads to forward if "both" - if strand == "both": - for cutsite in 
read_per_pos: - read_per_pos[cutsite][0].is_reverse = False - for cutsite in read_per_pos: if cutsite > region.start and cutsite < region.end: #only reads within borders read = read_per_pos[cutsite][0] #use first read in list to establish kmer @@ -192,7 +143,7 @@ def bias_estimation(regions_list, params): read.get_kmer(sequence_obj, k_flank) bias_obj.bias[strand].add_sequence(read.kmer, no_cut) - read.shift_cutsite(bg_shift) + read.shift_cutsite(-bg_shift) #upstream of read; ensures that bg is not within fragment read.get_kmer(sequence_obj, k_flank) bias_obj.bias[strand].add_background(read.kmer, no_cut) @@ -205,17 +156,16 @@ def bias_estimation(regions_list, params): #--------------------------------------------------------------------------------------------------# def relu(x, a, b): + """ a and b are components of a linear curve (y=a*x+b) """ y = np.maximum(0.0, a*x + b) return(y) -def x_thresh(a,b): - x = (0-b)/a #x position of y=0 intersect - return(x) - #--------------------------------------------------------------------------------------------------# def bias_correction(regions_list, params, bias_obj): """ Corrects bias in cutsites (from bamfile) using estimated bias """ + logger = TobiasLogger("", params.verbosity, params.log_q) + bam_f = params.bam fasta_f = params.genome k_flank = params.k_flank @@ -223,11 +173,11 @@ def bias_correction(regions_list, params, bias_obj): L = 2 * k_flank + 1 w = params.window f = int(w/2.0) + qs = params.qs f_extend = k_flank + f - strands = ["forward", "reverse"] if params.split_strands else ["both"] - + strands = ["forward", "reverse"] pre_bias = {strand: SequenceMatrix.create(L, "PWM") for strand in strands} post_bias = {strand: SequenceMatrix.create(L, "PWM") for strand in strands} @@ -247,6 +197,7 @@ def bias_correction(regions_list, params, bias_obj): reg_key = (region_obj.chrom, region_obj.start+f_extend, region_obj.end-f_extend) #output region out_signals[reg_key] = {"uncorrected":{}, "bias":{}, "expected":{}, "corrected":{}} + ################################ ####### Uncorrected reads ###### ################################ @@ -259,16 +210,12 @@ def bias_correction(regions_list, params, bias_obj): #Exclude reads with cutsites outside region read_lst = ReadList([read for read in read_lst if read.cutsite > region_obj.start and read.cutsite < region_obj.end]) for_lst, rev_lst = read_lst.split_strands() - read_lst_strand = {"both":read_lst, "forward": for_lst, "reverse": rev_lst} + read_lst_strand = {"forward": for_lst, "reverse": rev_lst} for strand in strands: out_signals[reg_key]["uncorrected"][strand] = read_lst_strand[strand].signal(region_obj) - out_signals[reg_key]["uncorrected"][strand] *= bias_obj.correction_factor #Correction factor - #out_signals[reg_key]["uncorrected"][strand] = np.log2(out_signals[reg_key]["uncorrected"][strand] + 1) + out_signals[reg_key]["uncorrected"][strand] = np.round(out_signals[reg_key]["uncorrected"][strand], 5) - - del read_lst_strand - del for_lst ################################ ###### Estimation of bias ###### @@ -279,7 +226,7 @@ def bias_correction(regions_list, params, bias_obj): #Score sequence using forward/reverse motifs for strand in strands: - if strand == "forward" or strand == "both": + if strand == "forward": seq = sequence_obj.sequence bias = bias_obj.bias[strand].score_sequence(seq) elif strand == "reverse": @@ -287,7 +234,7 @@ def bias_correction(regions_list, params, bias_obj): bias = bias_obj.bias[strand].score_sequence(seq)[::-1] #3'-5' out_signals[reg_key]["bias"][strand] = 
np.nan_to_num(bias) #convert any nans to 0 - + ################################# ###### Correction of reads ###### @@ -351,52 +298,67 @@ def bias_correction(regions_list, params, bias_obj): out_signals[reg_key]["expected"][strand] = signal_sum * bias_probas ######## Correct signal ######## + out_signals[reg_key]["uncorrected"][strand] *= bias_obj.correction_factor + out_signals[reg_key]["expected"][strand] *= bias_obj.correction_factor out_signals[reg_key]["corrected"][strand] = out_signals[reg_key]["uncorrected"][strand] - out_signals[reg_key]["expected"][strand] - ##### Rescale back to original sum ##### - pos_corrected = np.abs(out_signals[reg_key]["corrected"][strand]) - corrected_sum = fast_rolling_math(pos_corrected, w, "sum") - corrected_sum[np.isclose(corrected_sum, 0)] = np.nan - - frac = signal_sum / corrected_sum #If corrected sum was smaller than signal -> multiply - frac[np.isnan(frac)] = 1 - - out_signals[reg_key]["corrected"][strand] = out_signals[reg_key]["corrected"][strand] * frac - - ####################################### ######## Verify correction ######## ####################################### #Verify correction across all reads - for direction in strands: + for strand in strands: for idx in range(k_flank,reg_len - k_flank -1): if idx > k_flank and idx < reg_len-k_flank: - orig = out_signals[reg_key]["uncorrected"][direction][idx] - correct = out_signals[reg_key]["corrected"][direction][idx] + orig = out_signals[reg_key]["uncorrected"][strand][idx] + correct = out_signals[reg_key]["corrected"][strand][idx] - if direction == "forward" or direction == "both": - kmer = sequence_obj.sequence[idx-k_flank:idx+k_flank+1] - else: - kmer = sequence_obj.revcomp[reg_len-idx-k_flank-1:reg_len-idx+k_flank] + if orig != 0 or correct != 0: #if both are 0, don't add to pre/post bias + if strand == "forward": + kmer = sequence_obj.sequence[idx-k_flank:idx+k_flank+1] + else: + kmer = sequence_obj.revcomp[reg_len-idx-k_flank-1:reg_len-idx+k_flank] - #Save kmer for bias correction verification - pre_bias[direction].add_sequence(kmer, orig) - post_bias[direction].add_sequence(kmer, correct) + #Save kmer for bias correction verification + pre_bias[strand].add_sequence(kmer, orig) + post_bias[strand].add_sequence(kmer, correct) + + + ####################################### + ######## Write to queue ######### + ####################################### #Set size back to original for track in out_signals[reg_key]: - for direction in out_signals[reg_key][track]: - out_signals[reg_key][track][direction] = out_signals[reg_key][track][direction][f_extend:-f_extend] + for strand in out_signals[reg_key][track]: + out_signals[reg_key][track][strand] = out_signals[reg_key][track][strand][f_extend:-f_extend] + + #Calculate "both" if split_strands == False + if params.split_strands == False: + for track in out_signals[reg_key]: + out_signals[reg_key][track]["both"] = out_signals[reg_key][track]["forward"] + out_signals[reg_key][track]["reverse"] + + #Send to queue + strands_to_write = ["forward", "reverse"] if params.split_strands == True else ["both"] + for track in out_signals[reg_key]: + + #Send to writer per strand + for strand in strands_to_write: + key = "{0}:{1}".format(track, strand) + logger.spam("Sending {0} signal from region {1} to writer queue".format(key, reg_key)) + qs[key].put((key, reg_key, out_signals[reg_key][track][strand])) + + #Sent to qs - delete from this process + out_signals[reg_key] = None bam_obj.close() fasta_obj.close() gc.collect() - return(out_signals, [pre_bias, 
post_bias]) + return([pre_bias, post_bias]) diff --git a/tobias/footprinting/BINDetect.py b/tobias/footprinting/BINDetect.py index 65f8d00..722cb20 100644 --- a/tobias/footprinting/BINDetect.py +++ b/tobias/footprinting/BINDetect.py @@ -24,8 +24,6 @@ #Machine learning import sklearn from sklearn import mixture -#from sklearn.neighbors import KernelDensity -#from sklearn.model_selection import GridSearchCV import scipy from scipy.optimize import curve_fit @@ -44,7 +42,8 @@ from tobias.utils.regions import * from tobias.utils.sequences import * from tobias.utils.motifs import * -from tobias.plotting.plot_bindetect import * +from tobias.utils.logger import * +#from tobias.plotting.plot_bindetect import * #np.seterr(divide = 'ignore') @@ -63,7 +62,7 @@ def add_bindetect_arguments(parser): description += "The underlying method is a modified motif enrichment test to see which motifs have the largest differences in signal across input conditions. " description += "The output is an in-depth overview of global changes as well as the individual binding site signal-differences.\n\n" description += "Usage:\nTOBIAS BINDetect --signals ( (...)) --motifs --genome --peaks \n\n" - description += "Output files:\n- /bindetect_figures.pdf\n- /bindetect_results.{txt,xlsx}\n- /TF_distance_matrix.txt\n" + description += "Output files:\n- /_figures.pdf\n- /_results.{txt,xlsx}\n- /_distances.txt\n" description += "- //_overview.{txt,xlsx} (per motif)\n- //beds/_all.bed (per motif)\n" description += "- //beds/__bound.bed (per motif-condition pair)\n- //beds/__unbound.bed (per motif-condition pair)\n\n" parser.description = format_help_description("BINDetect", description) @@ -79,20 +78,21 @@ def add_bindetect_arguments(parser): optargs = parser.add_argument_group('Optional arguments') optargs.add_argument('--cond_names', metavar="", nargs="*", help="Names of conditions fitting to --signals (default: prefix of --signals)") optargs.add_argument('--peak_header', metavar="", help="File containing the header of --peaks separated by whitespace or newlines (default: peak columns are named \"_additional_\")") - optargs.add_argument('--naming', metavar="", help="Naming convention for TFs ('id', 'name', 'name_id', 'id_name') (default: 'name_id')", choices=["id", "name", "name_id", "id_name"], default="name_id") + #optargs.add_argument('--naming', metavar="", help="Naming convention for TFs ('id', 'name', 'name_id', 'id_name') (default: 'name_id')", choices=["id", "name", "name_id", "id_name"], default="name_id") optargs.add_argument('--motif_pvalue', metavar="", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for motif scanning (default: 1e-4)", default=0.0001) - optargs.add_argument('--bound_threshold', metavar="", type=lambda x: restricted_float(x, 0, 1), help="Float between 0 (strict) -> 1 (loose) indicating strictness of bound/unbound split (default: 0.4)", default=0.4) + optargs.add_argument('--bound_pvalue', metavar="", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001) optargs.add_argument('--pseudo', type=float, metavar="", help="Pseudocount for calculating log2fcs (default: estimated from data)", default=None) - #optargs.add_argument('--no_overwrite', help=argparse.SUPPRESS, action='store_true') #when set, already calculated all-files are not overwritten + optargs.add_argument('--time_series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) 
in order of input, and skip all-against-all comparison.") runargs = parser.add_argument_group("Run arguments") runargs.add_argument('--outdir', metavar="", help="Output directory to place TFBS/plots in (default: bindetect_output)", default="bindetect_output") + optargs.add_argument('--prefix', metavar="", help="Prefix for overview files in --outdir folder (default: bindetect)", default="bindetect") runargs.add_argument('--cores', metavar="", type=int, help="Number of cores to use for computation (default: 1)", default=1) runargs.add_argument('--split', metavar="", type=int, help="Split of multiprocessing jobs (default: 100)", default=100) - runargs.add_argument('--verbosity', metavar="", type=int, help="Level of output logging (1 (sparse) / 2 (normal) / 3 (debug)) (default: 2)", choices=[1,2,3], default=2) - runargs.add_argument('--log', metavar="", help="Full path of logfile (default: log is printed to stdout)") runargs.add_argument('--debug', help=argparse.SUPPRESS, action='store_true') + runargs = add_logger_args(runargs) + return(parser) @@ -100,16 +100,19 @@ def find_nearest_idx(array, value): idx = (np.abs(array - value)).argmin() return idx +def norm_fit(x, mean, std, scale): + return(scale * scipy.stats.norm.pdf(x, mean, std)) + + #----------------------------------------------------------------------------------------------------------------# def run_bindetect(args): """ Main function to run bindetect algorithm with input files and parameters given in args """ - begin_time = datetime.now() - #Checking input and setting cond_names check_required(args, ["signals", "motifs", "genome", "peaks"]) args.cond_names = [os.path.basename(os.path.splitext(bw)[0]) for bw in args.signals] if args.cond_names is None else args.cond_names args.outdir = os.path.abspath(args.outdir) + args.naming = "name_id" #Set output files states = ["bound", "unbound"] @@ -119,48 +122,38 @@ def run_bindetect(args): outfiles.append(os.path.abspath(os.path.join(args.outdir, "*", "*_overview.txt"))) outfiles.append(os.path.abspath(os.path.join(args.outdir, "*", "*_overview.xlsx"))) - outfiles.append(os.path.abspath(os.path.join(args.outdir, "TF_distance_matrix.txt"))) + outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_distances.txt"))) #outfiles.append(os.path.abspath(os.path.join(args.outdir, "TF_clusters.txt"))) - outfiles.append(os.path.abspath(os.path.join(args.outdir, "bindetect_results.txt"))) - outfiles.append(os.path.abspath(os.path.join(args.outdir, "bindetect_results.xlsx"))) - outfiles.append(os.path.abspath(os.path.join(args.outdir, "bindetect_figures.pdf"))) + outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_results.txt"))) + outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_results.xlsx"))) + outfiles.append(os.path.abspath(os.path.join(args.outdir, args.prefix + "_figures.pdf"))) #-------------------------------------------------------------------------------------------------------------# - #------------------------------------------------- Setup logger ----------------------------------------------# + #-------------------------------------------- Setup logger and pool ------------------------------------------# #-------------------------------------------------------------------------------------------------------------# - logger = create_logger(args.verbosity, args.log) - - logger.comment("#TOBIAS BINDetect (run started {0})\n".format(begin_time)) - logger.comment("#Command line call: {0}\n".format(" ".join(sys.argv))) + 
logger = TobiasLogger("BINDetect", args.verbosity) + logger.begin() parser = add_bindetect_arguments(argparse.ArgumentParser()) - logger.comment(arguments_overview(parser, args)) - - logger.comment("# ----- Output files -----") - for outf in outfiles: - if outf != None: - logger.comment("# {0}".format(outf)) - logger.comment("\n") + logger.arguments_overview(parser, args) + logger.output_files(outfiles) # Setup pool - worker_cores = max(1, args.cores - 1) #max(1, int(args.cores * 0.9)) - writer_cores = 1 #int(args.cores * 0.1)) + writer_cores = max(1, int(args.cores*0.1)) + worker_cores = max(1, args.cores - writer_cores) logger.debug("Worker cores: {0}".format(worker_cores)) logger.debug("Writer cores: {0}".format(writer_cores)) pool = mp.Pool(processes=worker_cores) writer_pool = mp.Pool(processes=writer_cores) - args.no_overwrite = False #leftover from earlier; remove - - #-------------------------------------------------------------------------------------------------------------# #-------------------------- Pre-processing data: Reading motifs, sequences, peaks ----------------------------# #-------------------------------------------------------------------------------------------------------------# - logger.critical("Processing input data") + logger.info("----- Processing input data -----") #Check opening/writing of files logger.info("Checking reading/writing of files") @@ -170,10 +163,15 @@ def run_bindetect(args): #Comparisons between conditions no_conditions = len(args.signals) - comparisons = list(itertools.combinations(args.cond_names, 2)) + if args.time_series: + comparisons = list(zip(args.cond_names[:-1], args.cond_names[1:])) + args.comparisons = comparisons + else: + comparisons = list(itertools.combinations(args.cond_names, 2)) #all-against-all + args.comparisons = comparisons #Open figure pdf and write overview - fig_out = os.path.abspath(os.path.join(args.outdir, "bindetect_figures.pdf")) + fig_out = os.path.abspath(os.path.join(args.outdir, args.prefix + "_figures.pdf")) figure_pdf = PdfPages(fig_out, keep_empty=True) plt.figure() @@ -231,6 +229,7 @@ def run_bindetect(args): fasta_chroms = fasta_obj.references if not set(peak_chroms).issubset(fasta_chroms): + logger.error() sys.exit("ERROR: Chromosome(s) found in peaks ({0}) are not a subset of input FASTA file ({1})".format(peak_chroms, fasta_chroms)) logger.info("Estimating GC content from peak sequences") @@ -250,7 +249,7 @@ def run_bindetect(args): #Check if format of motif file was right, otherwise write out error for motif in motif_list: - rows, cols = np.array(motif.pfm).shape + rows, cols = np.array(motif.counts).shape if rows != 4: sys.exit("ERROR: Motif {0} has an unexpected format and could not be read - please check that the input --motifs file is in either JASPAR/PFM/MEME format.") @@ -265,12 +264,13 @@ def run_bindetect(args): motif_names = [motif.name for motif in motif_list] logger.debug("Getting reverse motifs") motif_list.extend([motif.get_reverse() for motif in motif_list]) + logger.spam(motif_list) for motif in motif_list: #now with reverse motifs as well motif.set_name(args.naming) motif.name = filafy(motif.name) #remove ()/: etc. 
which will create problems in filenames motif.bg = bg - logger.debug("Getting pssm for motif {0}".format(motif.name)) + logger.spam("Getting pssm for motif {0}".format(motif.name)) motif.get_pssm() motif_names = list(set([motif.name for motif in motif_list])) @@ -279,41 +279,12 @@ def run_bindetect(args): logger.debug("Getting match threshold per motif") outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.motif_pvalue])) motif_list = MotifList(outlist) - - - - #-------------------------------------------------------------------------------------------------------------# - #------------ TF names are known -> test whether they are already calculated -> create subdirs ---------------# - #-------------------------------------------------------------------------------------------------------------# - - missing_results = [] - - #Test whether files for these motifs already exist - for TF in motif_names: - overview_file = os.path.join(args.outdir, TF, TF + "_overview.txt") #Overview file - - #Check whether file exist - if not os.path.exists(overview_file): - logger.debug("Did not find any existing results for {0}".format(TF)) - missing_results.append(TF) - else: - logger.debug("Found previous results for {0}".format(TF)) - - #Choose which motifs to predict binding of - if args.no_overwrite == False: #default - motif_list_predict = motif_list # Run bindetect with all input motifs - motif_names_predict = list(set([motif.name for motif in motif_list])) - else: - #Run bindetect only on missing results - motif_list_predict = MotifList([motif_obj for motif_obj in motif_list if motif_obj.name in missing_results]) - motif_names_predict = list(set([motif.name for motif in motif_list_predict])) - logger.info("Prediction run on missing motifs ({0} new motifs).".format(len(missing_results))) - - args.new_motifs = motif_names_predict + for motif in motif_list: + logger.debug("Motif {0}: threshold {1}".format(motif.name, motif.threshold)) logger.info("Creating folder structure for each TF") for TF in motif_names: - logger.debug("Creating directories for {0}".format(TF)) + logger.spam("Creating directories for {0}".format(TF)) make_directory(os.path.join(args.outdir, TF)) make_directory(os.path.join(args.outdir, TF, "beds")) make_directory(os.path.join(args.outdir, TF, "plots")) @@ -324,21 +295,21 @@ def run_bindetect(args): #-------------------------------------------------------------------------------------------------------------# logger.comment("") + + logger.start_logger_queue() #start process for listening and handling through the main logger queue + args.log_q = logger.queue #queue for multiprocessing logging + manager = mp.Manager() - - logger.debug("Setting up listener for log") - log_q = mp.Manager().Queue() - listener = mp.Process(target=main_logger_process, args=(log_q, logger)) - listener.start() - logger.info("Scanning for motifs and matching to signals...") - #Create writer queues for bed-file output + #Create writer queues for bed-file output logger.debug("Setting up writer queues") qs_list = [] - qs = {} - #finished = [] - TF_names_chunks = [motif_names_predict[i::writer_cores] for i in range(writer_cores)] + writer_qs = {} + + #create_writer_queue(key2file, writer_cores) + manager = mp.Manager() + TF_names_chunks = [motif_names[i::writer_cores] for i in range(writer_cores)] for TF_names_sub in TF_names_chunks: logger.debug("Creating writer queue for {0}".format(TF_names_sub)) files = [os.path.join(args.outdir, TF, "beds", TF + ".tmp") for TF in TF_names_sub] @@ 
-348,34 +319,27 @@ def run_bindetect(args): writer_pool.apply_async(file_writer, args=(q, dict(zip(TF_names_sub,files)), args)) #, callback = lambda x: finished.append(x) print("Writing time: {0}".format(x))) for TF in TF_names_sub: - qs[TF] = q + writer_qs[TF] = q writer_pool.close() #no more jobs applied to writer_pool + #todo: use run_parallel #Start working on data if worker_cores == 1: - logger.info("Running with cores = 1") + logger.debug("Running with cores = 1") results = [] for chunk in peak_chunks: - results.append(scan_and_score(chunk, motif_list, args, log_q, qs)) + results.append(scan_and_score(chunk, motif_list, args, args.log_q, writer_qs)) else: logger.debug("Sending jobs to worker pool") - task_list = [pool.apply_async(scan_and_score, (chunk, motif_list, args, log_q, qs, )) for chunk in peak_chunks] + task_list = [pool.apply_async(scan_and_score, (chunk, motif_list, args, args.log_q, writer_qs, )) for chunk in peak_chunks] monitor_progress(task_list, logger) results = [task.get() for task in task_list] logger.info("Done scanning for TFBS across regions!") - #---------------------------------------# - logger.debug("Waiting for listener to finish") - log_q.put(None) - while listener.exitcode != 0: - logger.debug("Listener exitcode is: {0}".format(listener.exitcode)) - time.sleep(1) - - logger.debug("Joining listener") - listener.join() + logger.stop_logger_queue() #stop the listening process (wait until all was written) #--------------------------------------# logger.info("Waiting for bedfiles to write") @@ -398,11 +362,8 @@ def run_bindetect(args): #-------------------------------------------------------------------------------------------------------------# logger.info("Merging results from subsets") - background = {} - TF_overlaps = {} - for result in results: - merge_dicts(background, result[0]) - merge_dicts(TF_overlaps, result[1]) + background = merge_dicts([result[0] for result in results]) + TF_overlaps = merge_dicts([result[1] for result in results]) results = None for bigwig in args.cond_names: @@ -415,34 +376,39 @@ def run_bindetect(args): args.thresholds = {} pseudos = [] figures = [] #save figures before saving to file to unify x-ranges - accessible = {} + gmms = {} + all_log_params = {} + for bigwig in args.cond_names: #Prepare scores (remove 0's etc.) 
bg_values = np.copy(background["signal"][bigwig]) logger.debug("{0} scores for bigwig {1}".format(len(bg_values), bigwig)) - bg_values = bg_values[np.logical_not(np.isclose(bg_values, 0.0))] + bg_values = bg_values[np.logical_not(np.isclose(bg_values, 0.0))] #only non-zero counts x_max = np.percentile(bg_values, [99]) + bg_values = bg_values[bg_values < x_max] #Fit mixture of normals lowest_bic = np.inf - for n_components in range(1,3): #1/2 components + for n_components in [2]: #2 components gmm = sklearn.mixture.GaussianMixture(n_components=n_components, random_state=1) gmm.fit(np.log(bg_values).reshape(-1, 1)) - + bic = gmm.bic(np.log(bg_values).reshape(-1,1)) logger.debug("n_compontents: {0} | bic: {1}".format(n_components, bic)) if bic < lowest_bic: lowest_bic = bic best_gmm = gmm gmm = best_gmm + gmms[bigwig] = best_gmm #Extract most-right gaussian means = gmm.means_.flatten() sds = np.sqrt(gmm.covariances_).flatten() - chosen_i = np.argmax(means) #Mixture with largest mean + chosen_i = np.argmax(means) #Mixture with largest mean log_params = scipy.stats.lognorm.fit(bg_values[bg_values < x_max], f0=sds[chosen_i], fscale=np.exp(means[chosen_i])) + all_log_params[bigwig] = log_params #Plot mixture if args.debug: @@ -452,29 +418,46 @@ def run_bindetect(args): for i in range(2): pdf = scipy.stats.norm.pdf(x, means[i], sds[i]) plt.plot(x, pdf) - + logprob = gmm.score_samples(x.reshape(-1, 1)) df = np.exp(logprob) plt.plot(x, df) plt.show() - #Estimate threshold and pseudocount - threshold = round(scipy.stats.lognorm.ppf(1-args.bound_threshold, *log_params), 5) - args.thresholds[bigwig] = threshold - logger.info("- Threshold for condition {0} estimated at: {1}".format(bigwig, threshold)) - #Mode of distribution mode = scipy.optimize.fmin(lambda x: -scipy.stats.lognorm.pdf(x, *log_params), 0, disp=False)[0] + logger.debug("- Mode for condition {0} estimated at: {1}".format(bigwig, mode)) pseudo = mode / 2.0 #pseudo is half the mode pseudos.append(pseudo) + # Estimate theoretical normal for threshold + leftside_x = np.linspace(scipy.stats.lognorm(*log_params).ppf([0.01]), mode, 100) + leftside_pdf = scipy.stats.lognorm.pdf(leftside_x, *log_params) + + #Flip over + mirrored_x = np.concatenate([leftside_x, np.max(leftside_x) + leftside_x]) + mirrored_pdf = np.concatenate([leftside_pdf, leftside_pdf[::-1]]) + + popt, cov = curve_fit(lambda x, std, sc: sc * scipy.stats.norm.pdf(x, mode, std), mirrored_x, mirrored_pdf) + norm_params = (mode, popt[0]) + logger.debug("Theoretical normal parameters: {0}".format(norm_params)) + + #Set threshold for bound/unbound + threshold = round(scipy.stats.norm.ppf(1-args.bound_pvalue, *norm_params), 5) + args.thresholds[bigwig] = threshold + logger.stats("- Threshold for condition {0} estimated at: {1}".format(bigwig, threshold)) + #Plot fit fig, ax = plt.subplots(1, 1) ax.hist(bg_values[bg_values < x_max], bins='auto', density=True, label="Observed score distribution") xvals = np.linspace(0, x_max, 1000) - probas = scipy.stats.lognorm.pdf(xvals, *log_params) - ax.plot(xvals, probas, label="Log-normal fit", color="orange") + log_probas = scipy.stats.lognorm.pdf(xvals, *log_params) + ax.plot(xvals, log_probas, label="Log-normal fit", color="orange") + + #Theoretical normal + norm_probas = scipy.stats.norm.pdf(xvals, *norm_params) + ax.plot(xvals, norm_probas * (np.max(log_probas) / np.max(norm_probas)), color="grey", linestyle="--", label="Theoretical normal") ax.axvline(threshold, color="black", label="Bound/unbound threshold") ymax = plt.ylim()[1] @@ -501,8 
+484,7 @@ def run_bindetect(args): args.pseudo = np.mean(pseudo) logger.info("Pseudocount estimated at: {0}".format(round(args.pseudo, 5))) - - + ############ Foldchanges between conditions ################ logger.comment("") log2fc_params = {} @@ -512,92 +494,47 @@ def run_bindetect(args): for (bigwig1, bigwig2) in comparisons: #cond1, cond2 logger.info("- {0} / {1}".format(bigwig1, bigwig2)) + #Estimate background log2fc scores1 = np.copy(background["signal"][bigwig1]) scores2 = np.copy(background["signal"][bigwig2]) - gcs = np.copy(np.array(background["gc"])) - - #Exclude positions where scores1 = scores2 = 0 - included = np.logical_not(np.logical_and(np.isclose(scores1, 0), np.isclose(scores2, 0))) - scores1, scores2, gcs = scores1[included], scores2[included], gcs[included] + + included = np.logical_or(scores1 > 0, scores2 > 0) + scores1 = scores1[included] + scores2 = scores2[included] + #Calculate background log2fc normal disitribution log2fcs = np.log2(np.true_divide(scores1 + args.pseudo, scores2 + args.pseudo)) - values = np.vstack([log2fcs, gcs]) - - if values.shape[1] == 0: - sys.exit("ERROR: Bigwig values of conditions {0} and {1} are equal or contain only zeroes - please check your input data.".format(bigwig1, bigwig2)) - - #Fit mixture to describe relationship between GC/log2fc - lowest_bic = np.inf - n_components_range = range(1,10) - for n_components in n_components_range: - gmm = sklearn.mixture.GaussianMixture(n_components=n_components, covariance_type="full", random_state=1) - gmm.fit(values.T) - bic = gmm.bic(values.T) - - logger.debug("n_compontents: {0} | bic: {1}".format(n_components, bic)) - if bic < lowest_bic: - lowest_bic = bic - best_gmm = gmm - - log2fc_params[(bigwig1, bigwig2)] = best_gmm - - #Create plot - nullfmt = NullFormatter() - xmin, xmax = np.percentile(log2fcs, [1,99]) - ymin, ymax = np.percentile(gcs, [1,99]) - # definitions for the axes - left, width = 0.1, 0.65 - bottom, height = 0.1, 0.65 - bottom_h = left_h = left + width + 0.02 - - rect_scatter = [left, bottom, width, height] - rect_histx = [left, bottom_h, width, 0.2] - rect_histy = [left_h, bottom, 0.2, height] - - #Initialize figure - plt.figure(1, figsize=(8, 8)) - axScatter = plt.axes(rect_scatter) - axHistx = plt.axes(rect_histx) - axHisty = plt.axes(rect_histy) - - #Gaussian mixture - X, Y = np.meshgrid(np.linspace(xmin,xmax,100), np.linspace(ymin,ymax,100)) - positions = np.array([X.ravel(), Y.ravel()]).T - Z = np.exp(best_gmm.score_samples(positions)) - Z = np.reshape(Z, X.shape) - - zmax = np.percentile(Z, [99]) - axScatter.contourf(X, Y, Z, 20, cmap=plt.cm.viridis) - #axScatter.scatter(log2fcs, gcs, s=0.5, alpha=0.5) - - #Histograms - axHistx.hist(log2fcs[np.logical_and(xmin < log2fcs, log2fcs < xmax)], bins='auto', density=True, color="darkslateblue") - axHisty.hist(gcs[np.logical_and(ymin < gcs, gcs < ymax)], bins='auto', orientation='horizontal', density=True, color="darkslateblue") - axHistx.set_xlim(axScatter.get_xlim()) - axHisty.set_ylim(axScatter.get_ylim()) - axHisty.yaxis.set_major_formatter(nullfmt) - axHistx.xaxis.set_major_formatter(nullfmt) - axHistx.yaxis.set_major_formatter(nullfmt) - axHistx.xaxis.set_major_formatter(nullfmt) - - #Decorate - axScatter.set_xlabel("Log2 fold change") - axScatter.set_ylabel("GC content") - axHistx.set_title("Background log2FCs ({0} / {1})".format(bigwig1, bigwig2)) - - figure_pdf.savefig(bbox_inches='tight') + lower, upper = np.percentile(log2fcs, [1,99]) + log2fcs_fit = log2fcs[np.logical_and(log2fcs >= lower, log2fcs <= upper)] + + 
norm_params = scipy.stats.norm.fit(log2fcs_fit) + + logger.debug("({0} / {1}) Background log2fc normal distribution: {2}".format(bigwig1, bigwig2, norm_params)) + log2fc_params[(bigwig1, bigwig2)] = norm_params + + #Plot background log2fc to figures + fig, ax = plt.subplots(1, 1) + plt.hist(log2fcs, density=True, bins='auto', label="Background log2fc ({0} / {1})".format(bigwig1, bigwig2)) + + xvals = np.linspace(plt.xlim()[0], plt.xlim()[1], 100) + pdf = scipy.stats.norm.pdf(xvals, *log2fc_params[(bigwig1, bigwig2)]) + plt.plot(xvals, pdf, label="Normal distribution fit") + plt.title("Background log2FCs ({0} / {1})".format(bigwig1, bigwig2)) + + plt.xlabel("Log2 fold change") + plt.ylabel("Density") + figure_pdf.savefig(fig, bbox_inches='tight') plt.close() background = None #free up space - #-------------------------------------------------------------------------------------------------------------# #----------------------------- Read total sites per TF to estimate bound/unbound -----------------------------# #-------------------------------------------------------------------------------------------------------------# logger.comment("") - logger.critical("Processing scanned TFBS individually") + logger.info("Processing scanned TFBS individually") #Getting bindetect table ready info_columns = ["total_tfbs"] @@ -611,37 +548,50 @@ def run_bindetect(args): #Starting calculations results = [] if args.cores == 1: - for name in motif_names_predict: + for name in motif_names: logger.info("- {0}".format(name)) results.append(process_tfbs(name, args, log2fc_params)) else: - task_list = [pool.apply_async(process_tfbs, (name, args, log2fc_params)) for name in motif_names_predict] + task_list = [pool.apply_async(process_tfbs, (name, args, log2fc_params)) for name in motif_names] monitor_progress(task_list, logger) #will not exit before all jobs are done results = [task.get() for task in task_list] logger.info("Concatenating results from subsets") info_table = pd.concat(results) #pandas tables - #index_names = info_table.index pool.terminate() pool.join() + #-------------------------------------------------------------------------------------------------------------# + #------------------------------------------------ Cluster TFBS -----------------------------------------------# + #-------------------------------------------------------------------------------------------------------------# + + clustering = RegionCluster(TF_overlaps) + clustering.cluster() + + #Write out distance matrix + matrix_out = os.path.join(args.outdir, args.prefix + "_distances.txt") + clustering.write_distance_mat(matrix_out) + #-------------------------------------------------------------------------------------------------------------# #----------------------------------------- Write all_bindetect file ------------------------------------------# #-------------------------------------------------------------------------------------------------------------# logger.comment("") logger.info("Writing all_bindetect files") - + #Condition specific info_table["total_tfbs"] = info_table["total_tfbs"].map(int) for condition in args.cond_names: info_table[condition + "_bound"] = info_table[condition + "_bound"].map(int) - #info_table[condition + "_threshold"] = info_table[condition + "_threshold"].round(5) - + + #Add cluster to info_table + cluster_names = [clustering.name2cluster[name] for name in info_table.index] + info_table.insert(0,"cluster", cluster_names) + #### Write excel ### - bindetect_excel = os.path.join(args.outdir, 
"bindetect_results.xlsx") + bindetect_excel = os.path.join(args.outdir, args.prefix + "_results.xlsx") writer = pd.ExcelWriter(bindetect_excel, engine='xlsxwriter') info_table.to_excel(writer) @@ -657,8 +607,9 @@ def run_bindetect(args): info_table[base + "_pvalue"] = info_table[base + "_pvalue"].map("{:.5E}".format) #Write bindetect results tables - bindetect_out = os.path.join(args.outdir, "bindetect_results.txt") info_table.insert(0, "TF_name", info_table.index) #Set index as first column + + bindetect_out = os.path.join(args.outdir, args.prefix + "_results.txt") info_table.to_csv(bindetect_out, sep="\t", index=False, header=True, na_rep="NA") @@ -669,30 +620,21 @@ def run_bindetect(args): if no_conditions > 1: logger.info("Creating BINDetect plot(s)") - #Create distance matrix - distance_matrix, names = overlap_to_distance(TF_overlaps) - matrix_out = os.path.join(args.outdir, "TF_distance_matrix.txt") - np.savetxt(matrix_out, distance_matrix, delimiter="\t", header="\t".join(names), fmt="%.4f") - - #Cluster distance matrix - - #Test index names against names - #index_names = info_table.index - #for name in names: - # if name not in index_names: - # logger.info("{0} not in index".format(name)) - #Plotting bindetect per comparison for (cond1, cond2) in comparisons: logger.info("- {0} / {1}".format(cond1, cond2)) - base = cond1 + "_" + cond2 - changes = [float(info_table.at[name, base + "_change"]) for name in names] - pvalues = [float(info_table.at[name, base + "_pvalue"]) for name in names] - #Diffbind plot - fig = plot_bindetect(names, distance_matrix, changes, pvalues, [cond1, cond2], change_threshold=0.2) + #Make copy of motifs and fill in with metadata + comparison_motifs = copy.deepcopy(motif_list) + for motif in comparison_motifs: + name = motif.name + motif.change = float(info_table.at[name, base + "_change"]) + motif.pvalue = float(info_table.at[name, base + "_pvalue"]) + + #bindetect plot + fig = plot_bindetect(comparison_motifs, clustering, [cond1, cond2], args) figure_pdf.savefig(fig, bbox_inches='tight') @@ -701,11 +643,7 @@ def run_bindetect(args): #-------------------------------------------------------------------------------------------------------------# figure_pdf.close() - - end_time = datetime.now() - logger.comment("") - logger.info("Finished BINDetect run (time elapsed: {0}). 
Results are found in: {1}".format(end_time - begin_time, args.outdir)) - + logger.end() #--------------------------------------------------------------------------------------------------------# diff --git a/tobias/footprinting/BINDetect_functions.py b/tobias/footprinting/BINDetect_functions.py index e756099..1ecbf53 100644 --- a/tobias/footprinting/BINDetect_functions.py +++ b/tobias/footprinting/BINDetect_functions.py @@ -18,8 +18,14 @@ import random #Plotting +import matplotlib import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec from matplotlib.backends.backend_pdf import PdfPages +from matplotlib.ticker import NullFormatter +from cycler import cycler +from matplotlib.lines import Line2D +from adjustText import adjust_text #Bio-specific packages import pyBigWig @@ -36,6 +42,7 @@ from tobias.utils.signals import * from tobias.plotting.plot_bindetect import * +#------------------------------------------------------------------------------------------------------# #------------------------------------------------------------------------------------------------------# #------------------------------------------------------------------------------------------------------# @@ -55,46 +62,6 @@ def get_gc_content(regions, fasta): return(gc_content) -def find_nearest_idx(array, value): - idx = np.abs(array - value).argmin() - return(idx) - -""" -def is_nan(x): - return (x is np.nan or x != x) -""" - -### Logger runs in main process -def main_logger_process(queue, logger): - - logger.debug("Started main logger process") - while True: - try: - record = queue.get() - if record is None: - break - logger.handle(record) - - except Exception: - import sys, traceback - print('Problem in main logger process:', file=sys.stderr) - traceback.print_exc(file=sys.stderr) - break - - return(1) - -""" -def norm_fit(x, loc, scale, size): - size scales the height of the pdf-curve - return(scipy.stats.norm.pdf(x, loc, scale) * size) - - -def lognorm_fit(x, s, loc, scale, size): - size scales the height of the pdf-curve - - y = scipy.stats.lognorm.pdf(x, s, loc=loc, scale=scale) * size - return(y) -""" #---------------------------------------------------------------------------------------------------------# #------------------------------------------- Main functions ----------------------------------------------# @@ -103,12 +70,11 @@ def lognorm_fit(x, s, loc, scale, size): def scan_and_score(regions, motifs_obj, args, log_q, qs): """ Scanning and scoring runs in parallel for subsets of regions """ - logger = create_mp_logger(args.verbosity, log_q) #sending all logger calls to log_q + logger = TobiasLogger("", args.verbosity, log_q) #sending all logger calls to log_q - logger.debug("Setting scanner") + logger.debug("Setting up scanner/bigwigs/fasta") motifs_obj.setup_moods_scanner() #MotifList object - logger.debug("Opening bigwigs and fasta") pybw = {condition:pyBigWig.open(args.signals[i], "rb") for i, condition in enumerate(args.cond_names)} fasta_obj = pysam.FastaFile(args.genome) chrom_boundaries = dict(zip(fasta_obj.references, fasta_obj.lengths)) @@ -122,9 +88,10 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs): #TODO: Estimate number of background positions sampled to pre-allocate space ######## Scan for motifs in each region ###### + logger.debug("Scanning for motif occurrences") all_TFBS = {TF: RegionList() for TF in motifs_obj.names} # Dict for saving sites before writing for i, region in enumerate(regions): - logger.debug("Processing region: {0}".format(region.tup())) + 
logger.spam("Processing region: {0}".format(region.tup())) extra_columns = region @@ -132,7 +99,7 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs): reglen = region.get_length() random.seed(reglen) #Each region is processed identifically regardless of order in file rand_positions = random.sample(range(reglen), max(1,int(reglen/rand_window))) #theoretically one in every 500 bp - logger.debug("Random indices: {0} for region length {1}".format(rand_positions, reglen)) + logger.spam("Random indices: {0} for region length {1}".format(rand_positions, reglen)) #Read footprints in region footprints = {} @@ -141,7 +108,7 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs): footprints[condition] = pybw[condition].values(region.chrom, region.start, region.end, numpy=True) footprints[condition] = np.nan_to_num(footprints[condition]) #nan to 0 except: - logger.critical("ERROR reading footprints from region: {0}".format(region)) + logger.error("Error reading footprints from region: {0}".format(region)) continue #Read random positions for background @@ -192,10 +159,9 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs): all_TFBS[name] = all_TFBS[name].resolve_overlaps() no_sites = len(all_TFBS[name]) - if name in args.new_motifs: # Only write if name was in the requested motifs - logger.debug("Sending {0} sites from {1} to bed-writer queue".format(no_sites, name)) - bed_content = all_TFBS[name].as_bed() #string - qs[name].put((name, bed_content)) + logger.spam("Sending {0} sites from {1} to bed-writer queue".format(no_sites, name)) + bed_content = all_TFBS[name].as_bed() #string + qs[name].put((name, bed_content)) global_TFBS.extend(all_TFBS[name]) all_TFBS[name] = [] @@ -207,7 +173,8 @@ def scan_and_score(regions, motifs_obj, args, log_q, qs): for bigwig_f in pybw: pybw[bigwig_f].close() - end_time = datetime.now() + logger.stop() + logger.total_time return(background_signal, overlap) @@ -217,15 +184,17 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf """ Processes single TFBS to split into bound/unbound and write out overview file """ begin_time = datetime.now() + logger = TobiasLogger("", args.verbosity, args.log_q) #sending all logger calls to log_q + #pre-scanned sites to read bed_outdir = os.path.join(args.outdir, TF_name, "beds") filename = os.path.join(bed_outdir, TF_name + ".tmp") no_cond = len(args.cond_names) - comparisons = list(itertools.combinations(args.cond_names, 2)) + comparisons = args.comparisons #Get info table ready info_columns = ["total_tfbs"] - info_columns.extend(["{0}_{1}".format(cond, metric) for (cond, metric) in itertools.product(args.cond_names, ["bound"])]) + info_columns.extend(["{0}_{1}".format(cond, metric) for (cond, metric) in itertools.product(args.cond_names, ["mean_score", "bound"])]) info_columns.extend(["{0}_{1}_{2}".format(comparison[0], comparison[1], metric) for (comparison, metric) in itertools.product(comparisons, ["change", "pvalue"])]) rows, cols = 1, len(info_columns) info_table = pd.DataFrame(np.zeros((rows, cols)), columns=info_columns, index=[TF_name]) @@ -257,7 +226,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf for condition in args.cond_names: bed_table[condition + "_score"] = bed_table[condition + "_score"].round(5) - #### Write all file #### + #### Write _all file #### chosen_columns = [col for col in header if col != "GC"] outfile = os.path.join(bed_outdir, TF_name + "_all.bed") bed_table.to_csv(outfile, sep="\t", index=False, header=False, columns=chosen_columns) @@ -267,6 +236,8 @@ def 
process_tfbs(TF_name, args, log2fc_params): #per tf threshold = args.thresholds[condition] bed_table[condition + "_bound"] = np.where(bed_table[condition + "_score"] > threshold, 1, 0).astype(int) + + info_table.at[TF_name, condition + "_mean_score"] = round(np.mean(bed_table[condition + "_score"]), 5) info_table.at[TF_name, condition + "_bound"] = np.sum(bed_table[condition + "_bound"].values) #_bound contains bool 0/1 #Write bound/unbound @@ -283,6 +254,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf chosen_columns = header[:-no_cond-1] + [condition + "_score"] bed_table_subset.to_csv(outfile, sep="\t", index=False, header=False, columns=chosen_columns) + #### Calculate statistical test in comparison to background #### fig_out = os.path.abspath(os.path.join(args.outdir, TF_name, "plots", TF_name + "_log2fcs.pdf")) log2fc_pdf = PdfPages(fig_out, keep_empty=True) @@ -297,27 +269,14 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf bed_table[base + "_log2fc"] = bed_table[base + "_log2fc"].round(5) # Compare log2fcs to background log2fcs - excluded = np.logical_and(np.isclose(bed_table[cond1 + "_score"].values, 0), np.isclose(bed_table[cond2 + "_score"].values, 0)) #exclude 0's from both conditions - subset = bed_table[np.logical_not(excluded)].copy() #included subset + included = np.logical_or(bed_table[cond1 + "_score"].values > 0, bed_table[cond2 + "_score"].values > 0) + subset = bed_table[included].copy() #included subset subset.loc[:,"peak_id"] = ["_".join([chrom, str(start), str(end)]) for (chrom, start, end) in zip(subset["peak_chr"].values, subset["peak_start"].values, subset["peak_end"].values)] + observed_log2fcs = subset.groupby('peak_id')[base + '_log2fc'].mean().reset_index()[base + "_log2fc"].values #if more than one TFBS per peak -> take mean value - observed_gcs = subset.groupby('peak_id')["GC"].mean().reset_index()["GC"].values #if more than one TFBS per peak -> take mean value - - #Resample from background - log2fc_params[(cond1, cond2)].set_params(random_state=len(observed_log2fcs)) - sampling, labels = log2fc_params[(cond1, cond2)].sample(int(info_table.at[TF_name, "total_tfbs"]*2)) - sampled_log2fcs, sampled_gcs = sampling[:,0], sampling[:,1] - - indices = [] - for val in observed_gcs: - idx = find_nearest_idx(sampled_gcs, val) - indices.append(idx) - sampled_gcs[idx] = np.inf - - bg_log2fcs = sampled_log2fcs[indices] #Estimate mean/std - bg_params = scipy.stats.norm.fit(bg_log2fcs) + bg_params = log2fc_params[(cond1, cond2)] obs_params = scipy.stats.norm.fit(observed_log2fcs) obs_mean, obs_std = obs_params @@ -326,7 +285,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf #If there was any change found at all (0 can happen if two bigwigs are the same) if obs_mean != bg_mean: - info_table.at[TF_name, base + "_change"] = (obs_mean - bg_mean) / bg_std #effect size + info_table.at[TF_name, base + "_change"] = (obs_mean - bg_mean) / np.mean([obs_std, bg_std]) #effect size info_table.at[TF_name, base + "_change"] = np.round(info_table.at[TF_name, base + "_change"], 5) #pval = scipy.stats.mannwhitneyu(observed_log2fcs, bg_log2fcs, alternative="two-sided")[1] @@ -338,25 +297,33 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf info_table.at[TF_name, base + "_change"] = 0 info_table.at[TF_name, base + "_pvalue"] = 1 - # Plot differences of distributions - fig, ax = plt.subplots(1, 1) - ax.hist(observed_log2fcs, density=True, bins=50, color="red", label="Observed log2fcs", alpha=0.5) - ax.hist(bg_log2fcs, density=True, bins=50, 
color="black", label="Background log2fcs", alpha=0.6) + #### Plot comparison ### + fig, ax = plt.subplots(1,1) + ax.hist(observed_log2fcs, bins='auto', label="Observed log2fcs", density=True) + xvals = np.linspace(plt.xlim()[0], plt.xlim()[1], 100) + + #Observed distribution + pdf = scipy.stats.norm.pdf(xvals, *obs_params) + ax.plot(xvals, pdf, label="Observed distribution (fit)", color="red", linestyle="--") ax.axvline(obs_mean, color="red", label="Observed mean") - ax.axvline(bg_mean, color="black", label="Background mean") + #Background distribution + pdf = scipy.stats.norm.pdf(xvals, *bg_params) + ax.plot(xvals, pdf, label="Background distribution (fit)", color="Black", linestyle="--") + ax.axvline(bg_mean, color="black", label="Background mean") + + #Set size x0,x1 = ax.get_xlim() y0,y1 = ax.get_ylim() - ax.set_aspect(((x1-x0)/(y1-y0)) / 1.5) #square volcano plot - - #Add text showing change - ax.text(0.05, 0.95, "Diff. score: {0:.3f}".format(info_table.at[TF_name, base + "_change"]), va="top", transform = ax.transAxes) + ax.set_aspect(((x1-x0)/(y1-y0)) / 1.5) - plt.xlabel("Log2(fold change)") + #Decorate + ax.legend() + plt.xlabel("Log2 fold change") plt.ylabel("Density") plt.title("Differential binding for TF \"{0}\"\nbetween ({1} / {2})".format(TF_name, cond1, cond2)) ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') - + plt.tight_layout() log2fc_pdf.savefig(fig, bbox_inches='tight') plt.close(fig) @@ -388,6 +355,162 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf try: os.remove(filename) except: - print("Error removing temporary file {0} - this does not effect the results of BINDetect.".format(filename) ) + logger.error("Could not remove temporary file {0} - this does not effect the results of BINDetect.".format(filename) ) return(info_table) + + + +#------------------------------------------------------------------------------------------------------# +#------------------------------------------------------------------------------------------------------# +#------------------------------------------------------------------------------------------------------# + +def plot_bindetect(motifs, clusters, conditions, args): + """ Conditions refer to the order of the fold_change divison, meaning condition1/condition2 + - Clusters is a RegionCluster object + - conditions is a tup of condition names (cond1, cond2) + """ + warnings.filterwarnings("ignore") + + cond1, cond2 = conditions + no_IDS = clusters.n + + #Link information from motifs / clusters + diff_scores = {} + for motif in motifs: + diff_scores[motif.name] = {"change": motif.change, + "pvalue": motif.pvalue, + "log10pvalue": -np.log10(motif.pvalue) if motif.pvalue > 0 else -np.log10(1e-308), #smallest possible number before python underflows + "volcano_label": motif.alt_name, #shorter name + "overview_label": "{0} ({1})".format(motif.alt_name, motif.id) #the name which was output used in bindetect output + } + + xvalues = np.array([diff_scores[TF]["change"] for TF in diff_scores]) + yvalues = np.array([diff_scores[TF]["log10pvalue"] for TF in diff_scores]) + + #### Define the TFs to plot IDs for #### + y_min = np.percentile(yvalues[yvalues < -np.log10(1e-300)], 95) + x_min, x_max = np.percentile(xvalues, [5,95]) + + for TF in diff_scores: + if diff_scores[TF]["change"] < x_min or diff_scores[TF]["change"] > x_max or diff_scores[TF]["log10pvalue"] > y_min: + diff_scores[TF]["show"] = True + if diff_scores[TF]["change"] < 0: + diff_scores[TF]["color"] = "blue" + elif diff_scores[TF]["change"] > 0: + 
diff_scores[TF]["color"] = "red" + else: + diff_scores[TF]["show"] = False + diff_scores[TF]["color"] = "black" + + node_color = clusters.node_color + IDS = np.array(clusters.names) + + #--------------------------------------- Figure --------------------------------# + + #Make figure + no_rows, no_cols = 2,2 + h_ratios = [1,max(1,no_IDS/25)] + figsize = (8,10+7*(no_IDS/25)) + + fig = plt.figure(figsize = figsize) + gs = gridspec.GridSpec(no_rows, no_cols, height_ratios=h_ratios) + gs.update(hspace=0.0001, bottom=0.00001, top=0.999999) + + ax1 = fig.add_subplot(gs[0,:]) #volcano + ax2 = fig.add_subplot(gs[1,0]) #long scatter overview + ax3 = fig.add_subplot(gs[1,1]) #dendrogram + + ######### Volcano plot on top of differential values ######## + ax1.set_title("BINDetect volcano plot", fontsize=16, pad=20) + ax1.scatter(xvalues, yvalues, color="black", s=5) + + #Add +/- 10% to make room for labels + ylim = ax1.get_ylim() + y_extra = (ylim[1] - ylim[0]) * 0.1 + ax1.set_ylim(ylim[0], ylim[1] + y_extra) + + xlim = ax1.get_xlim() + x_extra = (xlim[1] - xlim[0]) * 0.1 + lim = np.max([np.abs(xlim[0]-x_extra), np.abs(xlim[1]+x_extra)]) + ax1.set_xlim(-lim, lim) + + x0,x1 = ax1.get_xlim() + y0,y1 = ax1.get_ylim() + ax1.set_aspect((x1-x0)/(y1-y0)) #square volcano plot + + #Decorate plot + ax1.set_xlabel("Differential binding score") + ax1.set_ylabel("-log10(pvalue)") + + ########### Dendrogram over similarities of TFs ####### + dendro_dat = dendrogram(clusters.linkage_mat, labels=IDS, no_labels=True, orientation="right", ax=ax3, above_threshold_color="black", link_color_func=lambda k: clusters.node_color[k]) + labels = dendro_dat["ivl"] #Now sorted for the order in dendrogram + ax3.set_xlabel("Transcription factor similarities\n(Clusters below threshold are colored)") + + ax3.set_ylabel("Transcription factor clustering based on TFBS overlap", rotation=270, labelpad=20) + ax3.yaxis.set_label_position("right") + + #Set aspect of dendrogram/changes + x0,x1 = ax3.get_xlim() + y0,y1 = ax3.get_ylim() + ax3.set_aspect(((x1-x0)/(y1-y0)) * no_IDS/10) + + ########## Differential binding scores per TF ########## + ax2.set_xlabel("Differential binding score\n" + "(" + cond2 + r' $\leftarrow$' + r'$\rightarrow$ ' + cond1 + ")") #First position in comparison equals numerator in log2fc division + ax2.xaxis.set_label_position('bottom') + ax2.xaxis.set_ticks_position('bottom') + + no_labels = len(labels) + ax2.set_ylim(0.5, no_labels+0.5) + ax2.set_ylabel("Transcription factors") + + ax2.set_yticks(range(1,no_labels+1)) + ax2.set_yticklabels([diff_scores[TF]["overview_label"] for TF in labels]) + ax2.axvline(0, color="grey", linestyle="--") #Plot line at middle + + #Plot scores per TF + for y, TF in enumerate(labels): #labels are the output motif names from output + + + idx = np.where(IDS == TF)[0][0] + score = diff_scores[TF]["change"] + + #Set coloring based on change/pvalue + if diff_scores[TF]["show"] == True: + fill = "full" + else: + fill = "none" + + ax2.axhline(y+1, color="grey", linewidth=1) + ax2.plot(score, y+1, marker='o', color=node_color[idx], fillstyle=fill) + ax2.yaxis.get_ticklabels()[y].set_color(node_color[idx]) + + #Set x-axis ranges + lim = np.max(np.abs(ax2.get_xlim())) + ax2.set_xlim((-lim, lim)) #center on 0 + + #set aspect + x0,x1 = ax2.get_xlim() + y0,y1 = ax2.get_ylim() + ax2.set_aspect(((x1-x0)/(y1-y0)) * no_IDS/10) #square volcano plot + + plt.tight_layout() #tight layout before setting ids in volcano plot + + ######### Color points and set labels in volcano ######## + txts = [] + for TF 
in diff_scores: + coord = [diff_scores[TF]["change"], diff_scores[TF]["log10pvalue"]] + ax1.scatter(coord[0], coord[1], color=diff_scores[TF]["color"], s=4.5) + + if diff_scores[TF]["show"] == True: + txts.append(ax1.text(coord[0], coord[1], diff_scores[TF]["volcano_label"], fontsize=7)) + + adjust_text(txts, ax=ax1, text_from_points=True, arrowprops=dict(arrowstyle='-', color='black', lw=0.5)) #, expand_text=(0.1,1.2), expand_objects=(0.1,0.1)) + + #Plot custom legend for colors + legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="More bound in {0}".format(conditions[0])), + Line2D([0],[0], marker='o', color='w', markerfacecolor="blue", label="More bound in {0}".format(conditions[1]))] + ax1.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left') + + return(fig) diff --git a/tobias/footprinting/footprint_scores.py b/tobias/footprinting/footprint_scores.py index b2d0eb8..16892ba 100644 --- a/tobias/footprinting/footprint_scores.py +++ b/tobias/footprinting/footprint_scores.py @@ -28,6 +28,7 @@ from tobias.utils.sequences import * from tobias.utils.signals import * from tobias.utils.ngs import * +from tobias.utils.logger import * #-----------------------------------------------------------------# def add_footprint_arguments(parser): @@ -49,22 +50,19 @@ def add_footprint_arguments(parser): optargs = parser.add_argument_group('Optional arguments') optargs.add_argument('--extend', metavar="", type=int, help="Extend input regions with bp (default: 100)", default=100) optargs.add_argument('--score', metavar="", choices=["tobias", "FOS", "sum"], help="Type of scoring to perform on cutsites (tobias/FOS/sum) (default: tobias)", default="tobias") - - optargs.add_argument('--fp_min', metavar="", type=int, help="Minimum footprint width (default: 15)", default=15) + optargs.add_argument('--window', metavar="", type=int, help="For '--score tobias' this is the background region for measuring effect of binding. 
For '--score sum' this is the window for calculation of sum (default: 100)", default=100) + optargs.add_argument('--fp_min', metavar="", type=int, help="Minimum footprint width (default: 20)", default=20) optargs.add_argument('--fp_max', metavar="", type=int, help="Maximum footprint width (default: 50)", default=50) - optargs.add_argument('--flank_min', metavar="", type=int, help="Minimum range of flanking regions (default: 10)", default=10) - optargs.add_argument('--flank_max', metavar="", type=int, help="Maximum range of flanking regions (default: 30)", default=30) - optargs.add_argument('--sum_window', metavar="", type=int, help="For --score == sum; Window for calculation of sum (default: 200)", default=200) + optargs.add_argument('--flank_min', metavar="", type=int, help="For FOS; Minimum range of flanking regions (default: 10)", default=10) + optargs.add_argument('--flank_max', metavar="", type=int, help="For FOS; Maximum range of flanking regions (default: 30)", default=30) optargs.add_argument('--smooth', metavar="", type=int, help="Smooth output signal by mean in windows (default: 5)", default=5) - optargs.add_argument('--min_limit', metavar="", type=float, help="Limit input bigwig score range (default: no lower limit)") #default none - optargs.add_argument('--max_limit', metavar="", type=float, help="Limit input bigwig score range (default: no upper limit)") #default none + optargs.add_argument('--min_limit', metavar="", type=float, help="Limit input bigwig score range (default: no lower limit)") #default none + optargs.add_argument('--max_limit', metavar="", type=float, help="Limit input bigwig score range (default: no upper limit)") #default none runargs = parser.add_argument_group('Run arguments') runargs.add_argument('--cores', metavar="", type=int, help="Number of cores to use for computation (default: 1)", default=1) runargs.add_argument('--split', metavar="", type=int, help="Split of multiprocessing jobs (default: 100)", default=100) - runargs.add_argument('--verbosity', metavar="", help="Level of output logging (1 (sparse) / 2 (normal) / 3 (debug)) (default: 2)", choices=[1,2,3], default=2, type=int) - runargs.add_argument('--log', metavar="", help="Full path of logfile (default: log is printed to stdout)") - runargs.add_argument('--debug', help=argparse.SUPPRESS, action='store_true') + runargs = add_logger_args(runargs) return(parser) @@ -75,12 +73,12 @@ def calculate_scores(regions, args): pybw_header = pybw_signal.chroms() chrom_lengths = {chrom:int(pybw_header[chrom]) for chrom in pybw_header} - output_dict = {} - #Set flank to enable scoring in ends of regions if args.score == "sum": - flank = int(args.sum_window/2.0) - elif args.score == "tobias" or args.score == "FOS": + flank = int(args.window/2.0) + elif args.score == "tobias": + flank = int((args.window - args.fp_min)/2) + elif args.score == "FOS": flank = args.flank_max else: flank = args.flank_max @@ -105,13 +103,13 @@ def calculate_scores(regions, args): #Calculate scores if args.score == "sum": signal = np.abs(signal) - scores = fast_rolling_math(signal, args.sum_window, "sum") + scores = fast_rolling_math(signal, args.window, "sum") elif args.score == "FOS": scores = calc_FOS(signal, args.fp_min, args.fp_max, args.flank_min, args.flank_max) elif args.score == "tobias": - scores = tobias_footprint_array(signal, args.fp_min, args.fp_max, args.flank_min, args.flank_max) #numpy array + scores = tobias_footprint_array(signal, args.window, args.fp_min, args.fp_max) #numpy array else: sys.exit("{0} not 
found".format(args.score)) @@ -123,16 +121,15 @@ def calculate_scores(regions, args): #Remove ends to prevent overlap with other regions if flank > 0: scores = scores[flank:-flank] - output_dict[reg_key] = scores - return(output_dict) + args.writer_qs["scores"].put(("scores", reg_key, scores)) -#------------------------------------------------------------------# + return(1) + +#------------------------------------------------------------------------------------------# def run_footprinting(args): - begin_time = datetime.now() - check_required(args, ["signal", "output", "regions"]) check_files([args.signal, args.regions], "r") check_files([args.output], "w") @@ -141,31 +138,30 @@ def run_footprinting(args): # Create logger and write info to log #---------------------------------------------------------------------------------------# - logger = create_logger(args.verbosity, args.log) - logger.comment("#TOBIAS FootprintScores (run started {0})\n".format(begin_time)) - logger.comment("#Command line call: {0}\n".format(" ".join(sys.argv))) + logger = TobiasLogger("FootprintScores", args.verbosity) + logger.begin() parser = add_footprint_arguments(argparse.ArgumentParser()) - logger.comment(arguments_overview(parser, args)) - - logger.comment("# ----- Output files -----") - logger.comment("# - {0}".format(args.output)) - logger.comment("\n\n") + logger.arguments_overview(parser, args) + logger.output_files([args.output]) + logger.debug("Setting up listener for log") + logger.start_logger_queue() + args.log_q = logger.queue #---------------------------------------------------------------------------------------# #----------------------- I/O - get regions/bigwig ready --------------------------------# #---------------------------------------------------------------------------------------# - logger.critical("Processing input files") - logger.info("Opening cutsite bigwig") + logger.info("Processing input files") + + logger.info("- Opening input cutsite bigwig") pybw_signal = pyBigWig.open(args.signal) pybw_header = pybw_signal.chroms() chrom_info = {chrom:int(pybw_header[chrom]) for chrom in pybw_header} - - logger.info("Getting output regions ready") - + #Decide regions + logger.info("- Getting output regions ready") if args.regions: regions = RegionList().from_bed(args.regions) regions.apply_method(OneRegion.extend_reg, args.extend) @@ -174,88 +170,63 @@ def run_footprinting(args): else: regions = RegionList().from_list([OneRegion([chrom, 0, chrom_info[chrom]]) for chrom in chrom_info]) - #Getting bigwig files ready - logger.info("Opening output bigwig") + #Todo: check boundaries in relation to flanking regions for calculation + + + #Information for output bigwig reference_chroms = sorted(list(chrom_info.keys())) header = [(chrom, chrom_info[chrom]) for chrom in reference_chroms] - pybw_footprints = pyBigWig.open(args.output, "w") - pybw_footprints.addHeader(header) - regions.loc_sort(reference_chroms) + #---------------------------------------------------------------------------------------# #------------------------ Calculating footprints and writing out -----------------------# #---------------------------------------------------------------------------------------# - logger.critical("Calculating footprints in regions...") + logger.info("Calculating footprints in regions...") + regions_chunks = regions.chunks(args.split) + + #Setup pools + writer_cores = 1 + worker_cores = max(1, args.cores - writer_cores) + logger.debug("Worker cores: {0}".format(worker_cores)) + logger.debug("Writer cores: 
{0}".format(writer_cores)) - if args.debug == True: - fp = calculate_scores(regions[:100], args) + worker_pool = mp.Pool(processes=worker_cores) + writer_pool = mp.Pool(processes=writer_cores) + manager = mp.Manager() - regions_chunks = regions.chunks(args.split) + #Start bigwig file writers + q = manager.Queue() + writer_pool.apply_async(bigwig_writer, args=(q, {"scores":args.output}, header, regions, args)) + writer_pool.close() #no more jobs applied to writer_pool + writer_qs = {"scores": q} + + args.writer_qs = writer_qs - #Start correction + #Start calculating scores pool = mp.Pool(processes=args.cores) task_list = [pool.apply_async(calculate_scores, args=[chunk, args]) for chunk in regions_chunks] no_tasks = len(task_list) pool.close() + monitor_progress(task_list, logger) + results = [task.get() for task in task_list] - chrom_order = {reference_chroms[i]:i for i in range(len(reference_chroms))} - - #Process results as they come in - write_idx = 0 #index in task_list - prev_progress = (-1, -1) #done/write - while write_idx < len(task_list): - - if task_list[write_idx].ready() == True: #Write result to file - - footprints = task_list[write_idx].get() - - #Write tracks to bigwig file - for region in sorted(footprints.keys(), key=lambda tup: (chrom_order[tup[0]], tup[1], tup[2])): #Ensure that positions are written to bigwig in correct order - logger.debug("Processing {0}".format(region)) - signal = footprints[region] - - chrom, reg_start, reg_end = region - positions = np.arange(reg_start, reg_end) #genomic 0-based positions - - zeros = np.logical_or(np.isclose(signal, 0), np.isnan(signal)) - signal[zeros] = 0 #adjust for weird numpy floating point - - included = signal.nonzero()[0] - pos = positions[included] - val = signal[included] - - if len(pos) > 0: - try: - pybw_footprints.addEntries(chrom, pos, values=val, span=1) - except: - logger.critical("Error writing region {0}.".format(region)) - sys.exit() - - #Clear memory - footprints[region] = None - - #Wait for next region to print - write_idx += 1 #This is also the same as the number of prints done - - tasks_done = sum([task.ready() for task in task_list]) - if tasks_done != prev_progress[0] or write_idx != prev_progress[1]: - logger.info("Calculation progress: {0:.0f}% | Writing progress: {1:.0f}%".format(tasks_done/no_tasks*100, write_idx/no_tasks*100)) - - prev_progress = (tasks_done, write_idx) + #Stop all queues for writing + logger.debug("Stop all queues by inserting None") + for q in writer_qs.values(): + q.put((None, None, None)) #Done computing - pool.join() - - #Done writing to files - logger.critical("Closing bigwig file (this can take a while)") - pybw_footprints.close() - - end_time = datetime.now() - logger.critical("Finished FootprintScores run (total time of {0})".format(end_time-begin_time)) + writer_pool.join() + worker_pool.terminate() + worker_pool.join() + + logger.stop_logger_queue() + #Finished scoring + logger.end() #--------------------------------------------------------------------------------------------------------# diff --git a/tobias/misc/__init__.py b/tobias/misc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tobias/misc/maxpos.py b/tobias/misc/maxpos.py new file mode 100644 index 0000000..ef2c498 --- /dev/null +++ b/tobias/misc/maxpos.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python + +""" +Maxpos: Find the position of the maximum signal for each input region. 
+ +@author: Mette Bentsen +@contact: mette.bentsen (at) mpi-bn.mpg.de +@license: MIT + +""" + +#--------------------------------------------------------------------------------------------------------# +#----------------------------------------- Import libraries ---------------------------------------------# +#--------------------------------------------------------------------------------------------------------# + +import os +import sys +import argparse +import pyBigWig +import numpy as np + +#Utils from TOBIAS +from tobias.utils.regions import * +from tobias.utils.utilities import * +from tobias.utils.logger import * + + +#-------------------------------------------------------------------------------------------# +#-------------------------------- Command line arguments -----------------------------------# +#-------------------------------------------------------------------------------------------# + +def add_maxpos_arguments(parser): + + parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90) + description = "MaxPos identifies the position of maximum signal (from bigwig) within a given set of .bed-regions. Used to identify peak of signals such as accessibility or footprint scores.\n\n" + description += "Usage:\nTOBIAS MaxPos --bed --bigwig \n\n" + description += "The output is a 4-column .bed-file (default output is stdout, but you can use --output to set a specific output file).\n" + parser.description = format_help_description("MaxPos", description) + + parser._action_groups.pop() #pop -h + + #Required arguments + required = parser.add_argument_group('Required arguments') + required.add_argument('--bed', metavar="", help="Regions to search for max position within") + required.add_argument('--bigwig', metavar="", help="Scores used to identify maximum value") + + #Optional arguments + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('--output', metavar="", help="Path to output .bed-file (default: scored sites are written to stdout") + optional.add_argument('--invert', help="Find minimum position instead of maximum position", action='store_true', default=False) + + return(parser) + +#--------------------------------------------------------------------------------# +def get_minmax_func(args): + + if args.invert == False: + func = lambda signal: [i for i in range(len(signal)) if signal[i] == np.max(signal)] + else: + func = lambda signal: [i for i in range(len(signal)) if signal[i] == np.min(signal)] + + return(func) + + +#--------------------------------------------------------------------------------# +def run_maxpos(args): + + check_required(args, ["bed", "bigwig"]) + check_files([args.bed, args.bigwig], "r") + check_files([args.output], "w") + + + #Setup output file/stdout + if args.output != None: + sys.stdout = open(args.output, 'w') + + # Setup score function + minmax_func = get_minmax_func(args) + + #----------------------------------------------------------------------# + #---------------------- Open bw and run scoring -----------------------# + #----------------------------------------------------------------------# + + pybw = pyBigWig.open(args.bigwig) + #todo: test length of bigwig to scope of bed + + #Get maximum position(s) for each region + with open(args.bed) as f: + for line in f: + + columns = line.strip().split("\t") + chrom, start, end = columns[0], int(columns[1]), int(columns[2]) + + #Get signal from bigwig + signal = pybw.values(chrom, start, end, numpy=True) + signal = 
np.nan_to_num(signal) + positions = minmax_func(signal) + + #output is list of index positions -> convert to bed + rel_max_start, rel_max_end = min(positions), max(positions) #position relative to peak start + outline = "\t".join([chrom, str(start+rel_max_start-1), str(start+rel_max_end), ".", str(np.max(signal)), "."]) + print(outline) + + pybw.close() + +#--------------------------------------------------------------------------------------------------------# +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser = add_maxpos_arguments(parser) + args = parser.parse_args() + + if len(sys.argv[1:]) == 0: + parser.print_help() + sys.exit() + + run_maxpos(args) diff --git a/tobias/utils/merge_pdfs.py b/tobias/misc/merge_pdfs.py similarity index 86% rename from tobias/utils/merge_pdfs.py rename to tobias/misc/merge_pdfs.py index d906d24..f1e8e93 100644 --- a/tobias/utils/merge_pdfs.py +++ b/tobias/misc/merge_pdfs.py @@ -1,7 +1,12 @@ +#!/usr/bin/env python +""" +Small utility for merging pdfs -#Small utility for merging pdfs - +@author: Mette Bentsen +@contact: mette.bentsen (at) mpi-bn.mpg.de +@license: MIT +""" from PyPDF2 import PdfFileMerger, PdfFileReader import argparse @@ -16,9 +21,8 @@ def add_mergepdf_arguments(parser): parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90) - description = "" + description = "Merge single PDF-files into one file" parser.description = format_help_description("MergePDF", description) - parser._action_groups.pop() #pop -h reqargs = parser.add_argument_group('Required arguments') diff --git a/tobias/utils/subsample_bam.py b/tobias/misc/subsample_bam.py similarity index 83% rename from tobias/utils/subsample_bam.py rename to tobias/misc/subsample_bam.py index 4b0805a..13920ca 100644 --- a/tobias/utils/subsample_bam.py +++ b/tobias/misc/subsample_bam.py @@ -1,11 +1,22 @@ +#!/usr/bin/env python + +""" +SubsampleBam: Samples from bam in percentage-steps with replicates per step + +@author: Mette Bentsen +@contact: mette.bentsen (at) mpi-bn.mpg.de +@license: MIT + +""" + import os import sys import multiprocessing as mp -from datetime import datetime import subprocess import argparse from tobias.utils.utilities import * +from tobias.utils.logger import * #-------------------------------------------------------------------# def run_commandline(command): @@ -36,6 +47,7 @@ def add_subsample_arguments(parser): args.add_argument('--cores', metavar="", type=int, help="Cores for multiprocessing (default: 1)", default=1) args.add_argument('--outdir', metavar="", help="Output directory (default: current working directory)", default=".") args.add_argument('--prefix', metavar="", help="Prefix for output files (default: prefix of .bam)") + args = add_logger_args(args) return(parser) @@ -48,10 +60,9 @@ def run_subsampling(args): args.outdir = os.path.abspath(args.outdir) if args.outdir != None else os.path.abspath(os.getcwd()) #---------------------------------------------------# - stime = datetime.now() - print("-"*70) - print("------- Started bam subsampling ({0}) -------".format(stime)) - print("-"*70 + "\n") + + logger = TobiasLogger("SubsampleBam", args.verbosity) + logger.begin() #### Getting ready for running cmd_calls = [] @@ -74,14 +85,7 @@ def run_subsampling(args): pool = mp.Pool(processes=args.cores) task_list = [pool.apply_async(run_commandline, cmd) for cmd in cmd_calls] pool.close() - - #check progress - while complete_tasks != all_tasks: - complete_tasks = sum([task.ready() for task 
in task_list]) - if complete_tasks != prev_complete_tasks: - print(complete_tasks / float(all_tasks) * 100.0) - prev_complete_tasks = complete_tasks - + monitor_progress(task_list, logger) pool.join() else: @@ -90,8 +94,7 @@ def run_subsampling(args): complete_tasks += 1 print(complete_tasks / float(all_tasks) * 100.0) - print("Completed bam subsampling! Bamfiles are found at:\n{0}".format(args.outdir)) - + logger.end() #--------------------------------------------------------------------------------------------------------# if __name__ == '__main__': diff --git a/tobias/motifs/cluster_tfbs.py b/tobias/motifs/cluster_tfbs.py new file mode 100644 index 0000000..38efa60 --- /dev/null +++ b/tobias/motifs/cluster_tfbs.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python + +""" +ClusterTFBS: Performs TF clustering based on positions of TFBS across genome by calculating overlaps of sites + +@author: Mette Bentsen +@contact: mette.bentsen (at) mpi-bn.mpg.de +@license: MIT +""" + +import argparse +import tempfile +import sys +import os +import numpy as np +import itertools +from datetime import datetime +import copy +from scipy.spatial.distance import squareform +from scipy.cluster.hierarchy import dendrogram, linkage, fcluster +import matplotlib.pyplot as plt + +from tobias.utils.regions import * +from tobias.utils.utilities import * +from tobias.utils.logger import * + + + +def add_clustering_arguments(parser): + + parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90) + description = "" + parser.description = format_help_description("ClusterTFBS", description) + + parser._action_groups.pop() #pop -h + + args = parser.add_argument_group('Arguments') + args.add_argument('-b', '--bedfiles', metavar="", nargs="*", help="Bedfile(s) containing TFBS sites with name in the 4th column") + args.add_argument('-o', '--outdir', metavar="", help="Path of output folder (default: current directory)", default=".") + args.add_argument('-p', '--prefix', metavar="", help="Prefix of output files (default: ClusterTFBS)", default="ClusterTFBS") + args.add_argument('-c', '--cores', metavar="", type=int, help="Number of cores to use for computation (default: 1)", default=1) + args.add_argument('--threshold', metavar="", help="Threshold for clustering (default: 0.5)", type=lambda f: restricted_float(f,0,1), default=0.5) + args.add_argument('--method', metavar="", help="Method for hierarchical clustering (single/complete/average/weighted/centroid/median/ward) (default: complete)", choices=["single", "complete", "average", "weighted", "centroid", "median", "ward"], default="complete") + args = add_logger_args(args) + + return(parser) + +def overlap_sites(f): + """ overlap bed sites from file """ + + sites = RegionList().from_bed(f) + overlap = sites.count_overlaps() + + return(overlap) + +############################################################################################# + +def run_clustering(args): + + #Create logger + logger = TobiasLogger("ClusterTFBS", args.verbosity) + logger.begin() + + parser = add_clustering_arguments(argparse.ArgumentParser()) + logger.arguments_overview(parser, args) + + #Output files + #output_files = "" + #logger.output_files(output_files) + + #Check input + check_required(args, ["bedfiles"]) #Check input arguments + make_directory(args.outdir) #Check output directory; create if needed + + #----------------------------------------------------------# + + #Join all sites + logger.info("Joining and sorting all sites") + + #tempprefix + 
temp_prefix = next(tempfile._get_candidate_names()) + logger.debug("Temp prefix: {0}".format(temp_prefix)) + + joined_f = os.path.join(args.outdir, "{0}.tmp".format(temp_prefix)) + cmd = "cat {0} | cut -f 1-4 | sort -k1,1 -k2,2n > {1}".format(" ".join(args.bedfiles), joined_f) + os.system(cmd) + + logger.info("Splitting sites per chromosome") + handles = {} + splitted = [] + with open(joined_f) as f: + for line in f: + + columns = line.split("\t") + chrom = columns[0] + + if chrom not in handles: + logger.debug("Opening handle for chrom {0}".format(chrom)) + split_f = os.path.join(args.outdir, "{0}_{1}.tmp".format(temp_prefix, chrom)) + splitted.append(split_f) + handles[chrom] = open(split_f, "w") + + handles[chrom].write(line) + + logger.debug("Closing handles") + for chrom in handles: + handles[chrom].close() + + #Overlap sites + logger.info("Overlapping sites within subsets") + results = run_parallel(overlap_sites, splitted, [], args.cores, logger) + + #Join results from individual overlaps + logger.info("Joining results from subsets") + global_overlap = merge_dicts(results) + + + #----------------- Create distance matrix -------------------# + + clustering = RegionCluster(global_overlap) + logger.stats("Found {0} unique IDs".format(clustering.n)) + + clustering.cluster(threshold=args.threshold, method=args.method) + + #Write out distance mat + outfile = os.path.join(args.outdir, args.prefix + "_distance_matrix.txt") + clustering.write_distance_mat(outfile) + + #Write out clusters + outfile = open(os.path.join(args.outdir, args.prefix + "_clusters.txt"), "w") + outfile.write("#cluster_name\tcluster_size\tcluster_members\n") + + i = 1 + for cluster in clustering.clusters: + members = clustering.clusters[cluster]["member_names"] + outfile.write("C_{0}\t{1}\t{2}\n".format(i, len(members), ", ".join(members))) + i += 1 + + outfile.close() + + #--------------------------- Plot dendrogram -----------------------------# + + fig, ax = plt.subplots(figsize = (3,max(10,clustering.n/10))) #5 rows per one column width + dendro_dat = dendrogram(clustering.linkage_mat, labels=clustering.names, orientation="right", ax=ax, above_threshold_color="black", link_color_func=lambda k: clustering.node_color[k]) + + figure_f = os.path.join(args.outdir, args.prefix + "_dendrogram.pdf") + fig.savefig(figure_f, bbox_inches='tight') + logger.info("Plotted dendrogram to: {0}".format(figure_f)) + + #---------------------- Remove tempfiles and end -------------------------# + + for fil in [joined_f] + splitted: + logger.debug("Removing {0}".format(fil)) + try: + os.remove(fil) + except: + logger.warning("Could not remove tempfile {0}".format(fil)) + + logger.end() + +#------------------------------------------------------------------------------------------# + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser = add_clustering_arguments(parser) + args = parser.parse_args() + + if len(sys.argv[1:]) == 0: + parser.print_help() + sys.exit() + + run_clustering(args) + diff --git a/tobias/motifs/format_motifs.py b/tobias/motifs/format_motifs.py index 3ed89ca..085a049 100644 --- a/tobias/motifs/format_motifs.py +++ b/tobias/motifs/format_motifs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Format (convert/join/split) motifs +Utility to format (convert/join/split) motifs between pfm/jaspar/meme formats @author: Mette Bentsen @contact: mette.bentsen (at) mpi-bn.mpg.de @@ -17,6 +17,7 @@ #Internal functions/classes from tobias.utils.motifs import * from tobias.utils.utilities import * +from tobias.utils.logger 
import * #--------------------------------------------------------------------------------------------------------# def add_formatmotifs_arguments(parser): @@ -34,6 +35,9 @@ def add_formatmotifs_arguments(parser): required.add_argument('--format', metavar="", help="Desired motif output format (pfm, jaspar, meme) (default: \"jaspar\")", choices=["pfm", "jaspar", "meme"], default="jaspar") required.add_argument('--task', metavar="", help="Which task to perform on motif files (join/split) (default: join)", choices=["join", "split"], default="join") required.add_argument('--output', metavar="", help="If task == join, output is the joined output file; if task == split, output is a directory") + + additional = parser.add_argument_group('Additional arguments') + additional = add_logger_args(additional) return(parser) @@ -44,14 +48,15 @@ def run_formatmotifs(args): check_required(args, ["input", "output"]) #Check input arguments check_files(args.input) #Check if files exist - # Create logger and write argument overview - logger = create_logger() + logger = TobiasLogger("FormatMotifs", args.verbosity) + logger.begin() + parser = add_formatmotifs_arguments(argparse.ArgumentParser()) - logger.comment(arguments_overview(parser, args)) + logger.arguments_overview(parser, args) + logger.output_files([args.output]) - ### Getting ready - logger.comment("") + ####### Getting ready ####### if args.task == "split": logger.info("Making directory {0} if not existing".format(args.output)) @@ -81,7 +86,7 @@ def run_formatmotifs(args): meme_header += "Background letter frequencies\nA 0.25 C 0.25 G 0.25 T 0.25\n\n" if args.task == "split": - logger.info("Writing individual files to directory {0}\n".format(args.output)) + logger.info("Writing individual files to directory {0}".format(args.output)) #Split on ">" if args.format == "jaspar" or args.format == "pfm": @@ -110,17 +115,15 @@ def run_formatmotifs(args): f_out.close() - elif args.task == "join": - logger.info("Writing converted motifs to file {0}\n".format(args.output)) + logger.info("Writing converted motifs to file {0}".format(args.output)) if args.format == "meme": converted_content = meme_header + converted_content out_f.write(converted_content + "\n") - - logger.info("FormatMotifs done!") + logger.end() #--------------------------------------------------------------------------------------------------------# diff --git a/tobias/utils/score_bed.py b/tobias/motifs/score_bed.py similarity index 85% rename from tobias/utils/score_bed.py rename to tobias/motifs/score_bed.py index 8020ef7..cc1b7d5 100644 --- a/tobias/utils/score_bed.py +++ b/tobias/motifs/score_bed.py @@ -1,6 +1,8 @@ #!/usr/bin/env python """ +Scorebed: Scores a bedfile with signal from bigwig file(s) + @author: Mette Bentsen @contact: mette.bentsen (at) mpi-bn.mpg.de @license: MIT @@ -24,6 +26,7 @@ #Utils from TOBIAS from tobias.utils.regions import * from tobias.utils.utilities import * +from tobias.utils.logger import * #-------------------------------------------------------------------------------------------# #-------------------------------- Command line arguments -----------------------------------# @@ -32,8 +35,8 @@ def add_scorebed_arguments(parser): parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90) - description = "" - parser.description = format_help_description("ScoreBed", description="") + description = "ScoreBed is a utility to score .bed-file regions with values from a .bigwig-file. 
The output is a .bed-file with the bigwig value(s) as extra column(s). Options --position and --math can be used to adjust scoring scheme." + parser.description = format_help_description("ScoreBed", description) parser._action_groups.pop() #pop -h @@ -41,8 +44,7 @@ def add_scorebed_arguments(parser): required = parser.add_argument_group('Required arguments') required.add_argument('--bed', metavar="", help="Sites to score (.bed file)") required.add_argument('--bigwigs', metavar="", nargs="*", help="Scores to assign to regions in .bed (.bw file(s))") - #required.add_argument('--output', metavar="", help="Path to output .bed-file") - + #Optional arguments optional = parser.add_argument_group('Optional arguments') optional.add_argument('--output', metavar="", help="Path to output .bed-file (default: scored sites are written to stdout") @@ -50,6 +52,7 @@ def add_scorebed_arguments(parser): optional.add_argument('--null', metavar="", help="If --subset is given, which score/label to add to non-scored regions (default: 0)", default="0", type=float) optional.add_argument('--position', metavar="", help="Position in sites to score (start/mid/end/full) (default: full)", choices=["mid", "start", "end", "full"], default="full") optional.add_argument('--math', metavar="", help="If position == full, choose math to perform on signal (min/max/mean/sum) (default: mean)", choices=["min", "max", "mean", "sum"], default="mean") + optional = add_logger_args(optional) #optional.add_argument('--buffer', metavar="", help="Lines to buffer before writing (default: 10000)", type=int, default=10000) return(parser) @@ -82,8 +85,19 @@ def get_score_func(args): #--------------------------------------------------------------------------------# def run_scorebed(args): - begin_time = datetime.now() + #Verbosity is 0 if output is written to stdout + if args.output == None: + args.verbosity = 0 + + #Start logger + logger = TobiasLogger("ScoreBed", args.verbosity) + logger.begin() + parser = add_scorebed_arguments(argparse.ArgumentParser()) + logger.arguments_overview(parser, args) + logger.output_files([args.output]) + + #Check input check_required(args, ["bed", "bigwigs"]) check_files([getattr(args, arg) for arg in ["bed", "bigwigs", "subset"]], "r") check_files([getattr(args, "output")], "w") @@ -94,12 +108,13 @@ def run_scorebed(args): no_bigwigs = len(args.bigwigs) + #----------------------------------------------------------------------# #----------------------- Overlap bed if needed ------------------------# #----------------------------------------------------------------------# if args.subset != None: - #print("Overlapping {0} to regions in {1}".format(args.bed, args.subset)) + logger.info("Overlapping {0} to regions in {1}".format(args.bed, args.subset)) #Make overlap pb_bed = pb.BedTool(args.bed) @@ -110,16 +125,14 @@ def run_scorebed(args): # Setup score function score_func = get_score_func(args) + #----------------------------------------------------------------------# #---------------------- Open bw and run scoring -----------------------# #----------------------------------------------------------------------# pybw = {bigwig_f: pyBigWig.open(bigwig_f) for bigwig_f in args.bigwigs} #open filehandles for all bigwigs - #check bases? 
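In spirit, ScoreBed with --position full --math mean appends the mean bigwig signal over each .bed region as an extra column. A minimal sketch using pyBigWig directly is given below; file names are placeholders, and the tool itself builds its scoring function via get_score_func/OneRegion rather than calling stats() per line.

import pyBigWig

bed_f = "sites.bed"            # placeholder: regions to score
bigwig_f = "footprints.bw"     # placeholder: signal to assign
out_f = "sites_scored.bed"

bw = pyBigWig.open(bigwig_f)
with open(bed_f) as bed, open(out_f, "w") as out:
    for line in bed:
        if not line.strip() or line.startswith("#"):
            continue
        cols = line.rstrip("\n").split("\t")
        chrom, start, end = cols[0], int(cols[1]), int(cols[2])

        # Mean signal over the whole region (spirit of --position full --math mean);
        # stats() returns [None] for regions without coverage
        mean_val = bw.stats(chrom, start, end, type="mean")[0]
        score = 0.0 if mean_val is None else mean_val

        out.write("\t".join(cols + ["{0:.5f}".format(score)]) + "\n")
bw.close()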
- #for bigwig_f - #'nBasesCovered': 0 - + logger.info("Starting scoring...") #buff = [""]*args.buffer #buff_idx = 0 count = 0 @@ -160,15 +173,10 @@ def run_scorebed(args): # buff = [""]*args.buffer # buff_idx = 0 - #fout.write("\n".join([line for line in buff if line != ""]) + "\n") #write out remaining lines in buffer - for bigwig_f in pybw: pybw[bigwig_f].close() - - end_time = datetime.now() - #print("Finished scoring bed in {0}".format(end_time - begin_time)) - - + + logger.end() #--------------------------------------------------------------------------------------------------------# if __name__ == '__main__': diff --git a/tobias/motifs/tfbscan.py b/tobias/motifs/tfbscan.py index b6275fc..6196fb4 100644 --- a/tobias/motifs/tfbscan.py +++ b/tobias/motifs/tfbscan.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ TFBScan.py scans for positions of transcription factor binding sites across the genome @@ -14,11 +16,12 @@ import multiprocessing as mp import itertools -import pysam +import pysam #for reading fastafile from tobias.utils.utilities import * from tobias.utils.regions import * from tobias.utils.sequences import * from tobias.utils.motifs import * +from tobias.utils.logger import * #----------------------------------------------------------------------------------------------------------# def add_tfbscan_arguments(parser): @@ -45,12 +48,13 @@ def add_tfbscan_arguments(parser): optional_arguments.add_argument('--naming', metavar="", help="Naming convention for bed-ids and output files ('id', 'name', 'name_id', 'id_name') (default: 'name_id')", choices=["id", "name", "name_id", "id_name"], default="name_id") optional_arguments.add_argument('--gc', metavar="", type=lambda x: restricted_float(x,0,1), help='Set the gc content for background regions (default: will be estimated from fasta)') optional_arguments.add_argument('--pvalue', metavar="", type=lambda x: restricted_float(x,0,1), help='Set p-value for motif matches (default: 0.0001)', default=0.0001) - optional_arguments.add_argument('--keep_overlaps', action='store_true', help='Keep overlaps of same motifs (default: overlaps are resolved by keeping best-scoring site)') + optional_arguments.add_argument('--keep-overlaps', action='store_true', help='Keep overlaps of same motifs (default: overlaps are resolved by keeping best-scoring site)') - optional_arguments.add_argument('--split', metavar="", type=int, help="Split of multiprocessing jobs (default: 100)", default=100) - optional_arguments.add_argument('--cores', metavar="", type=int, help='Number of cores to use (default: 1)', default=1) - optional_arguments.add_argument('--log', metavar="", help="Path to logfile (default: writes to stdout)") - optional_arguments.add_argument('--debug', action="store_true", help=argparse.SUPPRESS) + RUN = parser.add_argument_group('Run arguments') + RUN.add_argument('--split', metavar="", type=int, help="Split of multiprocessing jobs (default: 100)", default=100) + RUN.add_argument('--cores', metavar="", type=int, help='Number of cores to use (default: 1)', default=1) + RUN.add_argument('--debug', action="store_true", help=argparse.SUPPRESS) + RUN = add_logger_args(optional_arguments) return(parser) @@ -120,8 +124,6 @@ def process_TFBS(infile, args): #----------------------------------------------------------------------------------------------------------# def run_tfbscan(args): - begin_time = datetime.now() - ###### Check input arguments ###### check_required(args, ["motifs", "fasta"]) #Check input arguments check_files([args.motifs, args.fasta, 
args.regions]) #Check if files exist @@ -135,27 +137,24 @@ def run_tfbscan(args): elif args.outdir == None and args.outfile != None: #Joined file check_files([args.outfile], "w") - ###### Create logger and write argument overview ###### - if args.debug: - logger = create_logger(3, args.log) - else: - logger = create_logger(2, args.log) - logger.comment("#TOBIAS TFBScan (run started {0})\n".format(begin_time)) - logger.comment("#Command line call: {0}\n".format(" ".join(sys.argv))) - + ###### Create logger and write argument overview ###### + logger = TobiasLogger("TFBScan", args.verbosity) + logger.begin() parser = add_tfbscan_arguments(argparse.ArgumentParser()) - logger.comment(arguments_overview(parser, args)) + logger.arguments_overview(parser, args) + logger.output_files([args.outfile]) + ######## Read sequences from file and estimate background gc ######## - logger.critical("Handling input files") + logger.info("Handling input files") logger.info("Reading sequences from fasta") fastafile = pysam.FastaFile(args.fasta) fasta_chrom_info = dict(zip(fastafile.references, fastafile.lengths)) fastafile.close() - logger.info("- Found {0} sequences".format(len(fasta_chrom_info))) + logger.stats("- Found {0} sequences in fasta".format(len(fasta_chrom_info))) #Create regions available in fasta logger.info("Setting up regions") @@ -178,6 +177,7 @@ def run_tfbscan(args): if args.gc == None: logger.info("Estimating GC content from fasta (set --gc to skip this step)") args.gc = get_gc_content(regions, args.fasta) + logger.info("- GC content: {0}".format(round(args.gc, 5))) bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0]) @@ -186,18 +186,21 @@ def run_tfbscan(args): #################### Read motifs from file #################### + logger.info("Reading motifs from file") motif_content = open(args.motifs).read() converted_content = convert_motif(motif_content, "pfm") motif_list = pfm_to_motifs(converted_content) #List of OneMotif objects - logger.info("- Found {0} motifs".format(len(motif_list))) + logger.stats("- Found {0} motifs".format(len(motif_list))) logger.debug("Getting motifs ready") motif_list.bg = bg motif_names = [motif.name for motif in motif_list] - motif_list.extend([motif.get_reverse() for motif in motif_list]) + + reverse_motifs = [motif.get_reverse() for motif in motif_list] + motif_list.extend(reverse_motifs) for motif in motif_list: #now with reverse motifs as well motif.set_name(args.naming) motif.name = filafy(motif.name) #remove ()/: etc. 
which will create problems in filenames @@ -206,9 +209,11 @@ def run_tfbscan(args): motif_names = list(set([motif.name for motif in motif_list])) + #Calculate scanning-threshold for each motif pool = mp.Pool(processes=args.cores) outlist = pool.starmap(OneMotif.get_threshold, itertools.product(motif_list, [args.pvalue])) motif_list = MotifList(outlist) + pool.close() pool.join() @@ -216,7 +221,7 @@ def run_tfbscan(args): #################### Find TFBS in regions ##################### logger.comment("") - logger.critical("Scanning for TFBS with all motifs") + logger.info("Scanning for TFBS with all motifs") manager = mp.Manager() @@ -239,6 +244,11 @@ def run_tfbscan(args): qs = {} TF_names_chunks = [motif_names[i::writer_cores] for i in range(writer_cores)] for TF_names_sub in TF_names_chunks: + + #Skip over any empty chunks + if len(TF_names_sub) == 0: + continue + logger.debug("Creating writer queue for {0}".format(TF_names_sub)) if args.outdir != None: @@ -262,16 +272,17 @@ def run_tfbscan(args): input_arguments = [(chunk, args, motif_list) for chunk in region_chunks] task_list = [worker_pool.apply_async(motif_scanning, (chunk, args, motif_list, )) for chunk in region_chunks] monitor_progress(task_list, logger) - results = [task.get() for task in task_list] + results = [task.get() for task in task_list] #1s #Wait for files to write for TF in qs: qs[TF].put((None, None)) - writer_pool.join() + + writer_pool.join() #Process each file output and write out logger.comment("") - logger.critical("Processing results from scanning") + logger.info("Processing results from scanning") logger.debug("Running processing for files: {0}".format(temp_files)) task_list = [worker_pool.apply_async(process_TFBS, (file, args)) for file in temp_files] worker_pool.close() @@ -283,11 +294,7 @@ def run_tfbscan(args): worker_pool.join() writer_pool.join() - end_time = datetime.now() - logger.comment("") - logger.critical("Finished TFBScan run (total time of {0})".format(end_time - begin_time)) - - + logger.end() #----------------------------------------------------------------------------------------------------------# if __name__ == "__main__": diff --git a/tobias/plotting/plot_aggregate.py b/tobias/plotting/plot_aggregate.py index 65a1742..4e24ff1 100644 --- a/tobias/plotting/plot_aggregate.py +++ b/tobias/plotting/plot_aggregate.py @@ -11,19 +11,25 @@ import os import sys import argparse -import pyBigWig import logging import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt from sklearn import preprocessing -import pybedtools as pb + import itertools from datetime import datetime +import scipy +import sklearn +#Bio-stuff +import pyBigWig +import pybedtools as pb + +#Internal classes from tobias.utils.utilities import * from tobias.utils.regions import * -#from footprinting.signals import * +from tobias.utils.logger import * def add_aggregate_arguments(parser): @@ -39,29 +45,26 @@ def add_aggregate_arguments(parser): IO.add_argument('--TFBS', metavar="", nargs="*", help="TFBS sites (*required)") IO.add_argument('--signals', metavar="", nargs="*", help="Signals in bigwig format (*required)") IO.add_argument('--regions', metavar="", nargs="*", help="Regions to overlap with TFBS", default=[]) - IO.add_argument('--whitelist', metavar="", help="Only plot sites within whitelist (.bed)") - IO.add_argument('--blacklist', metavar="", help="Exclude sites within blacklist (.bed)") + IO.add_argument('--whitelist', metavar="", nargs="*", help="Only plot sites within whitelist (.bed)", default=[]) + 
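The --whitelist/--blacklist filtering introduced for PlotAggregate (applied further down in run_aggregate) reduces to two bedtools intersections: keep sites inside the whitelist (-u) and drop sites touching the blacklist (-v). A minimal pybedtools sketch with placeholder file names:

import pybedtools as pb

sites = pb.BedTool("CTCF_sites.bed")            # placeholder TFBS file
whitelist = pb.BedTool("open_chromatin.bed")    # placeholder whitelist
blacklist = pb.BedTool("excluded_regions.bed")  # placeholder blacklist

# Keep only sites overlapping the whitelist (bedtools intersect -u)
sites = sites.intersect(whitelist, u=True)

# Remove sites overlapping the blacklist (bedtools intersect -v)
sites = sites.intersect(blacklist, v=True)

print("{0} sites left after filtering".format(len(sites)))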
IO.add_argument('--blacklist', metavar="", nargs="*", help="Exclude sites within blacklist (.bed)", default=[]) IO.add_argument('--output', metavar="", help="Path to output (default: TOBIAS_aggregate.pdf)", default="TOBIAS_aggregate.pdf") - OPT = parser.add_argument_group('Plot options') - OPT.add_argument('--negate', action='store_true', help="Negate overlap with regions") - OPT.add_argument('--title', metavar="", help="Title of plot", default="Aggregated signals") - OPT.add_argument('--width', metavar="", help="", type=int, default=150) - OPT.add_argument('--region_names', metavar="", nargs="*") - OPT.add_argument('--signal_names', metavar="", nargs="*") - OPT.add_argument('--share_y', metavar="", help="Share y-axis range across plots (none/rows/cols/both) (default: none)", choices=["none", "rows", "cols", "both"], default="none") - #OPT.add_argument('--normalize') - - #OPT.add_argument('--norm_signals') - OPT.add_argument('--log_transform', help="", action="store_true") - OPT.add_argument('--norm_regions', help="Normalize aggregate across regions", action='store_true') - OPT.add_argument('--norm_signals', help="Normalize aggregate across signals", action='store_true') - OPT.add_argument('--plot_boundaries', help="Plot TFBS boundaries", action='store_true') - #todo:negate regions - - #RUN = parser.add_argument_group("Run options") - #RUN.add_argument('-l', '--log', metavar="", help="Full path of logfile (default: logfile is written to stdout)") - #RUN.add_argument('--silent', help="Prevents info printing to stdout", action='store_true') + PLOT = parser.add_argument_group('Plot arguments') + PLOT.add_argument('--title', metavar="", help="Title of plot (default: \"Aggregated signals\")", default="Aggregated signals") + PLOT.add_argument('--flank', metavar="", help="Flanking basepairs (+/-) to show in plot (counted from middle of motif) (default: 60)", default="60", type=int) + PLOT.add_argument('--TFBS_labels', metavar="", help="Labels used for each TFBS file (default: prefix of each --TFBS)", nargs="*") + PLOT.add_argument('--signal_labels', metavar="", help="Labels used for each signal file (default: prefix of each --signals)", nargs="*") + PLOT.add_argument('--region_labels', metavar="", help="Labels used for each regions file (default: prefix of each --regions)", nargs="*") + PLOT.add_argument('--share_y', metavar="", help="Share y-axis range across plots (none/signals/sites/both). Use \"--share_y signals\" if bigwig signals have similar ranges. Use \"--share_y sites\" if sites per bigwig are comparable, but bigwigs themselves aren't comparable. 
(default: none)", choices=["none", "signals", "sites", "both"], default="none") + + #signals / regions + PLOT.add_argument('--norm_comparisons', action='store_true', help="Normalize the aggregate signal in comparison column/row to the same range (default: the true range is shown)") + PLOT.add_argument('--negate', action='store_true', help="Negate overlap with regions") + PLOT.add_argument('--log_transform', help="", action="store_true") + PLOT.add_argument('--plot_boundaries', help="Plot TFBS boundaries", action='store_true') + + RUN = parser.add_argument_group("Run arguments") + RUN = add_logger_args(RUN) return(parser) @@ -69,7 +72,16 @@ def add_aggregate_arguments(parser): def run_aggregate(args): """ Function to make aggregate plot given input from args """ - begin_time = datetime.now() + ######################################################################################### + ############################## Setup logger/input/output ################################ + ######################################################################################### + + logger = TobiasLogger("PlotAggregate", args.verbosity) + logger.begin() + + parser = add_aggregate_arguments(argparse.ArgumentParser()) + logger.arguments_overview(parser, args) + logger.output_files([args.output]) #Check input parameters check_required(args, ["TFBS", "signals"]) @@ -77,45 +89,31 @@ def run_aggregate(args): check_files([args.output], action="w") #### Test input #### - if args.region_names != None and (len(args.regions) != len(args.region_names)): - print("ERROR: --regions and --region_names have different lengths") - #print(len(args.regions)) + if args.TFBS_labels != None and (len(args.TFBS) != len(args.TFBS_labels)): + logger.error("ERROR --TFBS and --TFBS_labels have different lengths ({0} vs. {1})".format(len(args.TFBS), len(args.TFBS_labels))) + if args.region_labels != None and (len(args.regions) != len(args.region_labels)): + logger.error("ERROR: --regions and --region_labels have different lengths ({0} vs. {1})".format(len(args.regions), len(args.region_labels))) sys.exit() - if args.signal_names != None and (len(args.signals) != len(args.signal_names)): - print("ERROR: --signals and --signal_names have different lengths.") + if args.signal_labels != None and (len(args.signals) != len(args.signal_labels)): + logger.error("ERROR: --signals and --signal_labels have different lengths ({0} vs. 
{1})".format(len(args.signals), len(args.signal_labels))) sys.exit() - #### Format input #### - args.TFBS = [os.path.abspath(f) for f in args.TFBS] - args.regions = [os.path.abspath(f) for f in args.regions] - args.signals = [os.path.abspath(f) for f in args.signals] - args.TFBS_names = [os.path.splitext(os.path.basename(f))[0] for f in args.TFBS] - args.region_names = [os.path.splitext(os.path.basename(f))[0] for f in args.regions] if args.region_names == None else args.region_names - args.signal_names = [os.path.splitext(os.path.basename(f))[0] for f in args.signals] if args.signal_names == None else args.signal_names - args.output = os.path.abspath(args.output) - + #args.TFBS = [os.path.abspath(f) for f in args.TFBS] + #args.regions = [os.path.abspath(f) for f in args.regions] + #args.signals = [os.path.abspath(f) for f in args.signals] + args.TFBS_labels = [os.path.splitext(os.path.basename(f))[0] for f in args.TFBS] if args.TFBS_labels == None else args.TFBS_labels + args.region_labels = [os.path.splitext(os.path.basename(f))[0] for f in args.regions] if args.region_labels == None else args.region_labels + args.signal_labels = [os.path.splitext(os.path.basename(f))[0] for f in args.signals] if args.signal_labels == None else args.signal_labels + #args.output = os.path.abspath(args.output) - ######################################################################################### - ##################################### Logger info ####################################### - ######################################################################################### - - # Create logger - logger = create_logger() - - #Print info on run - logger.comment("#TOBIAS PlotAggregate (run started {0})\n".format(begin_time)) - logger.comment("#Command line call: {0}\n".format(" ".join(sys.argv))) - - parser = add_aggregate_arguments(argparse.ArgumentParser()) - logger.comment(arguments_overview(parser, args)) - ######################################################################################### ############################ Get input regions/signals ready ############################ ######################################################################################### - logger.info("Reading information from input files") + logger.info("---- Processing input ----") + logger.info("Reading information from .bed-files") #Make combinations of TFBS / regions column_names = [] @@ -124,7 +122,7 @@ def run_aggregate(args): logger.info("Overlapping sites to --regions") regions_dict = {} - combis = itertools.product(range(len(args.TFBS)),range(len(args.regions))) + combis = itertools.product(range(len(args.TFBS)), range(len(args.regions))) for (i,j) in combis: TFBS_f = args.TFBS[i] region_f = args.regions[j] @@ -132,46 +130,54 @@ def run_aggregate(args): #Make overlap pb_tfbs = pb.BedTool(TFBS_f) pb_region = pb.BedTool(region_f) + #todo: write out lengths + overlap = pb_tfbs.intersect(pb_region, u=True) + #todo: length after overlap - name = args.TFBS_names[i] + " " + args.region_names[j] + name = args.TFBS_labels[i] + " " + args.region_labels[j] #name for column column_names.append(name) regions_dict[name] = RegionList().from_bed(overlap.fn) if args.negate == True: overlap_neg = pb_tfbs.intersect(pb_region, v=True) - name = args.TFBS_names[i] + " " + args.region_names[j] + name = args.TFBS_labels[i] + " " + args.region_labels[j] column_names.append(name) regions_dict[name] = RegionList().from_bed(overlap_neg.fn) else: - column_names = args.TFBS_names - regions_dict = {args.TFBS_names[i]: 
RegionList().from_bed(args.TFBS[i]) for i in range(len(args.TFBS))} + column_names = args.TFBS_labels + regions_dict = {args.TFBS_labels[i]: RegionList().from_bed(args.TFBS[i]) for i in range(len(args.TFBS))} + + for name in regions_dict: + logger.stats("COUNT {0}: {1} sites".format(name, len(regions_dict[name]))) #length of RegionList obj #-------- Do overlap of regions if whitelist / blacklist -------# - if args.whitelist != None or args.blacklist != None: - logger.info("Subsetting regions...") + if len(args.whitelist) > 0 or len(args.blacklist) > 0: + logger.info("Subsetting regions on whitelist/blacklist") for regions_id in regions_dict: sites = pb.BedTool(regions_dict[regions_id].as_bed(), from_string=True) - logger.info("Found {0} sites in {1}".format(len(regions_dict[regions_id]), regions_id)) + logger.stats("Found {0} sites in {1}".format(len(regions_dict[regions_id]), regions_id)) - if args.whitelist != None: - whitelist = pb.BedTool(args.whitelist) - sites_tmp = sites.intersect(whitelist, u = True) - sites = sites_tmp - logger.info("Overlapped to whitelist -> {0}".format(len(sites))) - - if args.blacklist != None: - blacklist = pb.BedTool(args.blacklist) - sites_tmp = sites.intersect(blacklist, v = True) - sites = sites_tmp - logger.info("Removed blacklist -> {0}".format(format(len(sites)))) + if len(args.whitelist) > 0: + for whitelist_f in args.whitelist: + whitelist = pb.BedTool(whitelist_f) + sites_tmp = sites.intersect(whitelist, u = True) + sites = sites_tmp + logger.stats("Overlapped to whitelist -> {0}".format(len(sites))) + + if len(args.blacklist) > 0: + for blacklist_f in args.blacklist: + blacklist = pb.BedTool(blacklist_f) + sites_tmp = sites.intersect(blacklist, v = True) + sites = sites_tmp + logger.stats("Removed blacklist -> {0}".format(format(len(sites)))) regions_dict[regions_id] = RegionList().from_bed(sites.fn) - # Estimate motif width + # Estimate motif width per region site_list = regions_dict[list(regions_dict.keys())[0]] if len(site_list) > 0: motif_width = site_list[0].get_width() @@ -179,43 +185,122 @@ def run_aggregate(args): motif_width = 0 # Set width (centered on mid) + args.width = args.flank*2 for regions_id in regions_dict: regions_dict[regions_id].apply_method(OneRegion.set_width, args.width) - # Print number of sites - for regions_id in regions_dict: - logger.info("{0}: {1} sites".format(regions_id, len(regions_dict[regions_id]))) - - ######################################################################################### ############################ Read signal for bigwig per site ############################ ######################################################################################### + logger.info("Reading signal from bigwigs") + signal_dict = {} for i, signal_f in enumerate(args.signals): - signal_name = args.signal_names[i] + signal_name = args.signal_labels[i] signal_dict[signal_name] = {} #Open pybw to read signal pybw = pyBigWig.open(signal_f, "rb") - logger.info("Reading signal from {0}".format(signal_name)) + logger.info("- Reading signal from {0}".format(signal_name)) for regions_id in regions_dict: for one_region in regions_dict[regions_id]: tup = one_region.tup() #(chr, start, end, strand) - if tup not in signal_dict[signal_name]: + if tup not in signal_dict[signal_name]: #only get signal if it was not already read previously signal_dict[signal_name][tup] = one_region.get_signal(pybw) #returns signal pybw.close() + ######################################################################################### + 
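The signal extraction above caches one vector per unique site tuple; conceptually it is pyBigWig.values() over a fixed window around each motif midpoint. A small sketch with made-up coordinates follows (the real code goes through OneRegion.get_signal and respects strand):

import numpy as np
import pyBigWig

bw = pyBigWig.open("corrected.bw")   # placeholder bigwig
flank = 60                           # matches the --flank default

# Made-up motif midpoints: (chrom, mid)
sites = [("chr1", 1000050), ("chr1", 2500300), ("chr2", 750900)]

rows = []
for chrom, mid in sites:
    # values() returns one float per basepair, nan where there is no coverage
    rows.append(bw.values(chrom, mid - flank, mid + flank))
bw.close()

signalmat = np.array(rows)                  # sites x (2*flank) positions
aggregate = np.nanmean(signalmat, axis=0)   # mean profile across sites
print(signalmat.shape, aggregate.shape)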
################################## Calculate aggregates ################################# + ######################################################################################### + + logger.comment("") + logger.info("---- Analysis ----") + + #Calculate aggregate per signal/region comparison + logger.info("Calculating aggregate signals") + aggregate_dict = {signal_name:{region_name: [] for region_name in regions_dict} for signal_name in args.signal_labels} + for row, signal_name in enumerate(args.signal_labels): + for col, region_name in enumerate(column_names): + + signalmat = np.array([signal_dict[signal_name][reg.tup()] for reg in regions_dict[region_name]]) + + #todo: Option to remove rows with lowest/highest signal (--percentiles?) + #signal_min, signal_max = np.percentile(signalmat, [1,99]) + #signalmat[signalmat < signal_min] = signal_min + #signalmat[signalmat > signal_max] = signal_max + + if args.log_transform: + signalmat_abs = np.abs(signalmat) + signalmat_log = np.log2(signalmat_abs + 1) + signalmat_log[signalmat < 0] *= -1 #original negatives back to <0 + signalmat = signalmat_log + + aggregate = np.nanmean(signalmat, axis=0) + aggregate_dict[signal_name][region_name] = aggregate + + #Measure of footprint depth in comparison to baseline + logger.info("Calculating footprint depth measure") + logger.info("FPD (signal,regions): footprint_width baseline middle FPD") + for row, signal_name in enumerate(args.signal_labels): + for col, region_name in enumerate(column_names): + + agg = aggregate_dict[signal_name][region_name] + + #Estimation of possible footprint width + FPD_results = [] + for fp_flank in range(int(motif_width/2), min([25, args.flank])): #motif width for this bed + + #Baseline level + baseline_indices = list(range(0,args.flank-fp_flank)) + list(range(args.flank+fp_flank, len(agg))) + baseline = np.median(agg[baseline_indices]) + + #Footprint level + middle_indices = list(range(args.flank-fp_flank, args.flank+fp_flank)) + middle = np.mean(agg[middle_indices]) #within the motif + + #Footprint depth + depth = middle - baseline + FPD_results.append([fp_flank*2, baseline, middle, depth]) + + #Estimation of possible footprint width + all_fpds = [result[-1] for result in FPD_results] + FPD_results_best = FPD_results #[result + [" "] if result[-1] != min(all_fpds) else result + ["*"] for result in FPD_results] + + for result in FPD_results_best: + logger.stats("FPD ({0},{1}): {2} {3:.3f} {4:.3f} {5:.3f}".format(signal_name, region_name, result[0], result[1], result[2], result[3])) + + #Compare pairwise to calculate chance of footprint + logger.comment("") + logger.info("Calculating pairwise aggregate pearson correlation") + logger.info("CORRELATION (signal1,region1) VS (signal2,region2): PEARSONR") + plots = itertools.product(args.signal_labels, column_names) + combis = itertools.combinations(plots, 2) + + for ax1, ax2 in combis: + + signal1, region1 = ax1 + signal2, region2 = ax2 + agg1 = aggregate_dict[signal1][region1] + agg2 = aggregate_dict[signal2][region2] + + pearsonr, pval = scipy.stats.pearsonr(agg1, agg2) + + logger.stats("CORRELATION ({0},{1}) VS ({2},{3}): {4:.5f}".format(signal1, region1, signal2, region2, pearsonr)) + + ######################################################################################### ################################ Set up plotting grid ################################### ######################################################################################### + logger.comment("") + logger.info("---- Plotting aggregates ----") 
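The FPD lines logged above score candidate footprint widths by comparing the mean aggregate signal inside the window against the median of the flanking baseline. A self-contained numpy sketch on a synthetic profile (flank and motif width are placeholder values):

import numpy as np

flank = 60        # basepairs shown on each side of the motif midpoint
motif_width = 10  # placeholder motif width

# Synthetic aggregate profile: flat baseline with a dip around the motif centre
agg = np.ones(2 * flank)
agg[flank - 8: flank + 8] = 0.4

for fp_flank in range(motif_width // 2, 25):
    # Baseline: median signal outside the candidate footprint window
    baseline_idx = list(range(0, flank - fp_flank)) + list(range(flank + fp_flank, len(agg)))
    baseline = np.median(agg[baseline_idx])

    # Footprint level: mean signal inside the window
    middle = np.mean(agg[flank - fp_flank: flank + fp_flank])

    depth = middle - baseline   # more negative = deeper footprint
    print("width={0}\tbaseline={1:.3f}\tmiddle={2:.3f}\tFPD={3:.3f}".format(
        2 * fp_flank, baseline, middle, depth))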
logger.info("Setting up plotting grid") no_rows = len(args.signals) + 1 if len(args.signals) > 1 else len(args.signals) @@ -231,7 +316,7 @@ def run_aggregate(args): #Title of plot and grid plt.suptitle(args.title, fontsize=16) - row_names = args.signal_names + ["Comparison"] if row_compare else args.signal_names + row_names = args.signal_labels + ["Comparison"] if row_compare else args.signal_labels col_names = column_names + ["Comparison"] if col_compare else column_names for col in range(no_cols): @@ -274,11 +359,12 @@ def run_aggregate(args): plt.subplots_adjust(hspace=0.3, top=0.93) + ######################################################################################### ############################## Fill in with bigwig scores ############################### ######################################################################################### - for row, signal_name in enumerate(args.signal_names): + for row, signal_name in enumerate(args.signal_labels): for col, region_name in enumerate(column_names): logger.info("Plotting regions {0} from signal {1}".format(region_name, signal_name)) @@ -287,29 +373,21 @@ def run_aggregate(args): if len(regions_dict[region_name]) > 0: #Signal in region - signalmat = np.array([signal_dict[signal_name][reg.tup()] for reg in regions_dict[region_name]]) - - if args.log_transform: - signalmat_abs = np.abs(signalmat) - signalmat_log = np.log2(signalmat_abs + 1) - signalmat_log[signalmat < 0] *= -1 #original negatives back to <0 - signalmat = signalmat_log - - aggregate = np.nanmean(signalmat, axis=0) + aggregate = aggregate_dict[signal_name][region_name] axarr[row, col].plot(xvals, aggregate, color=colors[col+row], linewidth=1, label=signal_name) aggregate_norm = preprocessing.minmax_scale(aggregate) #Compare across rows and cols if col_compare: #compare between different regions - if args.norm_regions: + if args.norm_comparisons: aggregate_compare = aggregate_norm else: aggregate_compare = aggregate axarr[row, -1].plot(xvals, aggregate_compare, color=colors[row+col], linewidth=1, alpha=0.8, label=region_name) if row_compare: #compare between different bigwigs - if args.norm_signals: + if args.norm_comparisons: aggregate_compare = aggregate_norm else: aggregate_compare = aggregate @@ -324,7 +402,6 @@ def run_aggregate(args): axarr[row, col].text(0.98,0.98,str(len(regions_dict[region_name])), transform = axarr[row,col].transAxes, fontsize=12, va="top", ha="right") - #------------- Finishing up plots ---------------# logger.info("Adjusting final details") @@ -336,7 +413,8 @@ def run_aggregate(args): if args.share_y == "none": pass #Do not set ylim for plots - elif args.share_y == "cols": + #Signals are comparable (for example normalized signal between two conditions) + elif args.share_y == "signals": for col in range(no_cols): lims = np.array([ax.get_ylim() for ax in axarr[:,col] if ax is not None]) ymin, ymax = np.min(lims), np.max(lims) @@ -345,7 +423,8 @@ def run_aggregate(args): if axarr[row, col] is not None: axarr[row, col].set_ylim(ymin, ymax) - elif args.share_y == "rows": + #Regions are comparable (for example bound/unbound) + elif args.share_y == "sites": for row in range(no_rows): lims = np.array([ax.get_ylim() for ax in axarr[row,:] if ax is not None]) ymin, ymax = np.min(lims), np.max(lims) @@ -353,6 +432,7 @@ def run_aggregate(args): if axarr[row, col] is not None: axarr[row, col].set_ylim(ymin, ymax) + #Comparable on both rows/columns elif args.share_y == "both": global_ymin, global_ymax = np.inf, -np.inf for row in range(no_rows): @@ 
-371,8 +451,7 @@ def run_aggregate(args): plt.savefig(args.output, bbox_inches='tight') plt.close() - logger.info("Finished PlotAggregate! Output file is: {0}".format(os.path.abspath(args.output))) - + logger.end() #--------------------------------------------------------------------------------------------------------# if __name__ == '__main__': diff --git a/tobias/plotting/plot_bindetect.py b/tobias/plotting/plot_bindetect.py deleted file mode 100644 index 78ce347..0000000 --- a/tobias/plotting/plot_bindetect.py +++ /dev/null @@ -1,358 +0,0 @@ -#!/usr/bin/env python - -import os -import sys -import argparse -import numpy as np -import re - -#Clustering -from sklearn.cluster import DBSCAN -import sklearn.preprocessing as preprocessing -from scipy.cluster.hierarchy import dendrogram, linkage, fcluster -from scipy.spatial.distance import squareform - -#Plotting -import matplotlib -from matplotlib import pyplot as plt -import matplotlib.gridspec as gridspec -from matplotlib.backends.backend_pdf import PdfPages -from cycler import cycler -import pandas as pd -from matplotlib.lines import Line2D -from adjustText import adjust_text - -#Internal functions and classes -from tobias.utils.utilities import * - -import warnings - -#---------------------------------------------------------------------------------------------------# -def add_diffplot_arguments(parser): - - parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90) - description = "" - parser.description = format_help_description("PlotBINDetect", description) - - parser._action_groups.pop() #pop -h - - args = parser.add_argument_group('Arguments') - args.add_argument('-m', '--matrix', metavar="", help="Distance matrix from TF clustering") - args.add_argument('-b', '--bindetect', metavar="", help="Differential binding scores from bindetect_results.txt file") - args.add_argument('-o', '--output', metavar="", help="(default: diff_bind.pdf)", default="diff_bind.pdf") - - args.add_argument('--cluster_threshold', metavar="", help="Clustering threshold (default: 0.5)", type=float, default=0.5) - args.add_argument('--change_threshold', metavar="", help="Set change threshold (default: None)", type=float, default=None) - args.add_argument('--pvalue_threshold', metavar="", help="Set pvalue threshold (default: set at 5%% percentile)", type=float, default=None) - - return(parser) - - -class Tcluster(): # - - def __init__(self): - - self.similarity_mat = None - self.IDS = None - - #Created by clustering - - def cluster(threshold=0.5): - - self.dist_mat = squareform(similarity_mat) - self.linkage_mat = linkage(self.dist_mat, "average") - self.labels = fcluster(linkage_mat, threshold, criterion="distance") - - - -def cluster_matrix(similarity_mat): - - dist_mat = squareform(similarity_mat) - linkage_mat = linkage(dist_mat, "average") - labels = fcluster(linkage_mat, 0.5, criterion="distance") - - return(linkage_mat, labels, clusters) - -#---------------------------------------------------------------------------------------------------# -def plot_bindetect(IDS, similarity_mat, changes, pvalues, conditions, change_threshold=None, pvalue_threshold=None): - """ Conditions refer to the order of the fold_change divison, meaning condition1/condition2 """ - warnings.filterwarnings("ignore") - - comparison = conditions - IDS = np.array(IDS) - no_IDS = len(IDS) - - names = np.array([name.split("_")[0] for name in IDS]) #without id - - dist_mat = squareform(similarity_mat) - linkage_mat = linkage(dist_mat, 
"average") - labels = fcluster(linkage_mat, 0.5, criterion="distance") - - diff_scores = {IDS[i]:{"change": changes[i], "pvalue": pvalues[i]} for i in range(len(IDS))} - pvalues = np.array(pvalues) - xvalues = np.array(changes) - - #Find min pvalue that is not zero -> set 0 values to this level - minval = np.min(pvalues[np.nonzero(pvalues)]) - pvalues[pvalues == 0] = minval - yvalues = -np.log10(pvalues) - - if pvalue_threshold is None: - all_but_null = pvalues[pvalues != minval] - if len(all_but_null) > 0: - pvalue_threshold = np.percentile(all_but_null, 5) - else: - pvalue_threshold = 0 - - if change_threshold is None: - all_but_null = np.abs(xvalues[xvalues != 0]) - if len(all_but_null) > 0: - change_threshold = np.percentile(all_but_null, 95) - else: - change_threshold = 0 - - #>>>>>>>>>>> Clustering - #Find clusters below threshold - clusters = dict(zip(range(no_IDS), [[num] for num in range(no_IDS)])) - - threshold = 0.5 - for i, row in enumerate(linkage_mat): - ID1 = int(row[0]) - ID2 = int(row[1]) - new = no_IDS + i - dist = row[2] - - if dist <= threshold: - clusters[new] = clusters[ID1] + clusters[ID2] + [new] - del clusters[ID1] - del clusters[ID2] - - colorlist = ["blue", "green", "red", "orange"] - node_color = ["black"] * (2*no_IDS-1) - i = 0 - for cluster in sorted(list(clusters.keys())): - if len(clusters[cluster]) > 1: - color = colorlist[i] - for node in clusters[cluster]: - node_color[node] = color - i += 1 - - if i == len(colorlist): - i = 0 - - #<<<<<<<<<<< Clustering - - #--------------------------------------- Figure -------------------------------- # - #Make figure - no_rows, no_cols = 2,2 - h_ratios = [1,max(1,no_IDS/25)] - figsize = (8,10+7*(no_IDS/25)) - - fig = plt.figure(figsize = figsize) - gs = gridspec.GridSpec(no_rows, no_cols, height_ratios=h_ratios) - gs.update(hspace=0.0001, bottom=0.00001, top=0.999999) - - ax1 = fig.add_subplot(gs[0,:]) #volcano - ax2 = fig.add_subplot(gs[1,0]) #long scatter overview - ax3 = fig.add_subplot(gs[1,1]) #dendrogram - - ######### Volcano plot on top of differential values ######## - - ax1.set_title("BINDetect volcano plot", fontsize=16, pad=20) - - row, col = 0,0 - labels = IDS - ax1.scatter(xvalues, yvalues, color="black", s=5) - - #Add +/- 10% to make room for labels - ylim = ax1.get_ylim() - y_extra = (ylim[1] - ylim[0]) * 0.1 - ax1.set_ylim(ylim[0], ylim[1] + y_extra) - - xlim = ax1.get_xlim() - x_extra = (xlim[1] - xlim[0]) * 0.1 - lim = np.max([np.abs(xlim[0]-x_extra), np.abs(xlim[1]+x_extra)]) - ax1.set_xlim(-lim, lim) - - x0,x1 = ax1.get_xlim() - y0,y1 = ax1.get_ylim() - ax1.set_aspect((x1-x0)/(y1-y0)) #square volcano plot - - #Plot in pvalue threshold as dotted line - if pvalue_threshold is not 0: - threshold = -np.log10(pvalue_threshold) - ax1.axhline(y=threshold, color="grey", linestyle="--", linewidth=0.5) - ax1.annotate(" {0:.2E}".format(pvalue_threshold), (lim, threshold), horizontalalignment='left',verticalalignment='center', color="grey") - - if change_threshold is not 0: - ax1.axvline(-abs(change_threshold), color="grey", linestyle="--", linewidth=0.5) - ax1.axvline(abs(change_threshold), color="grey", linestyle="--", linewidth=0.5) #Plot thresholds - - #Decorate plot - ax1.set_xlabel("Differential binding score") - ax1.set_ylabel("-log10(pvalue)") - - - ########### Dendrogram over similarities of TFs ####### - #row, col = 1,1 - dendro_dat = dendrogram(linkage_mat, labels=IDS, no_labels=True, orientation="right", ax=ax3, above_threshold_color="black", link_color_func=lambda k: node_color[k]) - labels = 
dendro_dat["ivl"] - ax3.set_xlabel("Transcription factor similarities\n(Clusters below threshold are colored)") - - ax3.set_ylabel("Transcription factor clustering based on TFBS overlap", rotation=270, labelpad=20) - ax3.yaxis.set_label_position("right") - - #set aspect - x0,x1 = ax3.get_xlim() - y0,y1 = ax3.get_ylim() - - ax3.set_aspect(((x1-x0)/(y1-y0)) * no_IDS/10) #square volcano plot - - ########## Differential binding scores per TF ########## - #row, col = 1,0 - ax2.set_xlabel("Differential binding score\n" + "(" + comparison[1] + r' $\leftarrow$' + r'$\rightarrow$ ' + comparison[0] + ")") #First position in comparison equals numerator in log2fc division - ax2.xaxis.set_label_position('bottom') - ax2.xaxis.set_ticks_position('bottom') - - no_labels = len(labels) - ax2.set_ylim(0.5, no_labels+0.5) - ax2.set_ylabel("Transcription factors") - - ax2.set_yticks(range(1,no_labels+1)) - ax2.set_yticklabels(labels) - ax2.axvline(0) #Plot line at mean - - #Plot scores per TF - colors = [] - for y, TF in enumerate(labels): - - idx = np.where(IDS == TF)[0][0] - score = diff_scores[TF]["change"] - pvalue = diff_scores[TF]["pvalue"] - - #Set size based on pvalue - if pvalue <= pvalue_threshold or score < -change_threshold or score > change_threshold: - fill = "full" - else: - fill = "none" - - ax2.axhline(y+1, color="grey", linewidth=1) - ax2.plot(score, y+1, marker='o', color=node_color[idx], fillstyle=fill) - ax2.yaxis.get_ticklabels()[y].set_color(node_color[idx]) - - #Set x-axis ranges - lim = np.max(np.abs(ax2.get_xlim())) - ax2.set_xlim((-lim, lim)) #center on 0 - - #set aspect - x0,x1 = ax2.get_xlim() - y0,y1 = ax2.get_ylim() - ax2.set_aspect(((x1-x0)/(y1-y0)) * no_IDS/10) #square volcano plot - - plt.tight_layout() #tight layout before setting ids in volcano plot - - # Color points and set labels in volcano - txts = [] - - #points with -fc -> blue - included = np.logical_and(xvalues < 0, np.logical_or(xvalues <= -change_threshold, pvalues <= pvalue_threshold)) - chosen_x, chosen_y, chosen_l = xvalues[included], yvalues[included], names[included] - ax1.scatter(chosen_x, chosen_y, color="blue", s=4.5) - for i, txt in enumerate(chosen_l): - txts.append(ax1.text(chosen_x[i], chosen_y[i], txt, fontsize=7)) - - #points with +fc -> red - included = np.logical_and(xvalues > 0, np.logical_or(xvalues >= change_threshold, pvalues <= pvalue_threshold)) - chosen_x, chosen_y, chosen_l = xvalues[included], yvalues[included], names[included] - ax1.scatter(chosen_x, chosen_y, color="red", s=4.5) - for i, txt in enumerate(chosen_l): - txts.append(ax1.text(chosen_x[i], chosen_y[i], txt, fontsize=7)) - - adjust_text(txts, ax=ax1, text_from_points=True, arrowprops=dict(arrowstyle='-', color='black', lw=0.5)) #, expand_text=(0.1,1.2), expand_objects=(0.1,0.1)) #, arrowprops=dict(arrowstyle='-', color='black', lw=0.5)) - - #Plot custom legend for colors - legend_elements = [Line2D([0],[0], marker='o', color='w', markerfacecolor="red", label="More bound in {0}".format(conditions[0])), - Line2D([0],[0], marker='o', color='w', markerfacecolor="blue", label="More bound in {0}".format(conditions[1]))] - ax1.legend(handles=legend_elements, bbox_to_anchor=(1.05, 1), loc='upper left') - - return(fig) - - -################################################################################################# -################################################################################################# -################################################################################################# - -def 
run_diffplot(args): - - #Test input - check_required(args, ["matrix", "bindetect"]) - check_files([args.matrix, args.bindetect], "r") - check_files([args.output], "w") - - #------------------------ Read bindetect scores -------------------------# - - #Read table, get number of comparison plots to make - table = pd.read_csv(args.bindetect, sep="\t") - header = list(table) - - comparisons = [col.replace("_change", "") for col in header if col.endswith("change")] #list of "cond1_cond2" pairs - - diff_scores = {} - for comparison in comparisons: - diff_scores[comparison] = {} - for index, row in table.iterrows(): - TF = row["TF_name"] - diff_scores[comparison][TF] = {"change": row[comparison + "_change"], "pvalue": row[comparison + "_pvalue"]} - - #-------------------- Read similarity mat from file ---------------------# - - #read matrix w/header - IDS = open(args.matrix, "r").readline().rstrip().split("\t") - distance_mat = np.loadtxt(args.matrix, skiprows=1) - - #Check dimensions of mat - no_rows, no_columns = distance_mat.shape - if no_columns != no_rows: - print("ERROR: Wrong matrix format (({0},{1}))".format(no_rows, no_columns)) - sys.exit() - - #Check column ids - #column_ids = IDS - no_IDS = len(IDS) - - #Convert mat to numpy - #distance_mat = np.array([[float(element) for element in columns[1:]] for columns in mat[1:]]) - - #---------------------- Subset similarity mat if needed ------------------# - # needed when bindetect file has been subset to only some rows - - #Columns/rows to keep - comparison = list(diff_scores.keys())[0] - diff_ids = list(diff_scores[comparison].keys()) - - to_keep = [i for i in range(no_IDS) if IDS[i] in diff_ids] - - IDS = [IDS[i] for i in to_keep] - no_IDS = len(IDS) - distance_mat = distance_mat[np.array(to_keep),:][:,np.array(to_keep)] #select rows, then columns - - - #----------------------------- Make plots -------------------------------# - - #Set up multipage pdf - figure_pdf = PdfPages(args.output, keep_empty=True) - - #IDS / similarity same across all - for comparison in comparisons: - - changes = [diff_scores[comparison][TF]["change"] for TF in IDS] - pvalues = [diff_scores[comparison][TF]["pvalue"] for TF in IDS] - conditions = comparison.split("_") - print(conditions) - - fig = plot_bindetect(IDS, distance_mat, changes, pvalues, conditions, change_threshold=args.change_threshold, pvalue_threshold=args.pvalue_threshold) - figure_pdf.savefig(fig, bbox_inches='tight') - - figure_pdf.close() - - print("Plot written to: {0}".format(args.output)) \ No newline at end of file diff --git a/tobias/plotting/plot_changes.py b/tobias/plotting/plot_changes.py index 24fddc1..e763a5d 100644 --- a/tobias/plotting/plot_changes.py +++ b/tobias/plotting/plot_changes.py @@ -23,7 +23,7 @@ def add_plotchanges_arguments(parser): parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=35, width=90) - description = "" + description = "PlotChanges is a utility to plot the changes in TF binding across multiple conditions as predicted by TOBIAS BINdetect.\n\n" description += "Example usage:\n$ echo CTCF GATA > TFS.txt\n$ TOBIAS PlotChanges --bindetect --TFS TFS.txt\n\n" parser.description = format_help_description("PlotChanges", description) @@ -32,12 +32,12 @@ def add_plotchanges_arguments(parser): required_arguments = parser.add_argument_group('Required arguments') required_arguments.add_argument('--bindetect', metavar="", help='Bindetect_results.txt file from BINDetect run') - required_arguments.add_argument('--TFS', metavar="", 
help='Text file containing names of TFs to show in plot (one per line)') # whole genome file or regions of interest in FASTA format to be scanned with motifs') + required_arguments.add_argument('--TFS', metavar="", help='Text file containing names of TFs to show in plot (one per line)') #All other arguments are optional optional_arguments = parser.add_argument_group('Optional arguments') optional_arguments.add_argument('--output', metavar="", help='Output file for plot (default: bindetect_changes.pdf)', default="bindetect_changes.pdf") - optional_arguments.add_argument('--conditions', metavar="", help="Ordered list of conditions to show (default: conditions found in bindetect file)", nargs="*") + optional_arguments.add_argument('--conditions', metavar="", help="Ordered list of conditions to show (default: conditions are ordered as within the bindetect file)", nargs="*") return(parser) @@ -46,13 +46,12 @@ def add_plotchanges_arguments(parser): def run_plotchanges(args): #------------------------------------ Get ready ------------------------------------# - logger = create_logger(2, None) + logger = TobiasLogger("PlotChanges", args.verbosity) check_required(args, ["bindetect", "TFS"]) check_files([args.bindetect, args.TFS], "r") check_files([args.output], "w") - #------------------------------------ Read data ------------------------------------# logger.info("Reading data from bindetect file") @@ -78,7 +77,6 @@ def run_plotchanges(args): chosen_TFS = list(flatten_list(matches)) logger.info("Chosen TFS to view in plot: {0}".format(chosen_TFS)) - # Get order of conditions header = list(table.columns.values) conditions_file = [element.replace("_bound", "") for element in header if "bound" in element] @@ -99,7 +97,6 @@ def run_plotchanges(args): cmap = matplotlib.cm.get_cmap('rainbow') colors = cmap(np.linspace(0,1,len(chosen_TFS))) - fig, ax1 = plt.subplots(figsize=(10,5)) #Make lineplot per TF diff --git a/tobias/plotting/plot_heatmap.py b/tobias/plotting/plot_heatmap.py index efe9173..a1e2f68 100644 --- a/tobias/plotting/plot_heatmap.py +++ b/tobias/plotting/plot_heatmap.py @@ -34,7 +34,7 @@ def add_heatmap_arguments(parser): #---------------- Parse arguments ---------------# parser.formatter_class = lambda prog: argparse.RawDescriptionHelpFormatter(prog, max_help_position=40, width=90) - description = "" + description = "PlotHeatmap plots a heatmap of signals from bigwig(s) (each row is one site) as well as the aggregate signal across all sites." 
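As a rough picture of the PlotHeatmap output described in that help text (one row per site, plus the aggregate signal across all sites), the layout can be sketched with matplotlib on random data; everything below is illustrative and independent of the tool's actual plotting grid:

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(1)
signalmat = np.random.normal(loc=1.0, scale=0.3, size=(200, 150))  # 200 sites x 150 bp
signalmat[:, 60:90] *= 0.5                                         # fake footprint dip

fig, (ax_agg, ax_heat) = plt.subplots(2, 1, figsize=(4, 6), sharex=True,
                                      gridspec_kw={"height_ratios": [1, 4]})

# Aggregate across all sites on top
ax_agg.plot(np.nanmean(signalmat, axis=0), color="black", linewidth=1)
ax_agg.set_ylabel("aggregate")

# One row per site below
ax_heat.imshow(signalmat, aspect="auto", cmap="seismic",
               vmin=np.percentile(signalmat, 1), vmax=np.percentile(signalmat, 99))
ax_heat.set_xlabel("position")
ax_heat.set_ylabel("sites")

fig.savefig("heatmap_sketch.pdf", bbox_inches="tight")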
parser.description = format_help_description("PlotHeatmap", description) parser._action_groups.pop() #pop -h @@ -44,20 +44,20 @@ def add_heatmap_arguments(parser): IO.add_argument('--signals', metavar="", nargs="*", help="Signals in bigwig format (*required)") IO.add_argument('--output', metavar="", help="Output filename (default: TOBIAS_heatmap.pdf)", default="TOBIAS_heatmap.pdf") - #IO.add_argument('--log', metavar="", help="",) - #IO.add_argument('--silent', help="Prevents printing to stdout", action='store_true') - - plotargs = parser.add_argument_group('Plot parameters') - plotargs.add_argument('--plot_boundaries', help="Plot TFBS boundaries", action='store_true') - plotargs.add_argument('--share_colorbar', help="Share colorbar across all bigwigs (default: estimate colorbar per bigwig)", action='store_true') - plotargs.add_argument('--flank', metavar="", help="", type=int, default=75) + PLOT = parser.add_argument_group('Plot arguments') + PLOT.add_argument('--plot_boundaries', help="Plot TFBS boundaries", action='store_true') + PLOT.add_argument('--share_colorbar', help="Share colorbar across all bigwigs (default: estimate colorbar per bigwig)", action='store_true') + PLOT.add_argument('--flank', metavar="", help="", type=int, default=75) - plotargs.add_argument('--title', metavar="", default="TOBIAS heatmap") - plotargs.add_argument('--TFBS_labels', metavar="", nargs="*", action='append', help="Labels of TFBS (default: basename of --TFBS)") - plotargs.add_argument('--signal_labels', metavar="", nargs="*", help="Labels of signals (default: basename of --signals)") + PLOT.add_argument('--title', metavar="", default="TOBIAS heatmap") + PLOT.add_argument('--TFBS_labels', metavar="", nargs="*", action='append', help="Labels of TFBS (default: basename of --TFBS)") + PLOT.add_argument('--signal_labels', metavar="", nargs="*", help="Labels of signals (default: basename of --signals)") + + PLOT.add_argument('--show_columns', nargs="*", metavar="", type=int, help="Show scores from TFBS column besides heatmap. Set to 0-based python coordinates (for example -1 for last column) (default: None)", default=[]) + PLOT.add_argument('--sort_by', metavar="", help="Columns in .bed to sort heatmap by (default: input .beds are not sorted)", type=int) - plotargs.add_argument('--show_columns', nargs="*", metavar="", type=int, help="Show scores from TFBS column besides heatmap. 
Set to 0-based python coordinates (for example -1 for last column) (default: None)", default=[]) - plotargs.add_argument('--sort_by', metavar="", help="Columns in .bed to sort heatmap by (default: input .beds are not sorted)", type=int) + RUN = parser.add_argument_group('Run arguments') + RUN = add_logger_args(RUN) return(parser) @@ -65,7 +65,13 @@ def add_heatmap_arguments(parser): #----------------------------------------------------------------------------------------# def run_heatmap(args): - begin_time = datetime.now() + #Start logger + logger = TobiasLogger("PlotHeatmap", args.verbosity) + logger.begin() + + parser = add_heatmap_arguments(argparse.ArgumentParser()) + logger.arguments_overview(parser, args) + logger.output_files([args.output]) check_required(args, ["TFBS", "signals"]) @@ -77,19 +83,6 @@ def run_heatmap(args): args.signal_labels = [os.path.basename(fil) for fil in args.signals] - ######################################################## - - # Create logger - logger = create_logger() - - #Print info on run - logger.comment("#TOBIAS PlotHeatmap (run started {0})\n".format(begin_time)) - logger.comment("#Command line call: {0}\n".format(" ".join(sys.argv))) - - parser = add_heatmap_arguments(argparse.ArgumentParser()) - logger.comment(arguments_overview(parser, args)) - - ######################################################## #Check valid input parameters (number of input TFBS vs. bigwig etc.) @@ -115,10 +108,8 @@ def run_heatmap(args): for i, signal in enumerate(args.signals): logger.info("Using {0} with signal from {1}".format(args.TFBS[i], signal)) - - ###logger overview of bedfiles per column? + #todo: logger overview of bedfiles per column? - ###################################################################################### ##################################### INPUT DATA ##################################### ###################################################################################### @@ -144,10 +135,6 @@ def run_heatmap(args): for row in range(len(heatmap_info[col])): heatmap_info[col][row]["regions"] = RegionList().from_bed(heatmap_info[col][row]["bed_f"]) - - #if len(heatmap_info[col][row]["regions"]) == 0: - #del heatmap_info[col][row] - #continue #Estimate region width distri = heatmap_info[col][row]["regions"].get_width_distri() @@ -358,10 +345,8 @@ def run_heatmap(args): heatmap_info[col][row]["vmin"] = -lim heatmap_info[col][row]["vmax"] = lim heatmap = axdict[col][row].imshow(heatmap_info[col][row]["signal_mat"], aspect="auto", cmap="seismic", norm=mpl.colors.Normalize(vmin=heatmap_info[col][row]["vmin"], vmax=heatmap_info[col][row]["vmax"])) - #aspect="auto", extent=[-flank, flank, 0, TFBS["bound"]["no_sites"]] #Insert colorbar (inserted multiple times for each bigwig, but since it is shared for the same bigwig, it doesn't matter) - #axin = inset_axes(axdict[col]["colorbar"], width="100%", height="100%", loc=8, borderpad=-4) fig.colorbar(heatmap, cax=axdict[col]["colorbar"], orientation="horizontal") @@ -425,7 +410,7 @@ def run_heatmap(args): plt.setp(axarr[row].get_xticklabels(), visible=False) #Hide x-axis ticks axarr[row].tick_params(direction="in") """ - #plt.tight_layout() + plt.subplots_adjust(top=0.95) plt.suptitle(args.title, fontsize=25) @@ -433,8 +418,7 @@ def run_heatmap(args): plt.savefig(args.output, bbox_inches='tight') plt.close() - logger.info("Finished PlotHeatmap! 
Output file is: {0}".format(os.path.abspath(args.output))) - + logger.end() #--------------------------------------------------------------------------------------------------------# if __name__ == '__main__': diff --git a/tobias/utils/logger.py b/tobias/utils/logger.py new file mode 100644 index 0000000..ff30833 --- /dev/null +++ b/tobias/utils/logger.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +""" +Class for dealing with logger-function used across all TOBIAS tools + +@author: Mette Bentsen +@contact: mette.bentsen (at) mpi-bn.mpg.de +@license: MIT + +""" + +import sys +import os +from datetime import datetime +import logging +import multiprocessing as mp +import time + + +def add_logger_args(args): + """ Function for adding TOBIAS-wide verbosity to command-line parsers """ + args.add_argument('--verbosity', metavar="", help="Level of output logging (0: silent, 1: errors/warnings, 2: info, 3: stats, 4: debug, 5: spam) (default: 3)", choices=[0,1,2,3,4,5], default=3, type=int) + return(args) + +class TobiasLogger(logging.Logger): + """ TobiasLogger is an instance of a logging.Logger with special functions for formatting and creating automatic logging """ + + logger_levels = { + 0: 0, + 1: logging.WARNING, #also includes errors + 2: logging.INFO, #info + 3: int((logging.INFO+logging.DEBUG)/2), #statistics + 4: logging.DEBUG, #debugging info + 5: logging.DEBUG - 5 #spam-level debug + } + + def __init__(self, tool_name="TOBIAS", level=3, queue=None): + + self.tool_name = tool_name #name of tool within TOBIAS + logging.Logger.__init__(self, self.tool_name) + + if level == 0: + self.disabled = True + + ####### Setup custom levels ####### + #Create custom level for comments (Same level as errors/warnings) + comment_level = TobiasLogger.logger_levels[1] + 1 + logging.addLevelName(comment_level, "comment") #log_levels[lvl]) + setattr(self, 'comment', lambda *args: self.log(comment_level, *args)) + + #Create custom level for stats (between info and debug) + stats_level = TobiasLogger.logger_levels[3] + logging.addLevelName(stats_level, "STATS") #log_levels[lvl]) + setattr(self, 'stats', lambda *args: self.log(stats_level, *args)) + + #Create custom level for spamming debug messages + spam_level = TobiasLogger.logger_levels[5] + logging.addLevelName(spam_level, "SPAM") #log_levels[lvl]) + setattr(self, 'spam', lambda *args: self.log(spam_level, *args)) + + #Set level + self.level = TobiasLogger.logger_levels[level] + + ######### Setup formatter ######## + + #Setup formatting + self.formatter = TOBIASFormatter() + self.setLevel(self.level) + + ########## Setup streaming ######### + ##Log file stream + #if log_f != None: + # log = logging.FileHandler(log_f, "w") + # log.setLevel(self.level) + # log.setFormatter(self.formatter) + # self.addHandler(log) + + if queue == None: + #Stdout stream + con = logging.StreamHandler(sys.stdout) #console output + con.setLevel(self.level) + con.setFormatter(self.formatter) + self.addHandler(con) + else: + h = logging.handlers.QueueHandler(queue) # Just the one handler needed + self.handlers = [] + self.addHandler(h) + + #Lastly, initialize time + self.begin_time = datetime.now() + self.end_time = None + self.total_time = None + + + def begin(self): + """ Begin logging by writing comments about the current run """ + + #Print info on run + self.comment("# TOBIAS {0} (run started {1})".format(self.tool_name, self.begin_time)) + self.comment("# Working directory: {0}".format(os.getcwd())) + self.comment("# Command line call: {0}\n".format(" ".join(sys.argv))) + + def 
stop(self): + """ Stop without printing status """ + + self.end_time = datetime.now() + self.total_time = self.end_time - self.begin_time + + def end(self): + """ End logging - write out how long it took """ + + self.end_time = datetime.now() + self.total_time = self.end_time - self.begin_time + + self.comment("") #creates empty line; only for pretty output + self.info("Finished {0} run (total time elapsed: {1})".format(self.tool_name, self.total_time)) + + + def start_logger_queue(self): + """ start process for listening and handling through the main logger queue """ + + self.debug("Starting logger queue for multiprocessing") + self.queue = mp.Manager().Queue() + self.listener = mp.Process(target=self.main_logger_process) + self.listener.start() + + + def stop_logger_queue(self): + """ Stop process for listening """ + + self.debug("Waiting for listener to finish") + self.queue.put(None) + while self.listener.exitcode != 0: + self.debug("Listener exitcode is: {0}. Waiting for exitcode = 0.".format(self.listener.exitcode)) + time.sleep(0.1) + + self.debug("Joining listener") + self.listener.join() + + + def main_logger_process(self): + + self.debug("Started main logger process") + while True: + try: + record = self.queue.get() + if record is None: + break + self.handle(record) #this logger is coming from the main process + + except Exception: + import sys, traceback + print('Problem in main logger process:', file=sys.stderr) + traceback.print_exc(file=sys.stderr) + break + + return(1) + + + def arguments_overview(self, parser, args): + """ Creates string of arguments and options to print using logger """ + + content = "" + content += "# ----- Input parameters -----\n" + for group in parser._action_groups: + group_actions = group._group_actions + if len(group_actions) > 0: + #content += "# ----- {0} -----\n".format(group.title) + for option in group_actions: + name = option.dest + attr = getattr(args, name, None) + content += "# {0}:\t{1}\n".format(name, attr) + #content += "\n" + self.comment(content + "\n") + + + def output_files(self, outfiles): + """ Print out list of output files""" + + self.comment("# ----- Output files -----") + for outf in outfiles: + if outf != None: + self.comment("# {0}".format(outf)) + self.comment("\n") + + + +class TOBIASFormatter(logging.Formatter): + + default_fmt = logging.Formatter("%(asctime)s (%(process)d) [%(levelname)s]\t%(message)s", "%Y-%m-%d %H:%M:%S") + comment_fmt = logging.Formatter("%(message)s") + + def format(self, record): + format_orig = self._fmt + + #Comments + if record.levelname == "comment": + return self.comment_fmt.format(record) + elif record.levelno != 0: + return self.default_fmt.format(record) + else: + return diff --git a/tobias/utils/motifs.py b/tobias/utils/motifs.py index 995f52b..083c713 100644 --- a/tobias/utils/motifs.py +++ b/tobias/utils/motifs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Classes for working with motifs and scanning +Classes for working with motifs and scanning with moods @author: Mette Bentsen @contact: mette.bentsen (at) mpi-bn.mpg.de @@ -16,6 +16,12 @@ import MOODS.tools import MOODS.parsers +import matplotlib as mpl +import matplotlib.pyplot as plt +from matplotlib.text import TextPath +from matplotlib.patches import PathPatch +from matplotlib.font_manager import FontProperties + #Internal from tobias.utils.regions import * from tobias.utils.utilities import filafy #filafy for filenames @@ -81,20 +87,25 @@ def scan_sequence(self, seq, region): #Contains info on one motif formatted for use in moods 
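# Hedged sketch (editorial note, not part of the patch): how the queue-based logging in the
# new TobiasLogger is presumably wired up when a tool fans work out to worker processes.
# TobiasLogger/add_logger_args come from tobias/utils/logger.py above; the worker function
# and its inputs are invented for illustration.
import argparse
import multiprocessing as mp
from tobias.utils.logger import TobiasLogger, add_logger_args

def my_worker(chunk, verbosity, log_q):
    wlogger = TobiasLogger("worker", verbosity, log_q)   # records are routed through the queue
    wlogger.info("Processing chunk of size {0}".format(len(chunk)))
    return sum(chunk)

if __name__ == "__main__":
    parser = add_logger_args(argparse.ArgumentParser())  # adds the shared --verbosity option
    args = parser.parse_args([])                          # default verbosity level (3 = stats)

    logger = TobiasLogger("ExampleTool", args.verbosity)
    logger.begin()                                        # run header: tool, directory, command line

    logger.start_logger_queue()                           # listener process handling worker records
    pool = mp.Pool(processes=2)
    tasks = [pool.apply_async(my_worker, (list(range(4)), args.verbosity, logger.queue)) for _ in range(3)]
    pool.close()
    pool.join()
    logger.stop_logger_queue()

    logger.stats("Sum over chunks: {0}".format(sum(task.get() for task in tasks)))
    logger.end()                                          # prints total elapsed time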
class OneMotif: - def __init__(self, motifid, alt_name, pfm): + bases = ["A", "C", "G", "T"] + + def __init__(self, motifid, alt_name, counts): - self.id = motifid #must be unique + self.id = motifid #must be unique self.alt_name = alt_name #does not have to be unique - self.name = "" #name set in set_name - self.pfm = pfm - self.strand = "+" #default strand is + + self.name = "" #name set in set_name + #self.out_name + self.counts = counts #counts + self.strand = "+" #default strand is + #Set later + self.pfm = None self.bg = np.array([0.25,0.25,0.25,0.25]) #background set to equal by default self.pssm = None #pssm calculated from get_pssm self.threshold = None #threshold calculated from get_threshold - + self.bits = None + def __str__(self): return("{0}".format(self.__dict__)) @@ -113,8 +124,14 @@ def set_name(self, naming="name_id"): prefix = "None" self.name = prefix + def get_pfm(self): + self.pfm = self.counts / np.sum(self.counts, axis=0) def get_reverse(self): + + if self.pfm is None: + self.get_pfm() + reverse_motif = copy.deepcopy(self) reverse_motif.strand = "-" reverse_motif.pfm = MOODS.tools.reverse_complement(self.pfm,4) @@ -123,6 +140,8 @@ def get_reverse(self): def get_pssm(self, ps=0.01): """ """ + if self.pfm is None: + self.get_pfm() bg_col = self.bg.reshape((-1,1)) pseudo_vector = ps * bg_col @@ -133,11 +152,78 @@ def get_pssm(self, ps=0.01): def get_threshold(self, pvalue): + if self.pssm is None: + self.get_pssmm() self.threshold = MOODS.tools.threshold_from_p(self.pssm, self.bg, pvalue, 4) return(self) + def calc_bit_score(self): + + if self.pfm is None: + self.get_pfm() + + pfm_arr = np.copy(self.pfm) + pfm_arr[pfm_arr == 0] = np.nan + + #Info content per pos + entro = pfm_arr * np.log2(pfm_arr) + entro[np.isnan(entro)] = 0 + info_content = 2 - (- np.sum(entro, axis=0)) #information content per position in motif + self.bits = self.pfm * info_content + + + def plot_logo(self): + + LETTERS = { "T" : TextPath((-0.305, 0), "T", size=1, prop=fp), + "G" : TextPath((-0.384, 0), "G", size=1, prop=fp), + "A" : TextPath((-0.35, 0), "A", size=1, prop=fp), + "C" : TextPath((-0.366, 0), "C", size=1, prop=fp) } + COLOR_SCHEME = {'G': 'orange', + 'A': "#CC0000", + 'C': 'mediumblue', + 'T': 'darkgreen'} + + def add_letter(base, x, y, scale, ax): + """ """ + + text = LETTERS[base] + + t = mpl.transforms.Affine2D().scale(1*globscale, scale*globscale) + \ + mpl.transforms.Affine2D().translate(x,y) + ax.transData + p = PathPatch(text, lw=0, fc=COLOR_SCHEME[base], transform=t) + if ax != None: + ax.add_artist(p) + return p + + self.calc_bit_score() + self.length = self.bits.shape[1] + + fp = FontProperties(family='sans-serif', weight="bold") + globscale = 1.35 + + #Plot logo + fig, ax = plt.subplots(figsize=(10,3)) + max_y = 0 + for position in range(self.length): #0-based positions + + base_bit_tups = zip(OneMotif.bases, self.bits[:,position]) + + #Go through bases sorted from lowest to highest bit score + y = 0 #position to place letter + for (base, score) in sorted(base_bit_tups, key=lambda tup: tup[1]): + add_letter(base, position+1, y, score, ax) + y += score + max_y = max(max_y, y) + + plt.xticks(range(1,self.length+1)) + plt.xlim((0.2, self.length+0.8)) + plt.ylim((0, max_y)) + plt.tight_layout() + + return(fig, ax) + ########################################################### @@ -218,7 +304,11 @@ def convert_motif(content, output_format): #Get ready for this motif if idx < len(lines) - 1: #Up until the last line, it is possible to save for next columns = line.strip().split() - 
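# Hedged sketch (editorial note, not part of the patch): the OneMotif workflow after the
# pfm -> counts refactor above. The counts matrix is invented; rows follow OneMotif.bases
# (A, C, G, T) and columns are motif positions.
import numpy as np
import matplotlib.pyplot as plt
from tobias.utils.motifs import OneMotif

counts = np.array([[ 8.,  0., 20.,  2.],    # A
                   [ 2.,  1.,  0.,  3.],    # C
                   [ 5., 18.,  0.,  1.],    # G
                   [ 5.,  1.,  0., 14.]])   # T

motif = OneMotif("MA0000.1", "ExampleTF", counts)
motif.get_pfm()                    # counts -> position frequency matrix (columns sum to 1)
motif.get_pssm(ps=0.01)            # log-odds scores vs. the flat background, with pseudocounts
# get_threshold() falls back to self.get_pssmm() when the pssm is missing, which looks like a
# typo for get_pssm(); computing the pssm explicitly above sidesteps it
motif.get_threshold(pvalue=1e-4)   # MOODS score threshold for the requested p-value

fig, ax = motif.plot_logo()        # information-content logo built from calc_bit_score()
fig.savefig("ExampleTF_logo.pdf")
plt.close(fig)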
motif_id, name = columns[1], columns[2] + if len(columns) > 2: #MOTIF, ID, NAME + motif_id, name = columns[1], columns[2] + elif len(columns) == 2: # MOTIF, ID + motif_id, name = columns[1], columns[1] + header = ">{0}\t{1}\n".format(motif_id, name) converted_content += header @@ -271,7 +361,6 @@ def convert_motif(content, output_format): columns = [field for field in m.group(1).rstrip().split()] motif_content.append(columns) - return(converted_content) @@ -301,7 +390,7 @@ def pfm_to_motifs(content): if m: pfm_line = m.group(1) pfm_fields = [float(field) for field in pfm_line.rstrip().split()] - motif_obj.pfm.append(pfm_fields) + motif_obj.counts.append(pfm_fields) else: continue diff --git a/tobias/utils/ngs.pyx b/tobias/utils/ngs.pyx index 164edc0..2a9966e 100644 --- a/tobias/utils/ngs.pyx +++ b/tobias/utils/ngs.pyx @@ -1,4 +1,4 @@ -#!/usr/bin/env python +# cython: language_level=3 """ Classes to work with NGS data (reads, readlists, etc.) @@ -11,7 +11,7 @@ Classes to work with NGS data (reads, readlists, etc.) import pysam from collections import Counter -from regions import * +from tobias.utils.regions import * import time import sys diff --git a/tobias/utils/regions.py b/tobias/utils/regions.py index d888ed9..83db832 100644 --- a/tobias/utils/regions.py +++ b/tobias/utils/regions.py @@ -16,6 +16,11 @@ import pyBigWig from collections import Counter +#Clustering +import sklearn.preprocessing as preprocessing +from scipy.cluster.hierarchy import dendrogram, linkage, fcluster +from scipy.spatial.distance import squareform + #--------------------------------------------------------------------------------------------------# #--------------------------------------------------------------------------------------------------# #--------------------------------------------------------------------------------------------------# @@ -136,6 +141,8 @@ def check_boundary(self, boundaries_dict, action="cut"): return(self) + + def get_signal(self, pybw, numpy_bool = True): """ Get signal from bigwig in region """ @@ -152,7 +159,6 @@ def get_signal(self, pybw, numpy_bool = True): except: print("Error reading region: {0} from pybigwig object".format(self.tup())) signal = np.zeros(self.end - self.start) - signal[:] = np.nan return(signal) @@ -214,16 +220,7 @@ def as_bed(self, additional=True): bed = "" for region in self: - #Update info - line = "{0}\n".format("\t".join([str(col) for col in region])) - - #columns = [region.chrom, region.start, region.end, region.name, region.score, region.strand] - - #if additional == True: - #line = "{0}\n".format("\t".join([str(col) for col in columns])) - #else: - #line = "{0}\n".format("\t".join([str(col) for col in columns[:3]])) bed += line return(bed) @@ -374,45 +371,6 @@ def remove_duplicates(self): return(unique) - """ - def intersect(self, b_regions): - # Overlap regions with other regions - only return those regions in a with an overlap in b - - #Sort before starting - self.loc_sort() - b_regions.loc_sort() - - a_len = self.count() - b_len = b_regions.count() - - chromlist = sorted(list(set([region.chrom for region in self] + [region.chrom for region in b_regions]))) - chrom_pos = {chrom:chromlist.index(chrom) for chrom in chromlist} - - a = self - b = b_regions - - a_i = 0 - b_i = 0 - - while a_i < a_len and b_i < b_len: - - a_chrom, a_start, a_end = a[a_i].chrom, a[a_i].start, a[a_i].end - b_chrom, b_start, b_end = b_regions[b_i].chrom, b_regions[b_i].start, b_regions[b_i].end - - #Do comparison - if a_chrom == b_chrom: - - - - #a is ahead of b -> check next 
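# Hedged sketch (editorial note, not part of the patch): the extra length check added to the
# MEME "MOTIF" header parsing above accepts both the two- and three-column header forms.
for line in ("MOTIF MA0139.1 CTCF", "MOTIF MA0139.1"):
    columns = line.strip().split()
    if len(columns) > 2:          # MOTIF, ID, NAME
        motif_id, name = columns[1], columns[2]
    elif len(columns) == 2:       # MOTIF, ID (the ID doubles as the name)
        motif_id, name = columns[1], columns[1]
    print(">{0}\t{1}".format(motif_id, name))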
b - elif chrom_pos[a_chrom] > chrom_pos[b_chrom]: #if a_chrom is after current b_chrom - b_i += 1 - - #b is ahead of a -> check next a - elif chrom_pos[b_chrom] > chrom_pos[a_chrom]: #if b_chrom is after current a_chrom - a_i += 1 - """ - def subtract(self, b_regions): """ Subtract b_regions from self regions """ @@ -606,28 +564,129 @@ def count_overlaps(self): #------------------------------ Additional functions related to regions ---------------------------# #--------------------------------------------------------------------------------------------------# -def overlap_to_distance(overlap_dict): - """ """ +class RegionCluster: + + def __init__(self, overlap_dict): + + self.overlaps = overlap_dict #overlap dict is from RegionList.count_overlaps + + #Added later + self.clusters = {} # ID:{"member_idx":[], "member_names":[], "cluster_name":""} + + def cluster(self, threshold=0.5, method="average"): + """ Main function to cluster the overlap dictionary into clusters""" + + self.overlap_to_distance() + self.linkage_mat = linkage(squareform(self.distance_mat), method) + self.labels = fcluster(self.linkage_mat, threshold, criterion="distance") #ordering of the dendrogram + + #Find clusters below threshold + self.linkage_clusters = dict(zip(range(self.n), [[num] for num in range(self.n)])) + for i, row in enumerate(self.linkage_mat): + ID1 = int(row[0]) + ID2 = int(row[1]) + new = self.n + i + dist = row[2] + + if dist <= threshold: + self.linkage_clusters[new] = self.linkage_clusters[ID1] + self.linkage_clusters[ID2] + [new] + del self.linkage_clusters[ID1] + del self.linkage_clusters[ID2] + + #Add member-names to clusters + self.clusters = {} + for cluster in self.linkage_clusters: + + self.clusters[cluster] = {"member_idx": [idx for idx in self.linkage_clusters[cluster] if idx < self.n]} + self.clusters[cluster]["member_names"] = [self.names[idx] for idx in self.clusters[cluster]["member_idx"]] + + self.get_cluster_names() #Set names of clusters + self.assign_colors() + + def overlap_to_distance(self): + """ Convert overlap-dict from count_overlaps to distance matrix """ + + #Find all region names + names = [key for key in self.overlaps.keys() if isinstance(key, str)] + names = sorted(names) + self.n = len(names) + + distance_mat = np.zeros((self.n, self.n)) + + for i, id1 in enumerate(names): #rows + for j, id2 in enumerate(names): #columns + if i != j: + tot_overlap = self.overlaps.get((id1,id2), 0) #Find key-pair otherwise no overlap - #Find all TF names - names = [key for key in overlap_dict.keys() if isinstance(key, str)] - names = sorted(names) - no_ids = len(names) + tot_id1 = self.overlaps[id1] + tot_id2 = self.overlaps[id2] - similarity_mat = np.zeros((no_ids, no_ids)) + id1_dist = 1 - tot_overlap / float(tot_id1) + id2_dist = 1 - tot_overlap / float(tot_id2) - for i, id1 in enumerate(names): #rows - for j, id2 in enumerate(names): #columns - if i != j: - tot_overlap = overlap_dict.get((id1,id2), 0) #Find key-pair otherwise no overlap + dist = min([id1_dist, id2_dist]) + distance_mat[i,j] = dist - tot_id1 = overlap_dict[id1] - tot_id2 = overlap_dict[id2] + self.distance_mat = distance_mat + self.names = names - id1_dist = 1 - tot_overlap / float(tot_id1) - id2_dist = 1 - tot_overlap / float(tot_id2) - dist = min([id1_dist, id2_dist]) - similarity_mat[i,j] = dist + def get_cluster_names(self): + """ Names each cluster based on the members of the cluster """ + + self.cluster_names = {} + self.name2cluster = {} + + #Sort clusters by distance scores to other motifs in cluster + 
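# Hedged sketch (editorial note, not part of the patch): driving the new RegionCluster with
# an overlap dictionary of the form produced by RegionList.count_overlaps() - string keys
# hold per-TF site counts, (id1, id2) tuples hold pairwise overlap counts. All numbers below
# are invented; cluster naming and colors are handled by get_cluster_names()/assign_colors()
# further down.
from tobias.utils.regions import RegionCluster

overlaps = {"TF_A": 100, "TF_B": 80, "TF_C": 60,
            ("TF_A", "TF_B"): 70, ("TF_B", "TF_A"): 70,   # A and B co-occur often
            ("TF_A", "TF_C"): 5,  ("TF_C", "TF_A"): 5,
            ("TF_B", "TF_C"): 4,  ("TF_C", "TF_B"): 4}

clustering = RegionCluster(overlaps)
clustering.cluster(threshold=0.5, method="average")   # overlaps -> distances -> linkage -> fcluster

print(clustering.name2cluster)                        # TF_A/TF_B share a cluster, TF_C sits alone
clustering.write_distance_mat("TF_distances.txt")     # tab-separated distance matrix with header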
cluster_i = 1 + for cluster in self.clusters: + + members_idx = self.clusters[cluster]["member_idx"] + + ### Code to assign a specific name to clusters + """ + members_idx = self.clusters[cluster]["member_idx"] + members_names = self.clusters[cluster]["member_names"] + distances = {} + for member in members_idx: + distances[member] = np.sum(self.distance_mat[member,:]) #0 if member is one-member cluster + + #Sort cluster by distance; smallest values=most representative first + ordered_idx = sorted(distances.keys(), key=lambda member: distances[member]) + + self.cluster_names[cluster] = "C_" + self.names[ordered_idx[0]] #cluster is the idx for cluster + """ + + self.cluster_names[cluster] = "C_{0}".format(cluster_i) + cluster_i += 1 + + #Assign every member to its cluster + for member in members_idx: + self.name2cluster[self.names[member]] = self.cluster_names[cluster] + + + def write_distance_mat(self, out_f): + """ Writes out distance matrix to out_f file """ + np.savetxt(out_f, self.distance_mat, delimiter="\t", header="\t".join(self.names), fmt="%.4f") + + + def assign_colors(self): + """ Assign colors for plotting the dendrogram """ + + clusters = self.linkage_clusters + no_IDS = self.n + + colorlist = ["blue", "green", "red", "orange"] + node_color = ["black"] * (2*no_IDS-1) + i = 0 + for cluster in sorted(list(clusters.keys())): + if len(clusters[cluster]) > 1: + color = colorlist[i] + for node in clusters[cluster]: + node_color[node] = color + i += 1 + + if i == len(colorlist): + i = 0 - return((similarity_mat, names)) \ No newline at end of file + self.node_color = node_color #list corresponding to each possible clustering in tree \ No newline at end of file diff --git a/tobias/utils/sequences.pyx b/tobias/utils/sequences.pyx index d28d9ad..b2876fa 100644 --- a/tobias/utils/sequences.pyx +++ b/tobias/utils/sequences.pyx @@ -1,5 +1,4 @@ -#!/usr/bin/env python -#cython: language_level=3 +# cython: language_level=3 """ Classes and functions for working with sequences, as well as counting and scoring with PWMs/DWMs diff --git a/tobias/utils/signals.pyx b/tobias/utils/signals.pyx index aae55b6..a472bbf 100644 --- a/tobias/utils/signals.pyx +++ b/tobias/utils/signals.pyx @@ -1,4 +1,4 @@ -#!/usr/bin/env python +# cython: language_level=3 """ Classes for working with signal arrays (such as bias and cutsite signals) @@ -182,48 +182,50 @@ def calc_FOS(np.ndarray[np.float64_t, ndim=1] arr, int fp_min, int fp_max, int f @cython.boundscheck(False) #dont check boundaries @cython.cdivision(True) #no check for zero division @cython.wraparound(False) #dont deal with negative indices -def tobias_footprint_array(np.ndarray[np.float64_t, ndim=1] arr, int fp_min, int fp_max, int flank_min, int flank_max): +def tobias_footprint_array(np.ndarray[np.float64_t, ndim=1] arr, int window, int fp_min, int fp_max): cdef int L = arr.shape[0] cdef np.ndarray[np.float64_t, ndim=1] footprint_scores = np.zeros(L) - cdef int i, j, footprint_w, flank_w + cdef int i, j, footprint_w, flank_w, flank_left, flank_right cdef float fp_sum, fp_mean, fp_score, flank_mean cdef float left_sum, right_sum + cdef int flank_max = int((window - fp_min)/2) + #Each position in array, starting at i, going until last possible start of region for i in range(L-2*flank_max-fp_max): - for flank_w in range(flank_min, flank_max+1): - for footprint_w in range(fp_min, fp_max+1): + for footprint_w in range(fp_min, fp_max+1): + flank_left = int((window - footprint_w)/2) + flank_right = window - flank_left - footprint_w + + #Sum of left flanking 
window + left_sum = 0 + for j in range(flank_left): + if arr[i+j] > 0: + left_sum += arr[i+j] + + #Sum of footprint + fp_sum = 0 + for j in range(footprint_w): + if arr[i+j] < 0: + fp_sum += arr[i+flank_left+j] + + #Sum of right flank + right_sum = 0 + for j in range(flank_right): + if arr[i+j] > 0: + right_sum += arr[i+flank_left+footprint_w+j] - #Sum of left flank - left_sum = 0 - for j in range(flank_w): - if arr[i+j] > 0: - left_sum += arr[i+j] - - #Sum of footprint - fp_sum = 0 - for j in range(footprint_w): - if arr[i+j] < 0: - fp_sum += arr[i+flank_w+j] - - #Sum of right flank - right_sum = 0 - for j in range(flank_w): - if arr[i+j] > 0: - right_sum += arr[i+flank_w+footprint_w+j] - - #Calculate score - fp_mean = fp_sum/(1.0*footprint_w) #will be minus - flank_mean = (right_sum + left_sum)/(2.0*flank_w) + #Calculate score + fp_mean = fp_sum/(1.0*footprint_w) #will be minus + flank_mean = (right_sum + left_sum)/(flank_right + flank_left) - fp_score = flank_mean - fp_mean #-- = + - - #Save score across footprint - for pos in range(i+flank_w, i+flank_w+footprint_w): - if fp_score > footprint_scores[pos]: - footprint_scores[pos] = fp_score + fp_score = flank_mean - fp_mean #-- = + + + #Save score across footprint + for pos in range(i+flank_left, i+flank_left+footprint_w): + if fp_score > footprint_scores[pos]: + footprint_scores[pos] = fp_score return(footprint_scores) - diff --git a/tobias/utils/utilities.py b/tobias/utils/utilities.py index e9c73ca..3b39130 100644 --- a/tobias/utils/utilities.py +++ b/tobias/utils/utilities.py @@ -18,103 +18,64 @@ import textwrap import string import numpy as np +import copy from difflib import SequenceMatcher +import pyBigWig +from tobias.utils.logger import * + #-------------------------------------------------------------------------------------------# -#----------------------------------- Logger stuff ------------------------------------------# +#----------------------------------- Multiprocessing ---------------------------------------# #-------------------------------------------------------------------------------------------# -class TOBIASFormatter(logging.Formatter): - - default_fmt = logging.Formatter("%(asctime)s (%(processName)s)\t%(message)s", "%Y-%m-%d %H:%M:%S") - comment_fmt = logging.Formatter("%(message)s") - - #def __init__(self, fmt="(%(processName)s) %(asctime)s\t\t%(message)s"): - # logging.Formatter.__init__(self, fmt) - - def format(self, record): - format_orig = self._fmt - - #Comments - if record.levelno > logging.CRITICAL: - return self.comment_fmt.format(record) - else: - return self.default_fmt.format(record) - - -def create_logger(verbosity=2, log_f=None): - - #verbose_levels = {1:logging.CRITICAL, 2:logging.ERROR, 3:logging.WARNING, 4:logging.INFO, 5:logging.DEBUG} - verbose_levels = {1:logging.CRITICAL, 2:logging.INFO, 3:logging.DEBUG} - logger = logging.getLogger("TOBIAS") #Set logger specifically to not get other module logs (which write to root) - logger.setLevel(verbose_levels[verbosity]) - - formatter = TOBIASFormatter() +def run_parallel(FUNC, input_chunks, arguments, n_cores, logger): + """ + #FUNC is the function to run + #input_chunks is the input to loop over + #arguments are arguments to func + #logger is a Logging.Logger object + """ + + no_chunks = len(input_chunks) + + if n_cores > 1: + + #Send jobs to pool + pool = mp.Pool(processes=n_cores) + task_list = [] + for input_chunk in input_chunks: + task_list.append(pool.apply_async(FUNC, args=[input_chunk] + arguments)) + pool.close() #done sending jobs to 
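# Hedged sketch (editorial note, not part of the patch): the flank geometry implied by the
# reworked tobias_footprint_array(arr, window, fp_min, fp_max) above. Instead of looping over
# flank widths, the total window is fixed and the flanks are derived per footprint width,
# with the right flank one bp wider whenever (window - footprint_w) is odd; the score per
# footprint position stays flank_mean - fp_mean. Parameter values below are illustrative only.
# (Review note: the footprint and right-flank sums above test arr[i+j] while accumulating
# arr[i+flank_left+j] and arr[i+flank_left+footprint_w+j]; the tested indices look worth
# double-checking.)
window, fp_min, fp_max = 100, 8, 40

for footprint_w in (fp_min, 25, fp_max):
    flank_left = (window - footprint_w) // 2
    flank_right = window - flank_left - footprint_w
    print(footprint_w, flank_left, flank_right)       # 8 -> 46/46, 25 -> 37/38, 40 -> 30/30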
pool + + #Wait for tasks to finish + count = -1 + finished = sum([task.ready() for task in task_list]) + while finished < no_chunks: + finished = sum([task.ready() for task in task_list]) + if count != finished: + logger.info("Progress: {0:.0f}%".format(finished/float(no_chunks)*100)) + count = finished + else: + time.sleep(0.5) + pool.join() - #Log file stream - if log_f != None: - log = logging.FileHandler(log_f, "w") - log.setLevel(verbose_levels[verbosity]) - log.setFormatter(formatter) - logger.addHandler(log) + #Get results from processes + output_list = [task.get() for task in task_list] - #Stdout stream else: - con = logging.StreamHandler(sys.stdout) #console output - con.setLevel(verbose_levels[verbosity]) - con.setFormatter(formatter) - logger.addHandler(con) - - - #Create custom level for comments (always shown) - comment_level = logging.CRITICAL+10 - logging.addLevelName(comment_level, "comment") #log_levels[lvl]) - setattr(logger, 'comment', lambda *args: logger.log(comment_level, *args)) - - return(logger) - -def create_mp_logger(verbosity, queue): - - verbose_levels = {1:logging.CRITICAL, 2:logging.INFO, 3:logging.DEBUG} + output_list = [] + for count, input_chunk in enumerate(input_chunks): + logger.info("Progress: {0:.0f}%".format(count/float(no_chunks)*100)) + output_list.append(FUNC(input_chunk, *arguments)) - h = logging.handlers.QueueHandler(queue) # Just the one handler needed - logger = logging.getLogger("Worker") - logger.handlers = [] - logger.addHandler(h) - logger.setLevel(verbose_levels[verbosity]) - - return(logger) - + return(output_list) -def main_logger_process(queue, logger): - - logger.debug("Started main logger process") - while True: - try: - record = queue.get() - if record is None: - break - logger.handle(record) - - except Exception: - import sys, traceback - print('Problem in main logger process:', file=sys.stderr) - traceback.print_exc(file=sys.stderr) - break - - return(1) - - -#-------------------------------------------------------------------------------------------# -#----------------------------------- Multiprocessing ---------------------------------------# -#-------------------------------------------------------------------------------------------# def file_writer(q, key_file_dict, args): """ File-writer per key -> to value file """ - #time_spent_writing = 0 #Open handles for all files (but only once per file!) 
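# Hedged sketch (editorial note, not part of the patch): run_parallel() as read from the hunk
# above - FUNC is applied to every chunk together with the extra `arguments`, either through
# an mp.Pool with progress logging (n_cores > 1) or serially. The worker function and inputs
# are invented.
import numpy as np
from tobias.utils.logger import TobiasLogger
from tobias.utils.utilities import run_parallel

def chunk_sum(chunk, offset):    # hypothetical FUNC(input_chunk, *arguments)
    return np.sum(chunk) + offset

if __name__ == "__main__":
    logger = TobiasLogger("example")
    chunks = [np.arange(10), np.arange(10, 20), np.arange(20, 30)]
    results = run_parallel(chunk_sum, chunks, [5], n_cores=2, logger=logger)
    logger.info("Chunk sums: {0}".format(results))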
file2handle = {} for fil in set(key_file_dict.values()): @@ -136,12 +97,7 @@ def file_writer(q, key_file_dict, args): if key == None: break - #begin_time = time.time() - #print("writing content of {0} chars for TF {1}".format(len(content), TF)) handles[key].write(content) - #end_time = time.time() - #time_spent_writing += end_time - begin_time - #except Queue.Empty: except Exception: import sys, traceback @@ -156,23 +112,128 @@ def file_writer(q, key_file_dict, args): return(1) -def monitor_progress(task_list, logger): +def bigwig_writer(q, key_file_dict, header, regions, args): + """ Handle queue to write bigwig, args contain extra info such as verbosity and log_q """ + + #todo: check if args.log_q exists + logger = TobiasLogger("", args.verbosity, args.log_q) #separate core, needs mp logger through queue + logger.debug("Opened bigwig writer process for {0}".format(key_file_dict)) + + logger.debug("Header: {0}".format(header)) + + handles = {} + for key in key_file_dict: + logger.debug("Opening file {0} for writing".format(key_file_dict[key])) + try: + handles[key] = pyBigWig.open(key_file_dict[key], "w") + handles[key].addHeader(header) + + except Exception: + print("Tried opening file {0} in bigwig_writer but something went wrong?".format(fil)) + traceback.print_exc(file=sys.stderr) + + #Correct order of chromosomes as given in header + contig_list = [tup[0] for tup in header] + order_dict = dict(zip(contig_list, range(len(contig_list)))) + + #Establish order of regions to be writteninput regions + region_tups = [(region.chrom, region.start, region.end) for region in regions] + sorted_region_tups = sorted(region_tups, key=lambda tup: (order_dict[tup[0]], tup[1])) #sort to same order as bigwig header + no_regions = len(region_tups) + + #logger.debug("Order of regions: {0}".format(list(set([tup[0] for tup in sorted_region_tups])))) + + #Fetching content from queue + logger.debug("Fetching content from queue") + + i_to_write = {key:0 for key in handles} #index of next region to write + ready_to_write = {key:{} for key in handles} #key:dict; dict is region-tup:signal array + while True: + + try: + (key, region, signal) = q.get() #key is the bigwig key (e.g. 
bias:forward), region is a tup of (chr, start, end) + logger.spam("Received signal {0} for region {1}".format(key, region)) + + if key == None: #none is only inserted once all regions have been sent + for akey in i_to_write: + if i_to_write[akey] != no_regions - 1: + logger.error("Wrote {0} regions but there are {1} in total".format(i_to_write[akey], len(region_tups))) + logger.error("Ready_to_write[{0}]: {1}".format(akey, len(ready_to_write[akey]))) + break + + #Save key:region:signal to ready_to_write + ready_to_write[key][region] = signal + + #Check if next-to-write region was done + for key in handles: + + #Only deal with writing if there are still regions to write for this handle + if i_to_write[key] != no_regions - 1: + next_region = sorted_region_tups[i_to_write[key]] #this is the region to be written next for this key + + #If results are in; write wanted entry to bigwig + while next_region in ready_to_write[key]: #When true: Keep writing when the next region is available + chrom = next_region[0] + signal = ready_to_write[key][next_region] + included = signal.nonzero()[0] + positions = np.arange(next_region[1],next_region[2]) #start-end (including end) + pos = positions[included] + val = signal[included] + + if len(pos) > 0: + try: + handles[key].addEntries(chrom, pos, values=val, span=1) + except: + logger.error("Error writing key: {0}, region: {1} to bigwig".format(key, next_region)) + logger.spam("Wrote signal {0} from region {1}".format(key, next_region)) + + #Check whether this was the last region + if i_to_write[key] == no_regions - 1: #i_to_write is the last idx in regions; all sites were written + logger.info("Closing {0} (this might take some time)".format(key_file_dict[key])) + handles[key].close() + next_region = None #exit the while loop + else: + i_to_write[key] += 1 + next_region = sorted_region_tups[i_to_write[key]] #this is the region to be written next for this key + + #Write out progress (and only once per step) + + #Write out progress + #write out progress only after writing something + #check if i_to_write[key] is a certain percentage of len(regions) + + except Exception: + import sys, traceback + print('Problem in file_writer:', file=sys.stderr) + traceback.print_exc(file=sys.stderr) + break + + return(1) + + + +def monitor_progress(task_list, logger, prefix="Progress"): prev_done = 0 no_tasks = len(task_list) done = sum([task.ready() for task in task_list]) - logger.info("Progress 0%") + logger.info("{0} 0%".format(prefix)) while done != no_tasks: done = sum([task.ready() for task in task_list]) if done != prev_done: - logger.info("Progress {0}%".format(round(done/no_tasks*100.0, max(0,len(str(no_tasks))-1)))) + #print if done is + + logger.info("{1} {0}%".format(round(done/no_tasks*100.0, max(0,len(str(no_tasks))-1)), prefix)) prev_done = done else: time.sleep(0.1) - logger.info("All tasks completed!") + + logger.info("{0} done!".format(prefix)) return() #doesn't return until the while loop exits + + #-------------------------------------------------------------------------------------------# #------------------------------------- Argparser -------------------------------------------# #-------------------------------------------------------------------------------------------# @@ -183,6 +244,12 @@ def restricted_float(f, f_min, f_max): raise argparse.ArgumentTypeError("{0} not in range [0.0, 1.0]".format(f)) return f +def restricted_int(integer, i_min, i_max): + integer = float(integer) + if integer < i_min or integer > i_max: + raise + + def 
format_help_description(name, description, width=90): """ Format description of command line tool --help description """ @@ -205,24 +272,6 @@ def format_help_description(name, description, width=90): return(formatted) - -def arguments_overview(parser, args): - """ Return string of arguments and options to print to stdout/log""" - - content = "" - content += "# ----- Input parameters -----\n" - for group in parser._action_groups: - group_actions = group._group_actions - if len(group_actions) > 0: - #content += "# ----- {0} -----\n".format(group.title) - for option in group_actions: - name = option.dest - attr = getattr(args, name, None) - content += "# {0}:\t{1}\n".format(name, attr) - #content += "\n" - return(content) - - #-------------------------------------------------------------------------------------------# #---------------------------------------- Misc ---------------------------------------------# #-------------------------------------------------------------------------------------------# @@ -270,21 +319,30 @@ def make_directory(directory): os.makedirs(directory) +def merge_dicts(dicts): + """ Merge recursive keys and values for list of dicts into one dict. Values are added numerically / lists are extended / numpy arrays are added""" -def merge_dicts(dct, merge_dct): - """ Merge recursive keys and values of merge_dct into dct. Values are added numerically + def merger(dct, dct_to_add): + """ Merge recursive keys and values of dct_to_add into dct. Values are added numerically / lists are extended / numpy arrays are added No return - dct is changed in place """ - for k, v in merge_dct.items(): + for k, v in dct_to_add.items(): - #If k is a dict, go one level down - if (k in dct and isinstance(dct[k], dict)): # and isinstance(merge_dct[k], collections.Mapping)): - merge_dicts(dct[k], merge_dct[k]) - else: - if not k in dct: - dct[k] = merge_dct[k] - else: - dct[k] += merge_dct[k] + #If k is a dict, go one level down + if (k in dct and isinstance(dct[k], dict)): + merger(dct[k], dct_to_add[k]) + else: + if not k in dct: + dct[k] = dct_to_add[k] + else: + dct[k] += dct_to_add[k] + + #Initialize with the first dict in list + out_dict = copy.deepcopy(dicts[0]) + for dct in dicts[1:]: + merger(out_dict, dct) + + return(out_dict) def filafy(astring): #Make name into accepted filename @@ -306,7 +364,7 @@ def get_closest(value, arr): #-------------------------------------------------------------------------------------------# def common_prefix(strings): - """ Find the longest string that is a prefix of all the strings """ + """ Find the longest string that is a prefix of all the strings. Used in PlotChanges to find TF names from list """ if not strings: return '' prefix = strings[0] @@ -323,7 +381,7 @@ def common_prefix(strings): def match_lists(lofl): # list of lists - """ Find matches between list1 and list2 (output will be the length of list1 with one or more matches per element) """ + """ Find matches between list1 and list2 (output will be the length of list1 with one or more matches per element).""" #Remove common prefixes/suffixes per list prefixes = []
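# Hedged sketch (editorial note, not part of the patch): the reworked merge_dicts() now takes
# a list of dicts and returns a new, recursively merged dict (numeric leaves are added),
# rather than merging a second dict into the first in place. The per-core count dicts below
# are invented.
from tobias.utils.utilities import merge_dicts

counts_per_core = [
    {"chr1": {"reads": 10, "cutsites": 40}, "chr2": {"reads": 3, "cutsites": 12}},
    {"chr1": {"reads": 5,  "cutsites": 21}, "chr3": {"reads": 7, "cutsites": 30}},
]

merged = merge_dicts(counts_per_core)
print(merged)   # {'chr1': {'reads': 15, 'cutsites': 61}, 'chr2': {...}, 'chr3': {...}}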