0.6.2: internal changes to bindetect
msbentsen committed Jun 19, 2019
1 parent 8cad568 · commit 4f346b5
Showing 6 changed files with 61 additions and 36 deletions.
CHANGES: 5 changes (5 additions, 0 deletions)
@@ -1,3 +1,8 @@
+## 0.6.2 (2019-06-19)
+- Added --skip-excel option to skip the per-factor excel overview in BINDetect (writing excel files can slow the run considerably for large outputs)
+- Internal changes to BINDetect for better debug overview
+- Fixed normalization for plotAggregate
+
 ## 0.6.1 (2019-06-05)
 - Fixed motif cluster names to prefix C_
 - Added --filter function to format_motifs to filter output motifs using names/ids
tobias/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -1 +1 @@
__version__ = "0.6.1"
__version__ = "0.6.2"
tobias/footprinting/bindetect.py: 5 changes (4 additions, 1 deletion)
@@ -80,6 +80,7 @@ def add_bindetect_arguments(parser):
     optargs.add_argument('--bound-pvalue', metavar="<float>", type=lambda x: restricted_float(x, 0, 1), help="Set p-value threshold for bound/unbound split (default: 0.001)", default=0.001)
     optargs.add_argument('--pseudo', type=float, metavar="<float>", help="Pseudocount for calculating log2fcs (default: estimated from data)", default=None)
     optargs.add_argument('--time-series', action='store_true', help="Will only compare signals1<->signals2<->signals3 (...) in order of input, and skip all-against-all comparison.")
+    optargs.add_argument('--skip-excel', action='store_true', help="Skip creation of excel files - for large datasets, this will speed up BINDetect considerably")
 
     runargs = parser.add_argument_group("Run arguments")
     runargs.add_argument('--outdir', metavar="<directory>", help="Output directory to place TFBS/plots in (default: bindetect_output)", default="bindetect_output")
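For context: restricted_float (used by --bound-pvalue above) is defined elsewhere in TOBIAS; only its call signature appears in this diff. A minimal sketch of such an argparse range validator, with the function body and error text as assumptions:

```python
import argparse

def restricted_float(x, lower, upper):
    """Parse x as a float and require lower <= x <= upper (sketch, not the TOBIAS implementation)."""
    x = float(x)  # non-numeric input raises ValueError, which argparse reports as invalid
    if not (lower <= x <= upper):
        raise argparse.ArgumentTypeError("{0} is outside the allowed range [{1}, {2}]".format(x, lower, upper))
    return x

parser = argparse.ArgumentParser()
parser.add_argument('--bound-pvalue', type=lambda x: restricted_float(x, 0, 1), default=0.001)
print(parser.parse_args(['--bound-pvalue', '0.05']).bound_pvalue)  # 0.05
```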
@@ -204,6 +205,9 @@ def run_bindetect(args):
     peak_chroms = peaks.get_chroms()
     peak_columns = len(peaks[0]) #number of columns
 
+    if args.debug:
+        peaks = RegionList(peaks[:1000])
+
     #Header
     if args.peak_header != None:
         content = open(args.peak_header, "r").read()
@@ -236,7 +240,6 @@ def run_bindetect(args):
     bg = np.array([(1-args.gc)/2.0, args.gc/2.0, args.gc/2.0, (1-args.gc)/2.0])
     logger.info("- GC content estimated at {0:.2f}%".format(gc_content*100))
 
-
     ################ Get motifs ################
     logger.info("Reading motifs from file")
     motif_list = MotifList()
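The bg line in the hunk above turns a single GC fraction into a symmetric A/C/G/T background distribution: A and T each receive (1-GC)/2, C and G each receive GC/2. A self-contained sketch with an illustrative GC value:

```python
import numpy as np

gc = 0.41  # illustrative GC fraction; TOBIAS estimates this from the peak sequences
# Background frequencies in A, C, G, T order; AT and GC mass are each split evenly
bg = np.array([(1 - gc) / 2.0, gc / 2.0, gc / 2.0, (1 - gc) / 2.0])
assert np.isclose(bg.sum(), 1.0)  # a proper probability distribution
print(bg)  # [0.295 0.205 0.205 0.295]
```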
tobias/footprinting/bindetect_functions.py: 42 changes (26 additions, 16 deletions)
@@ -321,7 +321,7 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
             outfile = os.path.join(bed_outdir, "{0}_{1}_{2}.bed".format(TF_name, condition, state))
             chosen_bool = 1 if state == "bound" else 0
             bedlines_subset = [bedline for bedline in bedlines if bedline[condition + "_bound"] == chosen_bool]
-            bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
+            #bedlines_subset = sorted(bedlines_subset, key= lambda line: line[condition + "_score"], reverse=True)
             dict_to_tab(bedlines_subset, outfile, chosen_columns)
 
     ##### Write overview with scores, bound and log2fcs ####
@@ -331,22 +331,27 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf

     #Write xlsx overview
     bed_table = pd.DataFrame(bedlines)
-    try:
-        overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
-        writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter')
-        bed_table.to_excel(writer, index=False, columns=overview_columns)
-
-        worksheet = writer.sheets['Sheet1']
-        no_rows, no_cols = bed_table.shape
-        worksheet.autofilter(0,0,no_rows, no_cols-1)
-        writer.save()
-
-    except:
-        print("Error writing excelfile for TF {0}".format(TF_name))
-        sys.exit()
-
+    stime_excel = datetime.now()
+    if args.skip_excel == False:
+        try:
+            overview_excel = os.path.join(args.outdir, TF_name, TF_name + "_overview.xlsx")
+            writer = pd.ExcelWriter(overview_excel, engine='xlsxwriter') #, options=dict(constant_memory=True))
+            bed_table.to_excel(writer, index=False, columns=overview_columns)
+
+            #autofilter not possible with constant_memory
+            worksheet = writer.sheets['Sheet1']
+            no_rows, no_cols = bed_table.shape
+            worksheet.autofilter(0,0,no_rows, no_cols)
+            writer.save()
+
+        except Exception as e:
+            print("Error writing excelfile for TF {0}".format(TF_name))
+            print(e)
+            sys.exit()
+
+    etime_excel = datetime.now()
     etime = datetime.now()
-    logger.debug("{0} - Local effects took:\t{1}".format(TF_name, etime - stime))
+    logger.debug("{0} - Local effects took:\t{1} (excel: {2})".format(TF_name, etime - stime, etime_excel - stime_excel))
 
     ############################## Global effects ##############################
 
@@ -402,6 +407,8 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
         info_table.at[TF_name, base + "_change"] = 0
         info_table.at[TF_name, base + "_pvalue"] = 1
 
+    #stime_plot = datetime.now()
+
     #### Plot comparison ###
     fig, ax = plt.subplots(1,1)
     ax.hist(observed_log2fcs, bins='auto', label="Observed log2fcs", density=True)
@@ -433,6 +440,9 @@ def process_tfbs(TF_name, args, log2fc_params): #per tf
     log2fc_pdf.savefig(fig, bbox_inches='tight')
     plt.close(fig)
 
+    #etime_plot = datetime.now()
+    #logger.debug("{0} - Plotting took:\t{1}".format(TF_name, etime_plot - stime_plot))
+
     log2fc_pdf.close()
 
     etime = datetime.now()
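A minimal, self-contained sketch of the guarded xlsx export introduced above; the skip flag and timing mirror the diff, the DataFrame content is illustrative, and writer.close() replaces the writer.save() of 2019-era pandas:

```python
from datetime import datetime
import pandas as pd

skip_excel = False  # stands in for args.skip_excel; set True to bypass the slow export
bed_table = pd.DataFrame({"TFBS_score": [12.3, 8.1], "cond1_bound": [1, 0]})  # illustrative

stime_excel = datetime.now()
if not skip_excel:
    writer = pd.ExcelWriter("TF_overview.xlsx", engine="xlsxwriter")
    bed_table.to_excel(writer, index=False)

    # Add a filter dropdown across the table; xlsxwriter indices are zero-based and
    # inclusive, and the header occupies row 0, so the last data row index is no_rows
    worksheet = writer.sheets["Sheet1"]
    no_rows, no_cols = bed_table.shape
    worksheet.autofilter(0, 0, no_rows, no_cols - 1)
    writer.close()
print("excel step took:", datetime.now() - stime_excel)
```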
tobias/motifs/format_motifs.py: 15 changes (14 additions, 1 deletion)
@@ -68,20 +68,30 @@ def run_formatmotifs(args):
     motif_list = MotifList()
     converted_content = ""
     for f in motif_files:
-        logger.info("- {0}".format(f))
+        logger.debug("- {0}".format(f))
         motif_list.extend(MotifList().from_file(f))
 
+    logger.info("Read {} motifs\n".format(len(motif_list)))
+
+    #Sort out duplicate motifs
+    all_motif_ids = [motif.id for motif in motif_list]
+    unique_motif_ids = set(all_motif_ids)
+    if len(all_motif_ids) != len(unique_motif_ids):
+        logger.info("Found duplicate motif ids in file - choosing first motif with unique id.")
+        motif_list = MotifList([motif_list[all_motif_ids.index(motifid)] for motifid in unique_motif_ids])
+        logger.info("Reduced to {0} unique motif ids".format(len(motif_list)))
+
     ### Filter motif list ###
     if args.filter != None:
 
         #Read filter
         logger.info("Reading entries in {0}".format(args.filter))
         entries = open(args.filter, "r").read().split()
         logger.info("Read {0} unique filter values".format(len(set(entries))))
 
         #Match to input motifs #print(entries)
         logger.info("Matching motifs to filter")
+        used_filters = []
         filtered_list = MotifList()
         for input_motif in motif_list:
             found_in_filter = 0
@@ -92,13 +102,16 @@ def run_formatmotifs(args):
                 filtered_list.append(input_motif)
                 logger.debug("Selected motif {0} ({1}) due to filter value {2}".format(input_motif.name, input_motif.id, entries[i]))
                 found_in_filter = 1
+                used_filters.append(entries[i])
 
             if found_in_filter == 0:
                 logger.debug("Could not find any match to motif {0} ({1}) in filter".format(input_motif.name, input_motif.id))
 
         logger.info("Filtered number of motifs from {0} to {1}".format(len(motif_list), len(filtered_list)))
         motif_list = filtered_list
 
+        logger.debug("Filters not used: {0}".format(list(set(entries) - set(used_filters))))
+
     #### Write out results ####
     if args.task == "split":
         logger.info("Writing individual files to directory {0}".format(args.output))
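The duplicate handling above looks up the first index of each id from a set, which does not preserve input order (sets are unordered). A hedged sketch of an equivalent first-occurrence dedup that keeps order; the Motif class here is a minimal stand-in for the TOBIAS motif objects:

```python
class Motif:
    """Minimal stand-in for a TOBIAS motif exposing the .id attribute used in the diff."""
    def __init__(self, id):
        self.id = id

def dedup_first_occurrence(motifs):
    """Keep the first motif per id, preserving input order."""
    seen = set()
    unique = []
    for motif in motifs:
        if motif.id not in seen:
            seen.add(motif.id)
            unique.append(motif)
    return unique

motifs = [Motif("MA0139.1"), Motif("MA0139.1"), Motif("MA0003.4")]
print([m.id for m in dedup_first_occurrence(motifs)])  # ['MA0139.1', 'MA0003.4']
```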
tobias/plotting/plot_aggregate.py: 28 changes (11 additions, 17 deletions)
@@ -63,7 +63,7 @@ def add_aggregate_arguments(parser):
     PLOT.add_argument('--share-y', metavar="", help="Share y-axis range across plots (none/signals/sites/both). Use \"--share_y signals\" if bigwig signals have similar ranges. Use \"--share_y sites\" if sites per bigwig are comparable, but bigwigs themselves aren't comparable. (default: none)", choices=["none", "signals", "sites", "both"], default="none")
 
     #Signals / regions
-    PLOT.add_argument('--norm-comparisons', action='store_true', help="Normalize the aggregate signal in comparison column/row to the same range (default: the true range is shown)")
+    PLOT.add_argument('--normalize', action='store_true', help="Normalize the aggregate signal to be in the same range (default: the true range of values is shown)")
     PLOT.add_argument('--negate', action='store_true', help="Negate overlap with regions")
     PLOT.add_argument('--log-transform', help="", action="store_true")
     PLOT.add_argument('--plot-boundaries', help="Plot TFBS boundaries", action='store_true')
@@ -250,8 +250,12 @@ def run_aggregate(args):
             signalmat_log = np.log2(signalmat_abs + 1)
             signalmat_log[signalmat < 0] *= -1 #original negatives back to <0
             signalmat = signalmat_log
 
         aggregate = np.nanmean(signalmat, axis=0)
+
+        if args.normalize:
+            aggregate = preprocessing.minmax_scale(aggregate)
+
         aggregate_dict[signal_name][region_name] = aggregate
 
         signalmat = None
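The hunk above applies a sign-preserving log transform, sign(x)·log2(|x|+1), before averaging over sites, and the new --normalize option then min-max scales the aggregate into [0, 1]. Both steps as a standalone sketch with synthetic values:

```python
import numpy as np
from sklearn import preprocessing

signalmat = np.array([[-3.0, 0.5, 8.0], [1.0, -0.25, 4.0]])  # rows = sites, cols = positions

# Sign-preserving log transform: sign(x) * log2(|x| + 1)
signalmat_log = np.log2(np.abs(signalmat) + 1)
signalmat_log[signalmat < 0] *= -1

aggregate = np.nanmean(signalmat_log, axis=0)            # mean footprint per position
aggregate_norm = preprocessing.minmax_scale(aggregate)   # squeezed into [0, 1] for comparison
print(aggregate_norm)
```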
@@ -263,7 +267,7 @@ def run_aggregate(args):

     logger.comment("")
     logger.info("---- Analysis ----")
 
     #Measure of footprint depth in comparison to baseline
     logger.info("Calculating footprint depth measure")
     logger.info("FPD (signal,regions): footprint_width baseline middle FPD")
@@ -430,28 +434,18 @@ def run_aggregate(args):
             aggregate = aggregate_dict[signal_name][region_name]
             axarr[row, col].plot(xvals, aggregate, color=colors[col+row], linewidth=1, label=signal_name)
 
-            aggregate_norm = preprocessing.minmax_scale(aggregate)
-
             #Compare across rows and cols
-            if col_compare: #compare between different columns by adding one more column
-                if args.norm_comparisons:
-                    aggregate_compare = aggregate_norm
-                else:
-                    aggregate_compare = aggregate
-                axarr[row, -1].plot(xvals, aggregate_compare, color=colors[row+col], linewidth=1, alpha=0.8, label=col_names[col])
+            if col_compare: #compare between different columns by adding one more column
+                axarr[row, -1].plot(xvals, aggregate, color=colors[row+col], linewidth=1, alpha=0.8, label=col_names[col])
                 axarr[row, -1].legend(loc="lower right")
 
             if row_compare: #compare between different rows by adding one more row
-                if args.norm_comparisons:
-                    aggregate_compare = aggregate_norm
-                else:
-                    aggregate_compare = aggregate
-                axarr[-1, col].plot(xvals, aggregate_compare, color=colors[row+col], linewidth=1, alpha=0.8, label=row_names[row])
+                axarr[-1, col].plot(xvals, aggregate, color=colors[row+col], linewidth=1, alpha=0.8, label=row_names[row])
                 axarr[-1, col].legend(loc="lower right")
 
             #Diagonal comparison
             if n_rows == n_cols and col_compare and row_compare and col == row:
-                axarr[-1, -1].plot(xvals, aggregate_compare, color=colors[row+col], linewidth=1, alpha=0.8)
+                axarr[-1, -1].plot(xvals, aggregate, color=colors[row+col], linewidth=1, alpha=0.8)
 
             #Add number of sites to plot
             axarr[row, col].text(0.98,0.98,str(len(regions_dict[region_name])), transform = axarr[row, col].transAxes, fontsize=12, va="top", ha="right")
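The grid logic above overlays each condition's aggregate into an extra comparison column (and row); since 0.6.2 the curves are already normalized upstream, so the panels plot aggregate directly. A reduced sketch of that layout with synthetic data (one row of conditions plus the overlay column):

```python
import numpy as np
import matplotlib.pyplot as plt

xvals = np.arange(-60, 60)
signals = {"cond1": np.exp(-(xvals / 30.0) ** 2), "cond2": 0.7 * np.exp(-(xvals / 20.0) ** 2)}

# One panel per condition plus a final "comparison" panel overlaying all conditions
fig, axarr = plt.subplots(1, len(signals) + 1, figsize=(10, 3), squeeze=False)
for col, (name, aggregate) in enumerate(signals.items()):
    axarr[0, col].plot(xvals, aggregate, linewidth=1)
    axarr[0, col].set_title(name)
    axarr[0, -1].plot(xvals, aggregate, linewidth=1, alpha=0.8, label=name)  # overlay
axarr[0, -1].set_title("comparison")
axarr[0, -1].legend(loc="lower right")
plt.savefig("aggregate_sketch.pdf", bbox_inches="tight")
```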
