Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
TOBIAS_snakemake/snakefiles/footprinting.snake
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
149 lines (134 sloc)
6.56 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# vim: syntax=python tabstop=4 expandtab | |
# coding: utf-8 | |
#--------------------------------------------------------------------------------------------------------# | |
#Copy motif files to folder | |
rule copy_motifs: | |
input: | |
config["run_info"]["motifs"] #list of files | |
output: | |
[os.path.join(OUTPUTDIR, "motifs", "individual", os.path.basename(f)) for f in config["run_info"]["motifs"]] | |
params: | |
outdir = os.path.join(OUTPUTDIR, "motifs", "individual") | |
run: | |
for f in input: | |
shell("cp {0} {1}".format(f, params.outdir)) | |
#Format motifs to one file | |
rule format_motifs: | |
input: | |
rules.copy_motifs.output #this is a directory | |
output: | |
joined = os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt") | |
priority: | |
2 | |
params: | |
motifdir = os.path.join(OUTPUTDIR, "motifs", "individual") | |
log: | |
os.path.join(OUTPUTDIR, "logs", "format_motifs.log") | |
shell: | |
"TOBIAS FormatMotifs --input {params.motifdir} --format pfm --task join --output {output} &>> {log}" | |
#--------------------------------------------------------------------------------------------------------# | |
#Correct reads for Tn5 sequence bias | |
rule atacorrect: | |
input: | |
bam = rules.conditionbam.output.bam, #os.path.join(OUTPUTDIR, "mapping", "{condition}.bam"), | |
peaks = rules.sort_peak_names.output, #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed"), | |
genome = FASTA | |
output: | |
uncorrected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_uncorrected.bw"), | |
bias = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_bias.bw"), | |
expected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_expected.bw"), | |
corrected = os.path.join(OUTPUTDIR, "bias_correction", "{condition}_corrected.bw"), | |
params: | |
"--blacklist " + BLACKLIST if BLACKLIST != "" else "", | |
"--outdir " + os.path.join(OUTPUTDIR, "bias_correction"), | |
"--prefix " + "{condition}", | |
config.get("atacorrect", "") | |
priority: | |
2 | |
threads: | |
99 #unless there are more than 99 cores, this rule will run on max threads | |
log: | |
os.path.join(OUTPUTDIR, "logs", "{condition}_atacorrect.log") | |
message: | |
"Running ATACorrect for condition {wildcards.condition} ({input.bam})" | |
shell: | |
"TOBIAS ATACorrect -b {input.bam} -g {input.genome} -p {input.peaks} --cores {threads} {params} &> {log}" | |
#--------------------------------------------------------------------------------------------------------# | |
#Calculate footprint scores per condition | |
rule footprinting: | |
input: | |
signal = rules.atacorrect.output.corrected, #os.path.join(OUTPUTDIR, "bias_correction", "{condition}_corrected.bw"), | |
regions = rules.sort_peak_names.output #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed") | |
output: | |
footprints = os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), | |
params: | |
config.get("footprinting", "") | |
priority: | |
2 | |
threads: | |
99 | |
log: | |
os.path.join(OUTPUTDIR, "logs", "{condition}_footprinting.log") | |
message: | |
"Running footprinting for condition {wildcards.condition} ({input.signal})" | |
shell: | |
"TOBIAS FootprintScores --signal {input.signal} --regions {input.regions} --output {output.footprints} --cores {threads} {params} &> {log}" | |
#--------------------------------------------------------------------------------------------------------# | |
#Estimate bound sites from scored file | |
checkpoint bindetect: | |
input: | |
motifs = rules.format_motifs.output, #os.path.join(OUTPUTDIR, "motifs", "all_motifs.txt"), | |
footprints = expand(rules.footprinting.output.footprints, condition=CONDITION_IDS), #os.path.join(OUTPUTDIR, "footprinting", "{condition}_footprints.bw"), condition=CONDITION_IDS), | |
genome = FASTA, | |
peaks = rules.uropa.output.peaks, #os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.bed"), | |
peak_header = rules.uropa.output.header #os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_header.txt") | |
output: | |
directory(os.path.join(OUTPUTDIR, "TFBS")) | |
#os.path.join(OUTPUTDIR, "overview", "bindetect_results.txt") | |
#bound_files = dynamic(expand(os.path.join(OUTPUTDIR, "TFBS", "{{TF}}", "beds", "{{TF}}_{condition}_bound.bed"), condition=CONDITION_IDS)), #, TF={TF}"), | |
#unbound_files = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_{condition}_unbound.bed"), condition=CONDITION_IDS, TF=TF_IDS), | |
#all_files = expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_all.bed"), TF=TF_IDS), | |
#overview = dynamic(expand(os.path.join(OUTPUTDIR, "TFBS", "{{TF}}", "{{TF}}_overview.txt"))), | |
#figures = os.path.join(OUTPUTDIR, "TFBS", "bindetect_figures.pdf"), | |
#local = [os.path.join(OUTPUTDIR, "TFBS", fil) for fil in ["bindetect_results.txt", "bindetect_results.xlsx"]], | |
threads: 99 | |
priority: 2 | |
log: | |
os.path.join(OUTPUTDIR, "logs", "bindetect.log") | |
params: | |
config.get("bindetect", ""), | |
"--cond_names " + " ".join(CONDITION_IDS), | |
message: | |
"Running BINDetect" | |
shell: | |
"TOBIAS BINDetect --motifs {input.motifs} --signals {input.footprints} --genome {input.genome} " | |
"--peaks {input.peaks} --peak_header {input.peak_header} --cores {threads} --outdir {output} {params} &> {log}; " | |
"mkdir -p " + os.path.join(OUTPUTDIR, "overview") + ";" | |
"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.txt") + " " + os.path.join(OUTPUTDIR, "overview") + ";" #move files to overview | |
"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.xlsx") + " " + os.path.join(OUTPUTDIR, "overview") + ";" | |
"cp " + os.path.join(OUTPUTDIR, "TFBS", "*.pdf") + " " + os.path.join(OUTPUTDIR, "overview") + ";" | |
#--------------------------------------------------------------------------------------------------------# | |
def get_TF_ids(wildcards): | |
bindetect_output = checkpoints.bindetect.get(**wildcards).output[0] | |
TF_IDS = glob_wildcards(os.path.join(bindetect_output, "{TF}", "beds")).TF #wildcard for each TF from dir name | |
return(TF_IDS) | |
#--------------------------------------------------------------------------------------------------------# | |
#Join bound estimates per condition | |
rule join_bound: | |
input: | |
lambda wildcards: expand(os.path.join(OUTPUTDIR, "TFBS", "{TF}", "beds", "{TF}_{{condition}}_bound.bed"), TF=get_TF_ids(wildcards)), | |
output: | |
unsorted = temp(os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.tmp")), | |
final = os.path.join(OUTPUTDIR, "overview", "all_{condition}_bound.bed") | |
priority: | |
2 | |
threads: | |
99 #Make sure that only one join_bound runs at a time; due to memory issues | |
run: | |
n = 100 #chunks of 100 files | |
shell("> " + output[0]) | |
file_chunks = [input[i:i+n] for i in range(0,len(input),n)] | |
for chunk in file_chunks: | |
shell("cat {0} | bedtools sort >> {1}".format(" ".join(chunk), output.unsorted)) | |
shell("sort -k 1,1 -k2,2 -n {output.unsorted} > {output.final}") | |
shell("igvtools index {output.final}") |