Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
TOBIAS_snakemake/snakefiles/preprocessing.snake
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
193 lines (170 sloc)
7.99 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------------------------------------------------------ # | |
# ----------------------------------------------- Copy input files ------------------------------------------------- # | |
# ------------------------------------------------------------------------------------------------------------------ # | |
rule copy_flatfiles: | |
input: | |
fasta = config['run_info']['fasta'], | |
blacklist = config['run_info']['blacklist'], | |
gtf = config['run_info']['gtf'] | |
output: | |
fasta = FASTA, | |
blacklist = BLACKLIST, | |
gtf = GTF | |
shell: | |
"cp {input.fasta} {output.fasta};" | |
"cp {input.blacklist} {output.blacklist};" | |
"cp {input.gtf} {output.gtf}" | |
# ------------------------------------------------------------------------------------------------------------------ # | |
# ---------------------------------------------- Config processing ------------------------------------------------- # | |
# ------------------------------------------------------------------------------------------------------------------ # | |
#Write config to get an overview of input files | |
rule write_config: | |
output: | |
os.path.join(OUTPUTDIR, "config.yaml") | |
priority: | |
100 | |
run: | |
import yaml | |
with open(output[0], 'w') as yaml_file: | |
yaml.dump(config, yaml_file, default_flow_style=False) | |
# ------------------------------------------------------------------------------------------------------------------ # | |
# ------------------------------------------------- Bam processing ------------------------------------------------- # | |
# ------------------------------------------------------------------------------------------------------------------ # | |
# Merge sample bam files to condition bam file | |
# if only one sample per condition, copy sample bam to merged bam file name for further processing | |
rule conditionbam: | |
input: | |
lambda wildcards: config["data"][wildcards.condition] | |
output: | |
bam = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam"), | |
bai = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam.bai") | |
threads: | |
99 | |
message: | |
"Joining individual bamfiles from condition {wildcards.condition}" | |
run: | |
if len(input) > 1: | |
shell("samtools merge -@ {threads} {output.bam} {input}") | |
shell("samtools index {output.bam}") | |
else: | |
shell("cp {input} {output.bam}") | |
shell("samtools index {output.bam}") | |
# ------------------------------------------------------------------------------------------------------------------ # | |
# --------------------------------------------------- Peak-calling ------------------------------------------------- # | |
# ------------------------------------------------------------------------------------------------------------------ # | |
# Peak-calling | |
gsizes = {"human":"hs", | |
"mouse":"mm", | |
"zebrafish": 1369631918} #https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html | |
rule macs: | |
input: | |
lambda wildcards: id2bam[wildcards.condition][wildcards.sample_id] | |
output: | |
macs = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_peaks.broadPeak"), | |
raw = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_raw.bed") | |
log: | |
os.path.join(OUTPUTDIR, "logs", "{condition}_{sample_id}_peak_calling.log") | |
message: | |
"Running macs2 with .bam-file: {input}" | |
conda: | |
os.path.join(environments_dir, "macs.yaml") | |
params: | |
"--name {sample_id}", | |
"--outdir " + os.path.join(OUTPUTDIR, "peak_calling", "{condition}"), | |
"--gsize " + str(gsizes[config["run_info"]["organism"]]), | |
config.get("macs", "--nomodel --shift -100 --extsize 200 --broad"), | |
shell: | |
"macs2 callpeak -t {input} {params} &> {log}; " | |
"cp {output.macs} {output.raw}; " | |
# ------------------------------------------------------------------------------------------------------------------ # | |
# ------------------------------------------------- Peak-processing ------------------------------------------------ # | |
# ------------------------------------------------------------------------------------------------------------------ # | |
# process peaks: | |
# 1. reduce to genomic location columns and sort | |
# 2. merge peaks per condition | |
# 3. remove blacklisted regions and add unique peak ids | |
rule process_peaks: | |
input: | |
peaks = lambda wildcards: [os.path.join(OUTPUTDIR, "peak_calling", wildcards.condition, sample_id + "_raw.bed") for sample_id in id2bam[wildcards.condition].keys()], | |
blacklisted = BLACKLIST | |
output: | |
peaks = os.path.join(OUTPUTDIR, "peak_calling", "{condition}_union.bed") | |
message: "Processing peaks from condition {wildcards.condition}" | |
shell: | |
"cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | bedtools subtract -a - -b {input.blacklisted} -A | " | |
"awk '$1 !~ /[M]/' | " | |
"awk '{{print $s\"\\t{wildcards.condition}\"}}' > {output.peaks}" #add condition name to each peak | |
# Union peaks across all conditions | |
rule merge_condition_peaks: | |
input: | |
[os.path.join(OUTPUTDIR, "peak_calling", condition + "_union.bed") for condition in CONDITION_IDS] | |
output: | |
temp(os.path.join(OUTPUTDIR, "peak_calling", "all_merged.tmp")) | |
message: | |
"Merging peaks across conditions" | |
shell: | |
"cat {input} | sort -k1,1 -k2,2n | bedtools merge -d 5 -c 4 -o distinct > {output}" | |
#Get correct sorting of peak_names for | |
rule sort_peak_names: | |
input: | |
rules.merge_condition_peaks.output #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.tmp") | |
output: | |
peaks = os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed") | |
run: | |
out = open(output[0], "w") | |
with open(input[0]) as f: | |
for line in f: | |
columns = line.rstrip().split("\t") | |
#Sort according to condition names | |
peak_ids = columns[3].split(",") | |
columns[3] = ",".join(sorted(peak_ids, key= lambda x: CONDITION_IDS.index(x))) | |
out.write("\t".join(columns) + "\n") | |
out.close() | |
#Config for uropa annotation | |
rule uropa_config: | |
input: | |
bed = rules.sort_peak_names.output.peaks, #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed"), | |
gtf = GTF | |
output: | |
config = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.config") | |
run: | |
import json | |
config = {"queries":[ | |
{"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "name":"protein_coding_promoter"}, | |
{"feature":"gene", "distance":1, "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "internals":0.1, "name":"protein_coding_internal"}, | |
{"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "name":"any_promoter"}, | |
{"feature":"gene", "distance":1, "internals":0.1, "name":"any_internal"}, | |
{"feature":"gene", "distance":[50000, 50000], "name":"distal_enhancer"}, | |
], | |
"show_attributes":["gene_biotype", "gene_id", "gene_name"], | |
"priority":"True" | |
} | |
config["gtf"] = input.gtf | |
config["bed"] = input.bed | |
string_config = json.dumps(config, indent=4) | |
config_file = open(output[0], "w") | |
config_file.write(string_config) | |
config_file.close() | |
# Peak annotation | |
# peaks per condition or across conditions, dependent on run info output | |
rule uropa: | |
input: | |
config = rules.uropa_config.output.config #os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.config") | |
output: | |
finalhits = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits.txt"), | |
finalhits_sub = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits_sub.txt"), | |
peaks = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.bed"), | |
header = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_header.txt"), | |
threads: | |
99 | |
log: | |
os.path.join(OUTPUTDIR, "logs", "uropa.log") | |
params: | |
prefix = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated") | |
conda: | |
os.path.join(environments_dir, "uropa.yaml") | |
shell: | |
"uropa --input {input.config} --prefix {params.prefix} --threads {threads} --log {log}; " | |
"cut -f 1-4,7-13,16-19 {output.finalhits} > {output.finalhits_sub}; " #Get a subset of columns | |
"head -n 1 {output.finalhits_sub} > {output.header};" #header | |
"tail -n +2 {output.finalhits_sub} > {output.peaks}" #bedlines | |