Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
TOBIAS_snakemake/snakefiles/preprocessing.snake
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
206 lines (183 sloc)
8.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------------------------------------------------------ #
# ----------------------------------------------- Copy input files ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Copy the user-supplied reference files into the pipeline output directory,
# so downstream rules can depend on stable local paths (FASTA/BLACKLIST/GTF
# are the workflow-level target paths defined elsewhere in the pipeline).
rule copy_flatfiles:
    input:
        fasta = config['run_info']['fasta'],
        blacklist = config['run_info']['blacklist'],
        gtf = config['run_info']['gtf']
    output:
        fasta = FASTA,
        blacklist = BLACKLIST,
        gtf = GTF
    shell:
        "cp {input.fasta} {output.fasta};"
        "cp {input.blacklist} {output.blacklist};"
        "cp {input.gtf} {output.gtf}"
#Get chromosomes available in fasta
# Produces a two-column chrom/size table (via pyfaidx's "faidx -i chromsizes")
# and a BED version of it (chrom, 0, size), later used as a whitelist of
# valid genomic regions.
rule get_fasta_chroms:
    input:
        rules.copy_flatfiles.output.fasta
    output:
        txt = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.txt"),
        bed = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.bed")
    shell:
        "faidx {input} -i chromsizes > {output.txt};"
        "awk '{{ print $1\"\t\"0\"\t\"$2 }}' {output.txt} > {output.bed}"
# ------------------------------------------------------------------------------------------------------------------ #
# ---------------------------------------------- Config processing ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
#Write config to get an overview of input files
# Dumps the run configuration to YAML in the output directory so a finished
# run documents exactly which inputs/settings produced it.
rule write_config:
    output:
        os.path.join(OUTPUTDIR, "config.yaml")
    priority:
        100  # high priority: scheduled before other pending jobs
    run:
        import yaml
        with open(output[0], 'w') as yaml_file:
            yaml.dump(config, yaml_file, default_flow_style=False)
# ------------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------- Bam processing ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Merge sample bam files to condition bam file
# if only one sample per condition, copy sample bam to merged bam file name for further processing
# Build one bam per condition from its sample bams: several samples are
# merged with samtools, a single sample is simply copied so downstream
# rules can always rely on the merged filename. The result is indexed.
rule conditionbam:
    input:
        lambda wildcards: config["data"][wildcards.condition]
    output:
        bam = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam"),
        bai = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam.bai")
    threads:
        99
    message:
        "Joining individual bamfiles from condition {wildcards.condition}"
    run:
        if len(input) == 1:
            shell("cp {input} {output.bam}")
        else:
            shell("samtools merge -@ {threads} {output.bam} {input}")
        # indexing is common to both branches
        shell("samtools index {output.bam}")
# ------------------------------------------------------------------------------------------------------------------ #
# --------------------------------------------------- Peak-calling ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Peak-calling
# Effective genome sizes passed to macs2 --gsize. "hs"/"mm" are macs2's
# built-in shortcuts; zebrafish has none, so an explicit size is taken from
# https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html
gsizes = {
    "human": "hs",
    "mouse": "mm",
    "zebrafish": 1369631918,
}
# Call peaks per sample with macs2 (broad peaks), and keep an unmodified
# copy of the broadPeak file as "<sample_id>_raw.bed" for later processing.
rule macs:
    input:
        lambda wildcards: id2bam[wildcards.condition][wildcards.sample_id]
    output:
        macs = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_peaks.broadPeak"),
        raw = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_raw.bed")
    log:
        os.path.join(OUTPUTDIR, "logs", "{condition}_{sample_id}_peak_calling.log")
    message:
        "Running macs2 with .bam-file: {input}"
    conda:
        os.path.join(environments_dir, "macs.yaml")
    params:
        "--name {sample_id}",  # wildcards are expanded inside params strings
        "--outdir " + os.path.join(OUTPUTDIR, "peak_calling", "{condition}"),
        "--gsize " + str(gsizes[config["run_info"]["organism"]]),
        # user-supplied macs options, or ATAC-style defaults
        config.get("macs", "--nomodel --shift -100 --extsize 200 --broad"),
    shell:
        "macs2 callpeak -t {input} {params} &> {log}; "
        "cp {output.macs} {output.raw}; "
# ------------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------- Peak-processing ------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------------ #
# process peaks:
# 1. reduce to genomic location columns and sort
# 2. merge peaks per condition
# 3. remove blacklisted regions and add unique peak ids
# Build the per-condition union peak set:
# cut to chrom/start/end, sort, merge peaks within 5 bp, drop blacklisted
# regions, keep only chromosomes present in the fasta, exclude chrM, and
# append the condition name as a 4th column.
rule process_peaks:
    input:
        peaks = lambda wildcards: [os.path.join(OUTPUTDIR, "peak_calling", wildcards.condition, sample_id + "_raw.bed") for sample_id in id2bam[wildcards.condition].keys()],
        blacklist = BLACKLIST,
        whitelist = rules.get_fasta_chroms.output.bed
    output:
        peaks = os.path.join(OUTPUTDIR, "peak_calling", "{condition}_union.bed")
    message: "Processing peaks from condition {wildcards.condition}"
    shell:
        "cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | "
        "bedtools subtract -a - -b {input.blacklist} -A | "
        "bedtools intersect -a - -b {input.whitelist} -wa | "
        "awk '$1 !~ /[M]/' | " #exclude chromosome M
        # NOTE: was '$s' — an uninitialized awk variable, which only worked
        # because empty coerces to 0 and $0 is the whole line; use $0 explicitly.
        "awk '{{print $0\"\\t{wildcards.condition}\"}}' > {output.peaks}" #add condition name to each peak
# Union peaks across all conditions
# Concatenate the per-condition union files, sort, and merge peaks within
# 5 bp; "-c 4 -o distinct" collects the distinct condition names of each
# merged peak into column 4.
rule merge_condition_peaks:
    input:
        [os.path.join(OUTPUTDIR, "peak_calling", condition + "_union.bed") for condition in CONDITION_IDS]
    output:
        temp(os.path.join(OUTPUTDIR, "peak_calling", "all_merged.tmp"))
    message:
        "Merging peaks across conditions"
    shell:
        "cat {input} | sort -k1,1 -k2,2n | bedtools merge -d 5 -c 4 -o distinct > {output}"
#Get correct sorting of peak_names
# Column 4 holds the comma-separated condition names produced by
# 'bedtools merge -o distinct'; re-sort that list so it always follows
# the CONDITION_IDS order rather than bedtools' lexical order.
rule sort_peak_names:
    input:
        rules.merge_condition_peaks.output
    output:
        peaks = os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed")
    run:
        # context managers ensure both handles are closed even on error
        # (the original leaked the output handle if a line failed to parse)
        with open(input[0]) as f, open(output[0], "w") as out:
            for line in f:
                columns = line.rstrip().split("\t")
                #Sort according to condition names
                peak_ids = columns[3].split(",")
                columns[3] = ",".join(sorted(peak_ids, key=CONDITION_IDS.index))
                out.write("\t".join(columns) + "\n")
#Config for uropa annotation
# Writes the JSON config consumed by the 'uropa' rule: annotation queries
# are tried in order (priority=True), from protein-coding promoters/genes
# down to distal regions within 50 kb.
rule uropa_config:
    input:
        bed = rules.sort_peak_names.output.peaks, #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed"),
        gtf = GTF
    output:
        config = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.config")
    run:
        import json
        # named 'uropa_cfg' to avoid shadowing snakemake's global 'config'
        uropa_cfg = {"queries":[
                    {"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "name":"protein_coding_promoter"},
                    {"feature":"gene", "distance":1, "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "internals":0.1, "name":"protein_coding_internal"},
                    {"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "name":"any_promoter"},
                    {"feature":"gene", "distance":1, "internals":0.1, "name":"any_internal"},
                    {"feature":"gene", "distance":[50000, 50000], "name":"distal_enhancer"},
                    ],
                "show_attributes":["gene_biotype", "gene_id", "gene_name"],
                "priority":"True"
                }
        uropa_cfg["gtf"] = input.gtf
        uropa_cfg["bed"] = input.bed
        # json.dump under a context manager; the original built a string and
        # wrote it to a handle that was never closed on error
        with open(output[0], "w") as config_file:
            json.dump(uropa_cfg, config_file, indent=4)
# Peak annotation
# peaks per condition or across conditions, dependent on run info output
# Runs UROPA on the merged peak set, then derives three convenience files
# from the finalhits table: a column subset, its header line alone, and a
# headerless (BED-like) version of the annotated peaks.
rule uropa:
    input:
        config = rules.uropa_config.output.config #os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.config")
    output:
        finalhits = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits.txt"),
        finalhits_sub = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits_sub.txt"),
        peaks = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.bed"),
        header = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_header.txt"),
    threads:
        99
    log:
        os.path.join(OUTPUTDIR, "logs", "uropa.log")
    params:
        prefix = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated")
    conda:
        os.path.join(environments_dir, "uropa.yaml")
    shell:
        "uropa --input {input.config} --prefix {params.prefix} --threads {threads} --log {log}; "
        "cut -f 1-4,7-13,16-19 {output.finalhits} > {output.finalhits_sub}; " #Get a subset of columns
        "head -n 1 {output.finalhits_sub} > {output.header};" #header
        "tail -n +2 {output.finalhits_sub} > {output.peaks}" #bedlines