Skip to content
Permalink
b348eb7823
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
193 lines (170 sloc) 7.99 KB
# ------------------------------------------------------------------------------------------------------------------ #
# ----------------------------------------------- Copy input files ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
rule copy_flatfiles:
    # Copy the three reference files named in config['run_info'] to the
    # workflow-local paths FASTA, BLACKLIST and GTF.
    input:
        fasta = config['run_info']['fasta'],
        blacklist = config['run_info']['blacklist'],
        gtf = config['run_info']['gtf']
    output:
        fasta = FASTA,
        blacklist = BLACKLIST,
        gtf = GTF
    shell:
        # One cp per file; the three literals are concatenated into a single command line.
        "cp {input.fasta} {output.fasta};"
        "cp {input.blacklist} {output.blacklist};"
        "cp {input.gtf} {output.gtf}"
# ------------------------------------------------------------------------------------------------------------------ #
# ---------------------------------------------- Config processing ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
#Write config to get an overview of input files
rule write_config:
    # Serialize the workflow's `config` mapping as block-style YAML to
    # <OUTPUTDIR>/config.yaml.
    priority:
        100
    output:
        os.path.join(OUTPUTDIR, "config.yaml")
    run:
        import yaml

        # Without a stream argument yaml.dump returns the document as a string.
        yaml_text = yaml.dump(config, default_flow_style=False)
        with open(output[0], 'w') as handle:
            handle.write(yaml_text)
# ------------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------- Bam processing ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Merge sample bam files to condition bam file
# if only one sample per condition, copy sample bam to merged bam file name for further processing
rule conditionbam:
    # Produce one indexed bam per condition from the bam files listed under
    # config["data"][<condition>].
    input:
        lambda wildcards: config["data"][wildcards.condition]
    output:
        bam = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam"),
        bai = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam.bai")
    threads:
        99
    message:
        "Joining individual bamfiles from condition {wildcards.condition}"
    run:
        if len(input) <= 1:
            # Nothing to merge: the sample bam is copied under the condition name.
            shell("cp {input} {output.bam}")
        else:
            shell("samtools merge -@ {threads} {output.bam} {input}")

        # The index is built the same way in both cases.
        shell("samtools index {output.bam}")
# ------------------------------------------------------------------------------------------------------------------ #
# --------------------------------------------------- Peak-calling ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Peak-calling
# Value handed to macs2 via "--gsize" for each supported organism (see rule macs).
# Zebrafish has no string entry here, so a numeric size is used; source:
# https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html
gsizes = dict(
    human="hs",
    mouse="mm",
    zebrafish=1369631918,
)
rule macs:
    # Call peaks with macs2 on a single sample bam, then copy the resulting
    # broadPeak file to <sample_id>_raw.bed.
    message:
        "Running macs2 with .bam-file: {input}"
    conda:
        os.path.join(environments_dir, "macs.yaml")
    log:
        os.path.join(OUTPUTDIR, "logs", "{condition}_{sample_id}_peak_calling.log")
    input:
        lambda wildcards: id2bam[wildcards.condition][wildcards.sample_id]
    output:
        macs = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_peaks.broadPeak"),
        raw = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_raw.bed")
    params:
        # Positional on purpose: "{params}" in the shell command expands to all
        # entries in this order.
        "--name {sample_id}",
        "--outdir " + os.path.join(OUTPUTDIR, "peak_calling", "{condition}"),
        # Fails with KeyError while the workflow is parsed if the organism is not in gsizes.
        "--gsize " + str(gsizes[config["run_info"]["organism"]]),
        # Extra macs2 options; config["macs"] replaces the default string entirely.
        config.get("macs", "--nomodel --shift -100 --extsize 200 --broad"),
    shell:
        "macs2 callpeak -t {input} {params} &> {log}; "
        "cp {output.macs} {output.raw}; "
# ------------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------- Peak-processing ------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------------ #
# process peaks:
# 1. reduce to genomic location columns and sort
# 2. merge peaks per condition
# 3. remove blacklisted regions and add unique peak ids
rule process_peaks:
    # Build the union peak set of one condition:
    #   1. concatenate the *_raw.bed files of all samples, keep columns 1-3 and sort
    #   2. merge the sorted intervals (bedtools merge -d 5)
    #   3. subtract the blacklist (bedtools subtract -A)
    #   4. drop records whose first column contains an "M"
    #   5. append the condition name as an extra tab-separated column
    input:
        peaks = lambda wildcards: [
            os.path.join(OUTPUTDIR, "peak_calling", wildcards.condition, sample_id + "_raw.bed")
            for sample_id in id2bam[wildcards.condition].keys()
        ],
        blacklisted = BLACKLIST
    output:
        peaks = os.path.join(OUTPUTDIR, "peak_calling", "{condition}_union.bed")
    message: "Processing peaks from condition {wildcards.condition}"
    shell:
        "cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | bedtools subtract -a - -b {input.blacklisted} -A | "
        "awk '$1 !~ /[M]/' | "
        "awk '{{print $0\"\\t{wildcards.condition}\"}}' > {output.peaks}" #add condition name to each peak; $0 is the whole record (the former "$s" only worked because the unset awk variable s evaluates to 0)
# Union peaks across all conditions
rule merge_condition_peaks:
    # Concatenate the per-condition union files, sort them, and merge the
    # intervals while collecting the distinct values of column 4 for each
    # merged interval. The result is a temporary file.
    message:
        "Merging peaks across conditions"
    input:
        [
            os.path.join(OUTPUTDIR, "peak_calling", cond_name + "_union.bed")
            for cond_name in CONDITION_IDS
        ]
    output:
        temp(os.path.join(OUTPUTDIR, "peak_calling", "all_merged.tmp"))
    shell:
        "cat {input} | sort -k1,1 -k2,2n | bedtools merge -d 5 -c 4 -o distinct > {output}"
# Sort the comma-separated condition names in column 4 of each merged peak into the order given by CONDITION_IDS
rule sort_peak_names:
    # Rewrite the merged peak file so that the comma-separated names in
    # column 4 follow the order of CONDITION_IDS; all other columns are
    # written back unchanged.
    input:
        rules.merge_condition_peaks.output
    output:
        peaks = os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed")
    run:
        # Both handles live in one "with" so the output file is closed even
        # if a line raises (fewer than four columns, or a name that is not
        # in CONDITION_IDS).
        with open(input[0]) as merged, open(output[0], "w") as out:
            for line in merged:
                columns = line.rstrip().split("\t")

                # Sort according to condition names
                peak_ids = columns[3].split(",")
                columns[3] = ",".join(sorted(peak_ids, key=CONDITION_IDS.index))

                out.write("\t".join(columns) + "\n")
#Config for uropa annotation
rule uropa_config:
    # Write the JSON configuration consumed by rule uropa: a fixed list of
    # annotation queries plus the paths of the merged peak file and the gtf.
    input:
        bed = rules.sort_peak_names.output.peaks,
        gtf = GTF
    output:
        config = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.config")
    run:
        import json

        # Named uropa_settings rather than "config" so the workflow-global
        # `config` is not shadowed inside this block.
        uropa_settings = {
            "queries": [
                {"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "name":"protein_coding_promoter"},
                {"feature":"gene", "distance":1, "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "internals":0.1, "name":"protein_coding_internal"},
                {"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "name":"any_promoter"},
                {"feature":"gene", "distance":1, "internals":0.1, "name":"any_internal"},
                {"feature":"gene", "distance":[50000, 50000], "name":"distal_enhancer"},
            ],
            "show_attributes": ["gene_biotype", "gene_id", "gene_name"],
            "priority": "True",
            # Input paths go last, which keeps the key order of the written file unchanged.
            "gtf": input.gtf,
            "bed": input.bed,
        }

        # The context manager closes the file even if serialization fails.
        with open(output[0], "w") as config_file:
            json.dump(uropa_settings, config_file, indent=4)
# Peak annotation
# peaks per condition or across conditions, dependent on run info output
rule uropa:
    # Run uropa with the generated config, then derive three files from its
    # finalhits table: a column subset, that subset's header line, and the
    # remaining lines without the header.
    conda:
        os.path.join(environments_dir, "uropa.yaml")
    threads:
        99
    log:
        os.path.join(OUTPUTDIR, "logs", "uropa.log")
    input:
        config = rules.uropa_config.output.config  # config file written by rule uropa_config
    params:
        # Prefix shared by all output paths of this rule.
        prefix = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated")
    output:
        finalhits = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits.txt"),
        finalhits_sub = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits_sub.txt"),
        peaks = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.bed"),
        header = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_header.txt"),
    shell:
        "uropa --input {input.config} --prefix {params.prefix} --threads {threads} --log {log}; "
        "cut -f 1-4,7-13,16-19 {output.finalhits} > {output.finalhits_sub}; "  # keep a subset of columns
        "head -n 1 {output.finalhits_sub} > {output.header};"  # first line = header
        "tail -n +2 {output.finalhits_sub} > {output.peaks}"  # everything after the header