Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
TOBIAS_snakemake/snakefiles/preprocessing.snake
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
206 lines (183 sloc)
8.4 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------------------------------------------------------ #
# ----------------------------------------------- Copy input files ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Copy the user-supplied reference files into the pipeline output directory,
# so downstream rules can depend on stable local paths (FASTA/BLACKLIST/GTF
# are the workflow-level target paths defined elsewhere in the pipeline).
rule copy_flatfiles:
    input:
        fasta = config['run_info']['fasta'],
        blacklist = config['run_info']['blacklist'],
        gtf = config['run_info']['gtf']
    output:
        fasta = FASTA,
        blacklist = BLACKLIST,
        gtf = GTF
    shell:
        "cp {input.fasta} {output.fasta};"
        "cp {input.blacklist} {output.blacklist};"
        "cp {input.gtf} {output.gtf}"
#Get chromosomes available in fasta
# Produces a two-column chrom/size table (via pyfaidx's "faidx -i chromsizes")
# and a BED version of it (chrom, 0, size), later used as a whitelist of
# valid genomic regions.
rule get_fasta_chroms:
    input:
        rules.copy_flatfiles.output.fasta
    output:
        txt = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.txt"),
        bed = os.path.join(OUTPUTDIR, "flatfiles", "chromsizes.bed")
    shell:
        "faidx {input} -i chromsizes > {output.txt};"
        "awk '{{ print $1\"\t\"0\"\t\"$2 }}' {output.txt} > {output.bed}"
# ------------------------------------------------------------------------------------------------------------------ #
# ---------------------------------------------- Config processing ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
#Write config to get an overview of input files
# Dumps the run configuration to YAML in the output directory so a finished
# run documents exactly which inputs/settings produced it.
rule write_config:
    output:
        os.path.join(OUTPUTDIR, "config.yaml")
    priority:
        100  # high priority: scheduled before other pending jobs
    run:
        import yaml
        with open(output[0], 'w') as yaml_file:
            yaml.dump(config, yaml_file, default_flow_style=False)
# ------------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------- Bam processing ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Merge sample bam files to condition bam file
# if only one sample per condition, copy sample bam to merged bam file name for further processing
# Build one bam per condition from its sample bams: several samples are
# merged with samtools, a single sample is simply copied so downstream
# rules can always rely on the merged filename. The result is indexed.
rule conditionbam:
    input:
        lambda wildcards: config["data"][wildcards.condition]
    output:
        bam = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam"),
        bai = os.path.join(OUTPUTDIR, "mapping", "{condition}.bam.bai")
    threads:
        99
    message:
        "Joining individual bamfiles from condition {wildcards.condition}"
    run:
        if len(input) == 1:
            shell("cp {input} {output.bam}")
        else:
            shell("samtools merge -@ {threads} {output.bam} {input}")
        # indexing is common to both branches
        shell("samtools index {output.bam}")
# ------------------------------------------------------------------------------------------------------------------ #
# --------------------------------------------------- Peak-calling ------------------------------------------------- #
# ------------------------------------------------------------------------------------------------------------------ #
# Peak-calling
# Effective genome sizes passed to macs2 --gsize. "hs"/"mm" are macs2's
# built-in shortcuts; zebrafish has none, so an explicit size is taken from
# https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html
gsizes = {
    "human": "hs",
    "mouse": "mm",
    "zebrafish": 1369631918,
}
# Call peaks per sample with macs2 (broad peaks), and keep an unmodified
# copy of the broadPeak file as "<sample_id>_raw.bed" for later processing.
rule macs:
    input:
        lambda wildcards: id2bam[wildcards.condition][wildcards.sample_id]
    output:
        macs = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_peaks.broadPeak"),
        raw = os.path.join(OUTPUTDIR, "peak_calling", "{condition}", "{sample_id}_raw.bed")
    log:
        os.path.join(OUTPUTDIR, "logs", "{condition}_{sample_id}_peak_calling.log")
    message:
        "Running macs2 with .bam-file: {input}"
    conda:
        os.path.join(environments_dir, "macs.yaml")
    params:
        "--name {sample_id}",  # wildcards are expanded inside params strings
        "--outdir " + os.path.join(OUTPUTDIR, "peak_calling", "{condition}"),
        "--gsize " + str(gsizes[config["run_info"]["organism"]]),
        # user-supplied macs options, or ATAC-style defaults
        config.get("macs", "--nomodel --shift -100 --extsize 200 --broad"),
    shell:
        "macs2 callpeak -t {input} {params} &> {log}; "
        "cp {output.macs} {output.raw}; "
# ------------------------------------------------------------------------------------------------------------------ #
# ------------------------------------------------- Peak-processing ------------------------------------------------ #
# ------------------------------------------------------------------------------------------------------------------ #
# process peaks:
# 1. reduce to genomic location columns and sort
# 2. merge peaks per condition
# 3. remove blacklisted regions and add unique peak ids
# Build the per-condition union peak set:
# cut to chrom/start/end, sort, merge peaks within 5 bp, drop blacklisted
# regions, keep only chromosomes present in the fasta, exclude chrM, and
# append the condition name as a 4th column.
rule process_peaks:
    input:
        peaks = lambda wildcards: [os.path.join(OUTPUTDIR, "peak_calling", wildcards.condition, sample_id + "_raw.bed") for sample_id in id2bam[wildcards.condition].keys()],
        blacklist = BLACKLIST,
        whitelist = rules.get_fasta_chroms.output.bed
    output:
        peaks = os.path.join(OUTPUTDIR, "peak_calling", "{condition}_union.bed")
    message: "Processing peaks from condition {wildcards.condition}"
    shell:
        "cat {input.peaks} | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge -d 5 | "
        "bedtools subtract -a - -b {input.blacklist} -A | "
        "bedtools intersect -a - -b {input.whitelist} -wa | "
        "awk '$1 !~ /[M]/' | " #exclude chromosome M
        # NOTE: was '$s' — an uninitialized awk variable, which only worked
        # because empty coerces to 0 and $0 is the whole line; use $0 explicitly.
        "awk '{{print $0\"\\t{wildcards.condition}\"}}' > {output.peaks}" #add condition name to each peak
# Union peaks across all conditions
# Concatenate the per-condition union files, sort, and merge peaks within
# 5 bp; "-c 4 -o distinct" collects the distinct condition names of each
# merged peak into column 4.
rule merge_condition_peaks:
    input:
        [os.path.join(OUTPUTDIR, "peak_calling", condition + "_union.bed") for condition in CONDITION_IDS]
    output:
        temp(os.path.join(OUTPUTDIR, "peak_calling", "all_merged.tmp"))
    message:
        "Merging peaks across conditions"
    shell:
        "cat {input} | sort -k1,1 -k2,2n | bedtools merge -d 5 -c 4 -o distinct > {output}"
#Get correct sorting of peak_names
# Column 4 holds the comma-separated condition names produced by
# 'bedtools merge -o distinct'; re-sort that list so it always follows
# the CONDITION_IDS order rather than bedtools' lexical order.
rule sort_peak_names:
    input:
        rules.merge_condition_peaks.output
    output:
        peaks = os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed")
    run:
        # context managers ensure both handles are closed even on error
        # (the original leaked the output handle if a line failed to parse)
        with open(input[0]) as f, open(output[0], "w") as out:
            for line in f:
                columns = line.rstrip().split("\t")
                #Sort according to condition names
                peak_ids = columns[3].split(",")
                columns[3] = ",".join(sorted(peak_ids, key=CONDITION_IDS.index))
                out.write("\t".join(columns) + "\n")
#Config for uropa annotation
# Writes the JSON config consumed by the 'uropa' rule: annotation queries
# are tried in order (priority=True), from protein-coding promoters/genes
# down to distal regions within 50 kb.
rule uropa_config:
    input:
        bed = rules.sort_peak_names.output.peaks, #os.path.join(OUTPUTDIR, "peak_calling", "all_merged.bed"),
        gtf = GTF
    output:
        config = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.config")
    run:
        import json
        # named 'uropa_cfg' to avoid shadowing snakemake's global 'config'
        uropa_cfg = {"queries":[
                    {"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "name":"protein_coding_promoter"},
                    {"feature":"gene", "distance":1, "filter_attribute":"gene_biotype", "attribute_values":"protein_coding", "internals":0.1, "name":"protein_coding_internal"},
                    {"feature":"gene", "feature.anchor":"start", "distance":[10000,1000], "name":"any_promoter"},
                    {"feature":"gene", "distance":1, "internals":0.1, "name":"any_internal"},
                    {"feature":"gene", "distance":[50000, 50000], "name":"distal_enhancer"},
                    ],
                "show_attributes":["gene_biotype", "gene_id", "gene_name"],
                "priority":"True"
                }
        uropa_cfg["gtf"] = input.gtf
        uropa_cfg["bed"] = input.bed
        # json.dump under a context manager; the original built a string and
        # wrote it to a handle that was never closed on error
        with open(output[0], "w") as config_file:
            json.dump(uropa_cfg, config_file, indent=4)
# Peak annotation
# peaks per condition or across conditions, dependent on run info output
# Runs UROPA on the merged peak set, then derives three convenience files
# from the finalhits table: a column subset, its header line alone, and a
# headerless (BED-like) version of the annotated peaks.
rule uropa:
    input:
        config = rules.uropa_config.output.config #os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.config")
    output:
        finalhits = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits.txt"),
        finalhits_sub = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_finalhits_sub.txt"),
        peaks = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated.bed"),
        header = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated_header.txt"),
    threads:
        99
    log:
        os.path.join(OUTPUTDIR, "logs", "uropa.log")
    params:
        prefix = os.path.join(OUTPUTDIR, "peak_annotation", "all_merged_annotated")
    conda:
        os.path.join(environments_dir, "uropa.yaml")
    shell:
        "uropa --input {input.config} --prefix {params.prefix} --threads {threads} --log {log}; "
        "cut -f 1-4,7-13,16-19 {output.finalhits} > {output.finalhits_sub}; " #Get a subset of columns
        "head -n 1 {output.finalhits_sub} > {output.header};" #header
        "tail -n +2 {output.finalhits_sub} > {output.peaks}" #bedlines