Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
nascONT-seq-pipeline/Snakefile
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
425 lines (393 sloc)
15.1 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import shutil

# GLOBAL VALUES
# Sample to process, read from the workflow configuration.
sampleID = config["sampleID"]
# Basenames ({id}) of every raw fast5 file found for that sample.
IDS = glob_wildcards("RawData/" + sampleID + "/{id}.fast5").id
# Directory the workflow was started from (used for absolute symlink paths).
WORKDIR = os.getcwd()
rule all:
    # Default target: the merged fastq, the QC summary and the statistics
    # report built with X = 0.1 (X is handed to nhmmscan as its -E value
    # in the first adapter search).
    input:
        "Fastq/" + sampleID + ".fastq",
        "QualityCheck/" + sampleID + "/summary.yaml",
        sampleID + "_0.1.pdf"
################################################################################
################################################################################
######################################## Basecalling
# basecalling using GUPPY
rule Basecalling:
    # Basecall a single raw fast5 file. The raw file is symlinked into a
    # directory of its own (guppy is given a directory via -i), guppy writes
    # into Basecalled/{sample}/{id}/, and its results are then moved to the
    # three declared outputs.
    input:
        "RawData/{sample}/{id}.fast5"
    output:
        "QC/{sample}/{id}.txt",
        "Fast5/{sample}/{id}/{id}.fast5",
        "Basecalled/{sample}/{id}.fastq"
    params:
        guppy = config["guppy"],
        flowcell = config["flowcell"],
        kit = config["kit"],
        work = WORKDIR
    threads: config["threads"]
    resources:
        memory = 10,
        time = 5
    priority: 10
    group: "basecalling"
    # The first `mv` replaces the symlink with the fast5 guppy wrote to
    # workspace/ (requested through --fast5_out).
    # NOTE(review): the final `find ... -exec mv` moves every *.fastq it
    # finds onto the same destination, so if guppy emits more than one fastq
    # only the last one moved survives -- confirm one file per input.
    shell:
        """
        mkdir -p Fast5/{wildcards.sample}/{wildcards.id}
        ln -sf {params.work}/{input} {params.work}/Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
        {params.guppy} -i Fast5/{wildcards.sample}/{wildcards.id}/ -s Basecalled/{wildcards.sample}/{wildcards.id}/ \
        --flowcell {params.flowcell} --kit {params.kit} --num_callers 1 --cpu_threads_per_caller {threads} --fast5_out
        mv -f Basecalled/{wildcards.sample}/{wildcards.id}/workspace/{wildcards.id}.fast5 Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
        mv Basecalled/{wildcards.sample}/{wildcards.id}/sequencing_summary.txt QC/{wildcards.sample}/{wildcards.id}.txt
        find Basecalled/{wildcards.sample}/{wildcards.id} -name '*.fastq' -exec mv {{}} Basecalled/{wildcards.sample}/{wildcards.id}.fastq \;
        """
# aggregation of fastq files
rule AggregateBasecalling:
    # Merge the per-file basecalling results of one sample: all fastq chunks
    # are concatenated into one file, and the per-chunk sequencing summaries
    # are joined into one table that keeps only the first header line.
    input:
        # expand() keeps {sample} as a wildcard (escaped as {{sample}}) and
        # fills in every fast5 id, so no input function is required.
        f5=expand("Basecalled/{{sample}}/{id}.fastq", id=IDS),
        txt=expand("QC/{{sample}}/{id}.txt", id=IDS)
    output:
        fq="Fastq/{sample}.fastq",
        txt="QualityCheck/{sample}/sequencing_summary.txt"
    resources:
        memory = 50,
        time = 4
    priority: 9
    run:
        # Plain concatenation of the basecalled fastq chunks.
        with open(output.fq, 'w') as merged:
            for chunk in input.f5:
                with open(chunk, 'r') as fin:
                    shutil.copyfileobj(fin, merged)
        # Every summary starts with a header line; it is always consumed
        # but only written for the first file.
        with open(output.txt, 'w') as merged:
            for index, chunk in enumerate(input.txt):
                with open(chunk, 'r') as fin:
                    header = fin.readline()
                    if index == 0:
                        merged.write(header)
                    shutil.copyfileobj(fin, merged)
# QC
rule CheckQuality:
    # Run the MinIONQC R script on the sample's QualityCheck directory
    # (which holds the merged sequencing summary); the rule expects
    # summary.yaml to appear there afterwards.
    input:
        "QualityCheck/{sample}/sequencing_summary.txt"
    output:
        "QualityCheck/{sample}/summary.yaml"
    resources:
        memory = 10,
        time = 1
    priority: 8
    shell:
        "Rscript scripts/MinIONQC.R -i QualityCheck/{wildcards.sample}/ -o QualityCheck"
################################################################################
################################################################################
######################################## Extract Mapping regions
# mapping to pre-computed reference: here GRCh38_p12
rule Minimap2_mapping:
    # Align the merged reads with minimap2 in spliced mode, keep only
    # alignments whose flag is not 4 and whose MAPQ exceeds the configured
    # minimum, and write a three-line statistics file:
    #   line 1: number of distinct read names with flag != 4
    #   line 2: number of distinct read names with flag == 4
    #   line 3: number of distinct read names with MAPQ > MAPQ_min
    input:
        fastq = "Fastq/{sample}.fastq"
    output:
        sam = "Data/{sample}.sam",
        stat = "Data/{sample}.mapping.stat"
    params:
        minimap2 = config["minimap2"],
        junc_bonus = config["junc_bonus"],
        gap_open_cost = config["gap_open_cost"],
        MAPQ_min = config["MAPQ"],
        reference = "Minimap2_Index/GRCh38_p12.fasta",
        junc_bed = "Minimap2_Index/junctions.bed"
    threads: config["threads"]
    resources:
        memory = 50,
        time = 4
    # The raw minimap2 output is staged in {sample}.temp in the working
    # directory; it is only an intermediate, so it is deleted once the
    # filtered SAM and the statistics have been written.
    shell:
        """
        {params.minimap2} -ax splice -ub -k14 --secondary=no -t {threads} --junc-bed {params.junc_bed} -O{params.gap_open_cost},32 --junc-bonus={params.junc_bonus} {params.reference} {input.fastq} > {wildcards.sample}.temp
        cat <(grep -P "^@" {wildcards.sample}.temp ) <(grep -P -v "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | awk "{{ if ( \$5 > {params.MAPQ_min} ) print \$0 }}" ) > {output.sam}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l > {output.stat}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 == "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$5 > {params.MAPQ_min} ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
        rm -f {wildcards.sample}.temp
        """
# Identify Mapping reads
rule filterReads:
    # Collect the distinct names of reads whose SAM flag is not 4 and hand
    # that id list, together with the full fastq, to filterReads.r, which is
    # expected to write the mapped-only fastq.
    input:
        mapping = "Data/" + sampleID + ".sam",
        fastq = "Fastq/" + sampleID + ".fastq"
    output:
        id = "Data/" + sampleID + ".mapped.id",
        fastq = "Data/" + sampleID + ".mapped.fastq"
    resources:
        memory = 50,
        time = 1
    shell:
        """
        grep -P -v "^@" {input.mapping} | cut -f1,2 | grep -v -P "\t4$" | cut -f1 | sort | uniq > {output.id}
        Rscript scripts/filterReads.r {input.fastq} {output.id} {output.fastq}
        """
# Identify Regions of the reads mapping the reference
rule getMappingBed:
    # Three steps: (1) getReadLength.r writes the read lengths, (2) a table
    # is assembled that pairs each CIGAR string (a blank inserted after
    # every operation letter) with the read name, flag, reference and
    # position of the same alignment, (3) getBedFile.r turns both files
    # into a BED file.
    input:
        fastq = "Data/" + sampleID + ".mapped.fastq",
        sam = "Data/" + sampleID + ".sam"
    output:
        length = "IGV/" + sampleID + ".len.txt",
        cigar = "IGV/" + sampleID + ".cigar.txt",
        bed = "IGV/" + sampleID + ".bed"
    resources:
        memory = 10,
        time = 1
    # NOTE(review): the two streams joined by `paste` are filtered
    # independently (CIGAR != '*' vs. flag != 4); they stay aligned only if
    # both filters drop exactly the same records -- confirm.
    shell:
        """
        Rscript scripts/getReadLength.r {input.fastq} {output.length}
        paste <(grep -P -v '^@' {input.sam} | cut -f6 | grep -v '\*' | sed 's/\([MIDNSHP]\)/\\1 /g') <(grep -P -v '^@' {input.sam} | cut -f1,2,3,4 | grep -v -P '\t4\t' ) > {output.cigar}
        Rscript scripts/getBedFile.r {output.cigar} {output.length} {output.bed}
        """
################################################################################
################################################################################
######################################## Extract Adapter regions
# convert reads to fasta
rule fastqToFasta:
    # Generic fastq -> fasta conversion for 4-line fastq records: keep the
    # header and sequence line of every record, turn the leading '@' of the
    # header into '>' and drop everything after the first blank.
    input:
        fastq = "{X}.fastq"
    output:
        fasta = "{X}.fasta"
    resources:
        memory = 10,
        time = 1
    # Only the leading '@' is replaced, so read ids that contain further
    # '@' characters are left intact.
    shell:
        "awk ' NR % 4 == 1 {{ print $0 ; }} NR % 4 == 2 {{print $0; }}' {input.fastq} |sed 's/^@/>/' | sed 's/ .*$//g' > {output.fasta}"
#first HMMER iteration
rule identifyAdapter_iteration1:
    # First adapter search: getStockholmMSA.r is run on the barcode fasta
    # with the IdentifyAdapter/ prefix, one HMM is built and pressed per
    # *.msa file found there, all HMMs are concatenated into adapter.hmm and
    # the reads are scanned against it with nhmmscan using {X} as -E value.
    # NOTE(review): params.hmmer is prepended directly to the tool names, so
    # it presumably is a directory ending in '/' (or empty) -- confirm.
    input:
        fasta = "Fastq/" + sampleID + ".fasta"
    output:
        tab = "IdentifyAdapter/adapter.{X}.tab"
    params:
        barcodes = "deposit/barcodes.cDNA.fas",
        hmmer = config["hmmer"],
        prefix = "IdentifyAdapter/"
    threads: config["threads"]
    resources:
        memory = 50,
        time = 1
    # The working directory is shared by every value of {X} and by reruns:
    #  * a stale adapter.hmm is removed first, otherwise the `*.hmm` glob
    #    would feed the combined file into itself;
    #  * hmmpress gets -f so that index files left by an earlier run are
    #    overwritten instead of aborting the job.
    shell:
        """
        Rscript scripts/getStockholmMSA.r {params.barcodes} {params.prefix}
        rm -f {params.prefix}adapter.hmm
        for msa in {params.prefix}/*.msa
        do
        {params.hmmer}hmmbuild ${{msa%.msa}}.hmm $msa
        {params.hmmer}hmmpress -f ${{msa%.msa}}.hmm
        done
        cat {params.prefix}/*.hmm > {params.prefix}adapter.hmm
        {params.hmmer}hmmpress -f {params.prefix}adapter.hmm
        {params.hmmer}nhmmscan --noali --notextw --max -E {wildcards.X} --cpu {threads} --tblout {output.tab} {params.prefix}adapter.hmm {input.fasta} > "IdentifyAdapter/temp"
        """
#second HMMER iteration
rule identifyAdapter_iteration2:
    # Second adapter search: the hits of the first scan are reduced to five
    # columns ("firstHits"), fetchFasta.r is run on them with the read
    # fasta, and for every *.fasta then present in IdentifyAdapter/{X}/ the
    # sequences are aligned to the matching first-round HMM, a refined HMM
    # is built and pressed. All refined HMMs are concatenated and the reads
    # are scanned again (fixed -E 10).
    # NOTE(review): params.hmmer is prepended directly to the tool names, so
    # it presumably is a directory ending in '/' (or empty) -- confirm.
    input:
        fasta = "Fastq/" + sampleID + ".fasta",
        tab = "IdentifyAdapter/adapter.{X}.tab"
    output:
        tab = "IdentifyAdapter/{X}/adapter.optimized.tab"
    params:
        barcodes = "deposit/barcodes.cDNA.fas",
        hmmer = config["hmmer"],
        prefix = "IdentifyAdapter/{X}/"
    threads: config["threads"]
    resources:
        memory = 50,
        time = 1
    # Rerun safety:
    #  * a stale adapter.optimized.hmm is removed first, otherwise the
    #    `*.optimized.hmm` glob would feed the combined file into itself;
    #  * hmmpress gets -f so that existing index files are overwritten
    #    instead of aborting the job.
    shell:
        """
        rm -f {params.prefix}adapter.optimized.hmm
        grep -P -v "^#" {input.tab} | sed -r "s/[[:space:]]+/\t/g" | cut -f1,3,7,8,13 > {params.prefix}"firstHits"
        Rscript scripts/fetchFasta.r {params.prefix}"firstHits" {input.fasta} {params.prefix} {wildcards.X}
        for fasta in {params.prefix}*.fasta
        do
        p=${{fasta%.fasta}}
        name=${{p##*/}}
        {params.hmmer}hmmalign --trim IdentifyAdapter/${{name}}.hmm $fasta > ${{fasta%.fasta}}.optimized.msa
        {params.hmmer}hmmbuild ${{fasta%.fasta}}.optimized.hmm ${{fasta%.fasta}}.optimized.msa
        {params.hmmer}hmmpress -f ${{fasta%.fasta}}.optimized.hmm
        done
        cat {params.prefix}*.optimized.hmm > {params.prefix}adapter.optimized.hmm
        {params.hmmer}hmmpress -f {params.prefix}adapter.optimized.hmm
        {params.hmmer}nhmmscan --notextw --max -E 10 --cpu {threads} --tblout {output.tab} {params.prefix}adapter.optimized.hmm {input.fasta} > {params.prefix}"temp"
        """
#get adapter positions in reads
rule convert2Bed:
    # Generic {X}.tab -> {X}.bed conversion: whitespace runs become tabs,
    # comment lines are dropped, columns 1,3,7,8,12,13 are selected and
    # reordered, rows are sorted by name and start, ".optimized" is removed
    # from the names, and convertEvalue2Score.r writes the final BED from
    # the intermediate {X}.temp, which is deleted afterwards.
    input:
        tab = "{X}.tab"
    output:
        bed = "{X}.bed"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        sed -r "s/[[:space:]]+/\t/g" {input.tab} | grep -v -P "^#" | cut -f1,3,7,8,12,13 | \
        awk '{{print $2"\t"$3"\t"$4"\t"$1"\t"$6"\t"$5}}' | sort -k1,1 -k2,2n | sed "s/\.optimized//g" > {wildcards.X}.temp
        Rscript scripts/convertEvalue2Score.r {wildcards.X}.temp {output.bed} 1
        rm {wildcards.X}.temp
        """
################################################################################
################################################################################
######################################## Classify Reads
# join adapter and mapping information
rule joinBed:
    # Concatenate the adapter BED (without the rows whose last two columns
    # are 0 and a run of +/- characters) with the mapping BED and sort the
    # result by name and start position.
    input:
        adapter = "IdentifyAdapter/{X}/adapter.optimized.bed",
        mapping = "IGV/" + sampleID + ".bed"
    output:
        bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
    resources:
        memory = 10,
        time = 1
    shell:
        "cat <(grep -P -v '\t0\t[\+\-]+$' {input.adapter}) {input.mapping} | sort -k1,1 -k2,2n > {output.bed}"
rule mergeBed:
    # Collapse the features of the joined BED with bedtools merge, once
    # without the lines containing "supplementary" (primary output) and once
    # without the lines containing "primary" (supplementary output).
    # `overlap` is passed negated to -d; columns 2-6 are collapsed with '|'.
    input:
        bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
    output:
        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed"
    params:
        overlap = 15
    resources:
        memory = 10,
        time = 1
    shell:
        """
        sort -k1,1 -k2,2n {input.bed} | grep -v supplementary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_primary}
        sort -k1,1 -k2,2n {input.bed} | grep -v primary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_supplementary}
        """
# classification
rule classifyReads:
    # classifyReads.r receives both collapsed BED files and the read
    # lengths and writes a temporary classification plus a statistics file.
    # The final table is the temporary one with the primary_..._N /
    # supplementary_..._N label appended again, split at '_' into separate
    # tab-delimited columns, and sorted on the first column (numeric sort).
    input:
        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed",
        len = "IGV/" + sampleID + ".len.txt"
    output:
        classification_temp = "Classifier_{X}/" + sampleID + ".classification.temp",
        classification = "Classifier_{X}/" + sampleID + ".classification.txt",
        stat = "Classifier_{X}/" + sampleID + ".classification.stat"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        Rscript scripts/classifyReads.r {input.bed_primary} {input.bed_supplementary} {input.len} {output.classification_temp} {output.stat}
        paste {output.classification_temp} <(grep -o -P "primary_.*_[0-9]+|supplementary_.*_[0-9]+" {output.classification_temp} | sed "s/\_/\t/g") | sort -k1,1n > {output.classification}
        """
# sort
rule sortByName:
    # Generic {X}.sam -> {X}.sortedName.sam: the header (samtools view -H)
    # is kept on top and the alignment lines are sorted on the first column
    # with the same `sort -k1,1n` key the classification table is sorted by.
    input:
        sam = "{X}.sam"
    output:
        sam_sorted = "{X}.sortedName.sam"
    resources:
        memory = 10,
        time = 1
    shell:
        """cat <(samtools view -H {input.sam}) <(grep -v -P "^@" {input.sam} | sort -k1,1n ) > {output.sam_sorted}"""
# write classification into sam file
rule modifySam:
    # modifySam.r is given the name-sorted SAM and the classification table
    # and writes the classified SAM.
    input:
        sam = "Data/" + sampleID + ".sortedName.sam",
        classification = "Classifier_{X}/" + sampleID + ".classification.txt"
    output:
        sam = "Classifier_{X}/" + sampleID + ".classified.sam"
    resources:
        memory = 50,
        time = 1
    shell:
        "Rscript scripts/modifySam.r {input.sam} {input.classification} {output.sam}"
################################################################################
################################################################################
######################################## Get statistics
# quantify suspicious species
rule informativeReads:
    # Run featureCounts twice on the classified, sorted BAM: once with the
    # rRNA mask annotation and once with the region mask annotation (both
    # in SAF format, long-read mode -L).
    input:
        bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
    output:
        tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
        tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv"
    params:
        featureCounts = config["featureCounts"],
        maskRrna = "deposit/maskRrna.txt",
        maskRegions = "deposit/maskRegion.txt"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        {params.featureCounts} -F SAF -L -a {params.maskRrna} -o {output.tsv} {input.bam}
        {params.featureCounts} -F SAF -L -a {params.maskRegions} -o {output.tsv2} {input.bam}
        """
# convert to bam
rule convertIGV:
    # Generic {path}.sam -> {path}.sorted.bam (+ .bai): convert to BAM,
    # sort and index.
    input:
        sam = "{path}.sam"
    output:
        bam = "{path}.sorted.bam",
        bai = "{path}.sorted.bam.bai"
    resources:
        memory = 10,
        time = 1
    # The BAM conversion is piped straight into `samtools sort` so that no
    # undeclared, unsorted {path}.bam is left behind on disk.
    shell:
        """
        samtools view -b {input.sam} | samtools sort -o {output.bam} -
        samtools index {output.bam}
        """
# masked files
rule maskeBam:
    # Build one sorted BED file from both mask tables (first line skipped,
    # columns reordered to 2,3,4,1,".",5) and keep only the alignments of
    # the classified BAM that do not overlap any of those regions
    # (bedtools intersect -v); the result is indexed.
    input:
        bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
    output:
        bam = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam",
        bai = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam.bai"
    params:
        maskRrna = "deposit/maskRrna.txt",
        maskRegions = "deposit/maskRegion.txt"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        cat <(tail -n +2 {params.maskRrna} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}') <(tail -n +2 {params.maskRegions} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}' ) | sort -k1,1 -k2,2n > Classifier_{wildcards.X}/maskRegions.bed
        bedtools intersect -v -abam {input.bam} -b Classifier_{wildcards.X}/maskRegions.bed > {output.bam}
        samtools index {output.bam}
        """
# get stats
rule compareMinimap2:
    # For every SAM line carrying a ts:A: tag, collect the read name, the
    # flag, the ts:A: tag and the ST:A: tag into Classifier_{X}/temp and let
    # compareMinimapResults.r turn that table into the comparison statistic.
    input:
        sam = "Classifier_{X}/" + sampleID + ".classified.sam"
    output:
        stat = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
    resources:
        memory = 10,
        time = 1
    # NOTE(review): the four columns are extracted by separate greps and
    # pasted side by side, so a ts:A: line without an ST:A: tag would shift
    # the last column -- confirm every such line carries both tags.
    shell:
        """
        paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f1) <(paste <(paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f2) <(grep -P -o "ts:A:[+-]+" {input.sam} )) <(grep -P "ts:A:[+-]+" {input.sam} | grep -o -P "ST:A:[+-\.]+")) > "Classifier_{wildcards.X}/temp"
        Rscript scripts/compareMinimapResults.r "Classifier_{wildcards.X}/temp" {output.stat}
        """
# get stats
rule getStats:
    # Write Classifier_{X}/stat.txt with three numbers taken from the
    # featureCounts *.summary files that sit next to the two tsv inputs:
    #   line 1: Assigned + Unassigned_Ambiguity of the rRNA mask summary
    #   line 2: Assigned + Unassigned_Ambiguity of the region mask summary
    #   line 3: sum of the whole second column of the region mask summary
    # plotStatistics.r then combines it with both stat files into the PDF.
    input:
        tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
        tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv",
        stat = "Classifier_{X}/" + sampleID + ".classification.stat",
        stat2 = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
    output:
        pdf = "" + sampleID + "_{X}.pdf"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        cat <(cat <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv}.summary | awk '{{sum+=$2}} END {{print sum}}') <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv2}.summary | awk '{{sum+=$2}} END {{print sum}}')) \
        <(awk '{{sum+=$2}} END {{print sum}}' {input.tsv2}.summary) > Classifier_{wildcards.X}/stat.txt
        Rscript scripts/plotStatistics.r Classifier_{wildcards.X}/stat.txt {input.stat} {input.stat2} {output.pdf}
        """
#################################################################################