Skip to content
Permalink
e9ffd2640e
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 425 lines (393 sloc) 15.1 KB
import os
import shutil
# Global values shared by the rules below.
# Sample identifier, read from the workflow configuration.
sampleID = config["sampleID"]
# Values of the {id} wildcard for every fast5 file found under RawData/<sampleID>/.
IDS = glob_wildcards("RawData/" + sampleID + "/{id}.fast5").id
# Directory the workflow was started from (handed to rules as a path prefix).
WORKDIR = os.getcwd()
# Default target: merged FASTQ, QC summary and the statistics report
# for the adapter cutoff 0.1 of the configured sample.
rule all:
    input:
        "Fastq/" + sampleID + ".fastq",
        "QualityCheck/" + sampleID + "/summary.yaml",
        sampleID + "_0.1.pdf"
################################################################################
################################################################################
######################################## Basecalling
# Basecalling of one raw fast5 file using GUPPY.
#
# Steps performed by the shell block:
#   1. symlink the raw file into its own directory Fast5/<sample>/<id>/,
#      which is handed to guppy as input directory
#   2. run guppy (one caller, {threads} CPU threads, --fast5_out)
#   3. replace the symlink with the fast5 file guppy wrote to workspace/
#   4. move guppy's sequencing_summary.txt to QC/<sample>/<id>.txt
#   5. collect every FASTQ guppy produced into Basecalled/<sample>/<id>.fastq
#
# Step 5 appends each FASTQ to the target and then deletes it. A plain
# `mv` per file would overwrite the target, so only the last file would
# survive whenever guppy emits more than one FASTQ for this input.
rule Basecalling:
    input:
        "RawData/{sample}/{id}.fast5"
    output:
        "QC/{sample}/{id}.txt",
        "Fast5/{sample}/{id}/{id}.fast5",
        "Basecalled/{sample}/{id}.fastq"
    params:
        guppy = config["guppy"],
        flowcell = config["flowcell"],
        kit = config["kit"],
        work = WORKDIR
    resources:
        memory = 10,
        time = 5
    priority: 10
    group: "basecalling"
    threads: config["threads"]
    shell:
        """
        mkdir -p Fast5/{wildcards.sample}/{wildcards.id}
        ln -sf {params.work}/{input} {params.work}/Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
        {params.guppy} -i Fast5/{wildcards.sample}/{wildcards.id}/ -s Basecalled/{wildcards.sample}/{wildcards.id}/ \
        --flowcell {params.flowcell} --kit {params.kit} --num_callers 1 --cpu_threads_per_caller {threads} --fast5_out
        mv -f Basecalled/{wildcards.sample}/{wildcards.id}/workspace/{wildcards.id}.fast5 Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
        mv Basecalled/{wildcards.sample}/{wildcards.id}/sequencing_summary.txt QC/{wildcards.sample}/{wildcards.id}.txt
        rm -f Basecalled/{wildcards.sample}/{wildcards.id}.fastq
        find Basecalled/{wildcards.sample}/{wildcards.id} -name '*.fastq' -exec sh -c 'cat "$1" >> "$2" && rm "$1"' _ {{}} Basecalled/{wildcards.sample}/{wildcards.id}.fastq \;
        """
# Aggregation of the per-file basecalling results of one sample.
#
# Inputs are built for every id in IDS. The sample is filled in explicitly
# from the wildcards object, so the returned paths are already concrete and
# do not depend on Snakemake formatting the values an input function returns.
#
# Outputs:
#   fq  - all per-file FASTQ files concatenated in IDS order
#   txt - all per-file sequencing summaries concatenated; the header line
#         (first line of each file) is kept only from the first file
rule AggregateBasecalling:
    input:
        f5=lambda wildcards: expand("Basecalled/{sample}/{id}.fastq", sample=wildcards.sample, id=IDS),
        txt=lambda wildcards: expand("QC/{sample}/{id}.txt", sample=wildcards.sample, id=IDS)
    output:
        fq="Fastq/{sample}.fastq",
        txt="QualityCheck/{sample}/sequencing_summary.txt"
    resources:
        memory = 50,
        time = 4
    priority: 9
    run:
        # Plain concatenation of the FASTQ files.
        with open(output.fq, 'w') as fout:
            for fn in input.f5:
                with open(fn, 'r') as fin:
                    shutil.copyfileobj(fin, fout)
        # Concatenate the summaries, writing the header only once.
        with open(output.txt, 'w') as fout:
            for i, fn in enumerate(input.txt):
                with open(fn, 'r') as fin:
                    header = fin.readline()
                    if i == 0:
                        fout.write(header)
                    # fin is positioned after the header; copy the remainder.
                    shutil.copyfileobj(fin, fout)
# Quality control: run scripts/MinIONQC.R on the sample's QualityCheck
# directory (which holds the merged sequencing summary). The declared
# output summary.yaml is expected to be written by that script.
rule CheckQuality:
    input:
        "QualityCheck/{sample}/sequencing_summary.txt"
    output:
        "QualityCheck/{sample}/summary.yaml"
    resources:
        memory = 10,
        time = 1
    priority: 8
    shell:
        "Rscript scripts/MinIONQC.R -i QualityCheck/{wildcards.sample}/ -o QualityCheck"
################################################################################
################################################################################
######################################## Extract Mapping regions
# Mapping to the pre-computed reference (here GRCh38_p12) with minimap2.
#
# The raw alignment is written to <sample>.temp in the working directory and
# then post-processed:
#   sam  - header lines plus records whose FLAG (column 2) is not 4 and
#          whose MAPQ (column 5) is greater than config["MAPQ"]
#   stat - three lines with numbers of distinct read names:
#          1. records with FLAG != 4
#          2. records with FLAG == 4
#          3. records with MAPQ > config["MAPQ"]
# The temporary SAM is removed at the end so the unfiltered alignment does
# not accumulate in the working directory.
rule Minimap2_mapping:
    input:
        fastq = "Fastq/{sample}.fastq"
    output:
        sam = "Data/{sample}.sam",
        stat = "Data/{sample}.mapping.stat"
    params:
        minimap2 = config["minimap2"],
        junc_bonus = config["junc_bonus"],
        gap_open_cost = config["gap_open_cost"],
        MAPQ_min = config["MAPQ"],
        reference = "Minimap2_Index/GRCh38_p12.fasta",
        junc_bed = "Minimap2_Index/junctions.bed"
    resources:
        memory = 50,
        time = 4
    threads: config["threads"]
    shell:
        """
        {params.minimap2} -ax splice -ub -k14 --secondary=no -t {threads} --junc-bed {params.junc_bed} -O{params.gap_open_cost},32 --junc-bonus={params.junc_bonus} {params.reference} {input.fastq} > {wildcards.sample}.temp
        cat <(grep -P "^@" {wildcards.sample}.temp ) <(grep -P -v "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | awk "{{ if ( \$5 > {params.MAPQ_min} ) print \$0 }}" ) > {output.sam}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l > {output.stat}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 == "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$5 > {params.MAPQ_min} ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
        rm -f {wildcards.sample}.temp
        """
# Identify mapping reads: write the sorted, unique names of all SAM records
# whose FLAG column is not exactly 4, then pass the FASTQ and that id list
# to scripts/filterReads.r, which is expected to write the filtered FASTQ.
rule filterReads:
    input:
        mapping = "Data/" + sampleID + ".sam",
        fastq = "Fastq/" + sampleID + ".fastq"
    output:
        id = "Data/" + sampleID + ".mapped.id",
        fastq = "Data/" + sampleID + ".mapped.fastq"
    resources:
        memory = 50,
        time = 1
    shell:
        """
        grep -P -v "^@" {input.mapping} | cut -f1,2 | grep -v -P "\t4$" | cut -f1 | sort | uniq > {output.id}
        Rscript scripts/filterReads.r {input.fastq} {output.id} {output.fastq}
        """
# Identify the regions of the reads that map to the reference.
#   1. scripts/getReadLength.r: mapped FASTQ -> read length table
#   2. cigar table: the CIGAR column (entries "*" dropped, a blank inserted
#      after every M/I/D/N/S/H/P letter) pasted next to SAM columns 1-4
#      (rows with FLAG 4 dropped)
#   3. scripts/getBedFile.r: cigar table + length table -> BED file
rule getMappingBed:
    input:
        fastq = "Data/" + sampleID + ".mapped.fastq",
        sam = "Data/" + sampleID + ".sam"
    output:
        length = "IGV/" + sampleID + ".len.txt",
        cigar = "IGV/" + sampleID + ".cigar.txt",
        bed = "IGV/" + sampleID + ".bed"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        Rscript scripts/getReadLength.r {input.fastq} {output.length}
        paste <(grep -P -v '^@' {input.sam} | cut -f6 | grep -v '\*' | sed 's/\([MIDNSHP]\)/\\1 /g') <(grep -P -v '^@' {input.sam} | cut -f1,2,3,4 | grep -v -P '\t4\t' ) > {output.cigar}
        Rscript scripts/getBedFile.r {output.cigar} {output.length} {output.bed}
        """
################################################################################
################################################################################
######################################## Extract Adapter regions
# Convert reads to FASTA: keep lines 1 (header) and 2 (sequence) of every
# 4-line FASTQ record, turn the leading "@" of the header into ">" and cut
# everything from the first blank onwards.
# The "@" replacement is anchored to the start of the line so that any
# further "@" characters on a line are left untouched.
rule fastqToFasta:
    input:
        fastq = "{X}.fastq"
    output:
        fasta = "{X}.fasta"
    resources:
        memory = 10,
        time = 1
    shell:
        "awk ' NR % 4 == 1 {{ print $0 ; }} NR % 4 == 2 {{print $0; }}' {input.fastq} |sed 's/^@/>/' | sed 's/ .*$//g' > {output.fasta}"
# First HMMER iteration.
#   1. scripts/getStockholmMSA.r is called with the barcode FASTA and the
#      prefix directory; it is expected to write one *.msa file per adapter
#   2. every *.msa is turned into a profile HMM (hmmbuild) and indexed
#      (hmmpress)
#   3. all profiles are concatenated into adapter.hmm, which is indexed and
#      scanned against the reads with nhmmscan, using the wildcard X as the
#      E-value threshold (-E)
#
# The rule can run more than once in the same directory (e.g. for a second
# value of X). Therefore hmmpress is called with -f so that existing index
# files are overwritten instead of aborting the job, and a stale adapter.hmm
# is removed before concatenation because the *.hmm glob would otherwise
# pick up the combined file itself.
rule identifyAdapter_iteration1:
    input:
        fasta = "Fastq/" + sampleID + ".fasta"
    output:
        tab = "IdentifyAdapter/adapter.{X}.tab"
    params:
        barcodes = "deposit/barcodes.cDNA.fas",
        hmmer = config["hmmer"],
        prefix = "IdentifyAdapter/"
    resources:
        memory = 50,
        time = 1
    threads: config["threads"]
    shell:
        """
        Rscript scripts/getStockholmMSA.r {params.barcodes} {params.prefix}
        rm -f {params.prefix}adapter.hmm
        for msa in {params.prefix}/*.msa
        do
        {params.hmmer}hmmbuild ${{msa%.msa}}.hmm $msa
        {params.hmmer}hmmpress -f ${{msa%.msa}}.hmm
        done
        cat {params.prefix}/*.hmm > {params.prefix}adapter.hmm
        {params.hmmer}hmmpress -f {params.prefix}adapter.hmm
        {params.hmmer}nhmmscan --noali --notextw --max -E {wildcards.X} --cpu {threads} --tblout {output.tab} {params.prefix}adapter.hmm {input.fasta} > "IdentifyAdapter/temp"
        """
# Second HMMER iteration.
#   1. columns 1,3,7,8,13 of the first-iteration hit table (comment lines
#      dropped, whitespace runs turned into tabs) are written to
#      <prefix>firstHits
#   2. scripts/fetchFasta.r is called with the hits, the read FASTA, the
#      prefix directory and X; it is expected to write one *.fasta per adapter
#   3. each of those FASTA files is aligned to the matching first-iteration
#      profile (hmmalign --trim), rebuilt (hmmbuild) and indexed (hmmpress)
#   4. the optimized profiles are concatenated, indexed and scanned against
#      the reads with nhmmscan (-E 10)
#
# hmmpress is called with -f so that a re-run overwrites existing index
# files instead of aborting, and a stale adapter.optimized.hmm is removed
# up front because the *.optimized.hmm glob would otherwise include the
# combined file itself.
rule identifyAdapter_iteration2:
    input:
        fasta = "Fastq/" + sampleID + ".fasta",
        tab = "IdentifyAdapter/adapter.{X}.tab"
    output:
        tab = "IdentifyAdapter/{X}/adapter.optimized.tab"
    params:
        barcodes = "deposit/barcodes.cDNA.fas",
        hmmer = config["hmmer"],
        prefix = "IdentifyAdapter/{X}/"
    resources:
        memory = 50,
        time = 1
    threads: config["threads"]
    shell:
        """
        grep -P -v "^#" {input.tab} | sed -r "s/[[:space:]]+/\t/g" | cut -f1,3,7,8,13 > {params.prefix}"firstHits"
        Rscript scripts/fetchFasta.r {params.prefix}"firstHits" {input.fasta} {params.prefix} {wildcards.X}
        rm -f {params.prefix}adapter.optimized.hmm
        for fasta in {params.prefix}*.fasta
        do
        p=${{fasta%.fasta}}
        name=${{p##*/}}
        {params.hmmer}hmmalign --trim IdentifyAdapter/${{name}}.hmm $fasta > ${{fasta%.fasta}}.optimized.msa
        {params.hmmer}hmmbuild ${{fasta%.fasta}}.optimized.hmm ${{fasta%.fasta}}.optimized.msa
        {params.hmmer}hmmpress -f ${{fasta%.fasta}}.optimized.hmm
        done
        cat {params.prefix}*.optimized.hmm > {params.prefix}adapter.optimized.hmm
        {params.hmmer}hmmpress -f {params.prefix}adapter.optimized.hmm
        {params.hmmer}nhmmscan --notextw --max -E 10 --cpu {threads} --tblout {output.tab} {params.prefix}adapter.optimized.hmm {input.fasta} > {params.prefix}"temp"
        """
# Get adapter positions in reads: convert a *.tab hit table into a *.bed file.
# Whitespace runs become tabs, comment lines are dropped, columns
# 1,3,7,8,12,13 are selected and reordered (3,7,8,1,13,12 of the table),
# rows are sorted by the first column and numerically by the second, and
# ".optimized" is stripped. The intermediate <X>.temp is converted by
# scripts/convertEvalue2Score.r and deleted afterwards.
rule convert2Bed:
    input:
        tab = "{X}.tab"
    output:
        bed = "{X}.bed"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        sed -r "s/[[:space:]]+/\t/g" {input.tab} | grep -v -P "^#" | cut -f1,3,7,8,12,13 | \
        awk '{{print $2"\t"$3"\t"$4"\t"$1"\t"$6"\t"$5}}' | sort -k1,1 -k2,2n | sed "s/\.optimized//g" > {wildcards.X}.temp
        Rscript scripts/convertEvalue2Score.r {wildcards.X}.temp {output.bed} 1
        rm {wildcards.X}.temp
        """
################################################################################
################################################################################
######################################## Classify Reads
# Join adapter and mapping information: adapter BED rows (without those
# ending in <tab>0<tab> followed by +/- characters) are concatenated with
# the mapping BED and sorted by column 1, then numerically by column 2.
rule joinBed:
    input:
        adapter = "IdentifyAdapter/{X}/adapter.optimized.bed",
        mapping = "IGV/" + sampleID + ".bed"
    output:
        bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
    resources:
        memory = 10,
        time = 1
    shell:
        "cat <(grep -P -v '\t0\t[\+\-]+$' {input.adapter}) {input.mapping} | sort -k1,1 -k2,2n > {output.bed}"
# Collapse the joined BED with bedtools merge, once without the lines
# containing "supplementary" (primary output) and once without the lines
# containing "primary" (supplementary output). The overlap parameter is
# passed negated as the -d distance; columns 2-6 are collapsed with "|".
rule mergeBed:
    input:
        bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
    output:
        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed"
    params:
        overlap = 15
    resources:
        memory = 10,
        time = 1
    shell:
        """
        sort -k1,1 -k2,2n {input.bed} | grep -v supplementary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_primary}
        sort -k1,1 -k2,2n {input.bed} | grep -v primary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_supplementary}
        """
# Classification of the reads.
# scripts/classifyReads.r receives both collapsed BED files and the read
# length table and is expected to write the temporary classification and
# the statistics file. The final table is the temporary one with the
# "primary_..._<n>" / "supplementary_..._<n>" match appended, split at
# underscores into tab-separated fields, sorted numerically by column 1.
rule classifyReads:
    input:
        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed",
        len = "IGV/" + sampleID + ".len.txt"
    output:
        classification_temp = "Classifier_{X}/" + sampleID + ".classification.temp",
        classification = "Classifier_{X}/" + sampleID + ".classification.txt",
        stat = "Classifier_{X}/" + sampleID + ".classification.stat"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        Rscript scripts/classifyReads.r {input.bed_primary} {input.bed_supplementary} {input.len} {output.classification_temp} {output.stat}
        paste {output.classification_temp} <(grep -o -P "primary_.*_[0-9]+|supplementary_.*_[0-9]+" {output.classification_temp} | sed "s/\_/\t/g") | sort -k1,1n > {output.classification}
        """
# Sort a SAM file by its first column: the header (samtools view -H) is
# kept on top, the alignment records follow ordered with sort -k1,1n.
rule sortByName:
    input:
        sam = "{X}.sam"
    output:
        sam_sorted = "{X}.sortedName.sam"
    resources:
        memory = 10,
        time = 1
    shell:
        """cat <(samtools view -H {input.sam}) <(grep -v -P "^@" {input.sam} | sort -k1,1n ) > {output.sam_sorted}"""
# Write the classification into the SAM file: scripts/modifySam.r is called
# with the name-sorted SAM, the classification table and the output path.
rule modifySam:
    input:
        sam = "Data/" + sampleID + ".sortedName.sam",
        classification = "Classifier_{X}/" + sampleID + ".classification.txt"
    output:
        sam = "Classifier_{X}/" + sampleID + ".classified.sam"
    resources:
        memory = 50,
        time = 1
    shell:
        "Rscript scripts/modifySam.r {input.sam} {input.classification} {output.sam}"
################################################################################
################################################################################
######################################## Get statistics
# Quantify suspicious species: run featureCounts (SAF annotation, -L) on the
# classified, sorted BAM, once against the rRNA mask and once against the
# region mask. Each call writes one count table.
rule informativeReads:
    input:
        bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
    output:
        tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
        tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv"
    params:
        featureCounts = config["featureCounts"],
        maskRrna = "deposit/maskRrna.txt",
        maskRegions = "deposit/maskRegion.txt"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        {params.featureCounts} -F SAF -L -a {params.maskRrna} -o {output.tsv} {input.bam}
        {params.featureCounts} -F SAF -L -a {params.maskRegions} -o {output.tsv2} {input.bam}
        """
# Convert a SAM file to a sorted, indexed BAM.
# The unsorted intermediate <path>.bam is written next to the outputs and
# is not removed by this rule.
rule convertIGV:
    input:
        sam = "{path}.sam"
    output:
        bam = "{path}.sorted.bam",
        bai = "{path}.sorted.bam.bai"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        samtools view -b {input.sam} > {wildcards.path}.bam
        samtools sort {wildcards.path}.bam -o {output.bam}
        samtools index {output.bam}
        """
# Masked files: build Classifier_<X>/maskRegions.bed from both mask tables
# (first line of each skipped, columns reordered to 2,3,4,1,".",5, sorted),
# keep only the alignments that do not intersect it (bedtools intersect -v)
# and index the resulting BAM.
rule maskeBam:
    input:
        bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
    output:
        bam = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam",
        bai = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam.bai"
    params:
        maskRrna = "deposit/maskRrna.txt",
        maskRegions = "deposit/maskRegion.txt"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        cat <(tail -n +2 {params.maskRrna} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}') <(tail -n +2 {params.maskRegions} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}' ) | sort -k1,1 -k2,2n > Classifier_{wildcards.X}/maskRegions.bed
        bedtools intersect -v -abam {input.bam} -b Classifier_{wildcards.X}/maskRegions.bed > {output.bam}
        samtools index {output.bam}
        """
# Get stats: for every SAM record carrying a "ts:A:" tag, collect read name,
# FLAG, the ts:A: tag and the ST:A: tag side by side in Classifier_<X>/temp
# and let scripts/compareMinimapResults.r turn that table into the stat file.
# NOTE(review): the class "[+-\.]" in the ST:A: pattern is a character range
# from "+" to "."; confirm that this is the intended set.
rule compareMinimap2:
    input:
        sam = "Classifier_{X}/" + sampleID + ".classified.sam"
    output:
        stat = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f1) <(paste <(paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f2) <(grep -P -o "ts:A:[+-]+" {input.sam} )) <(grep -P "ts:A:[+-]+" {input.sam} | grep -o -P "ST:A:[+-\.]+")) > "Classifier_{wildcards.X}/temp"
        Rscript scripts/compareMinimapResults.r "Classifier_{wildcards.X}/temp" {output.stat}
        """
# Get stats and plot them.
# Classifier_<X>/stat.txt receives three sums taken from the ".summary"
# files that accompany the featureCounts tables:
#   1. Assigned + Unassigned_Ambiguity of the rRNA table
#   2. Assigned + Unassigned_Ambiguity of the region table
#   3. the sum of column 2 over the whole region summary
# scripts/plotStatistics.r then combines stat.txt with both stat inputs
# into the PDF report.
rule getStats:
    input:
        tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
        tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv",
        stat = "Classifier_{X}/" + sampleID + ".classification.stat",
        stat2 = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
    output:
        pdf = sampleID + "_{X}.pdf"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        cat <(cat <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv}.summary | awk '{{sum+=$2}} END {{print sum}}') <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv2}.summary | awk '{{sum+=$2}} END {{print sum}}')) \
        <(awk '{{sum+=$2}} END {{print sum}}' {input.tsv2}.summary) > Classifier_{wildcards.X}/stat.txt
        Rscript scripts/plotStatistics.r Classifier_{wildcards.X}/stat.txt {input.stat} {input.stat2} {output.pdf}
        """
#################################################################################