import os
import shutil

# GLOBAL VALUES
sampleID = config["sampleID"]
IDS, = glob_wildcards("RawData/" + sampleID + "/{id}.fast5")
WORKDIR = os.getcwd()

rule all:
    input:
        expand("Fastq/{sample}.fastq", sample=sampleID),
        expand("QualityCheck/{sample}/summary.yaml", sample=sampleID),
        sampleID + "_0.1.pdf"

################################################################################
################################################################################
######################################## Basecalling

# basecalling using GUPPY
rule Basecalling:
    input:
        "RawData/{sample}/{id}.fast5"
    output:
        "QC/{sample}/{id}.txt",
        "Fast5/{sample}/{id}/{id}.fast5",
        "Basecalled/{sample}/{id}.fastq"
    resources:
        memory = 10,
        time = 5
    params:
        guppy = config["guppy"],
        flowcell = config["flowcell"],
        kit = config["kit"],
        work = WORKDIR
    priority: 10
    group: "basecalling"
    threads: config["threads"]
    shell:
        """
        mkdir -p Fast5/{wildcards.sample}/{wildcards.id}
        ln -sf {params.work}/{input} {params.work}/Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
        {params.guppy} -i Fast5/{wildcards.sample}/{wildcards.id}/ -s Basecalled/{wildcards.sample}/{wildcards.id}/ \
            --flowcell {params.flowcell} --kit {params.kit} --num_callers 1 --cpu_threads_per_caller {threads} --fast5_out
        mv -f Basecalled/{wildcards.sample}/{wildcards.id}/workspace/{wildcards.id}.fast5 Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
        mv Basecalled/{wildcards.sample}/{wildcards.id}/sequencing_summary.txt QC/{wildcards.sample}/{wildcards.id}.txt
        find Basecalled/{wildcards.sample}/{wildcards.id} -name '*.fastq' -exec mv {{}} Basecalled/{wildcards.sample}/{wildcards.id}.fastq \;
        """

# aggregation of fastq files
rule AggregateBasecalling:
    input:
        f5 = lambda wildcards: expand("Basecalled/{{sample}}/{id}.fastq", id=IDS),
        txt = lambda wildcards: expand("QC/{{sample}}/{id}.txt", id=IDS)
    output:
        fq = "Fastq/{sample}.fastq",
        txt = "QualityCheck/{sample}/sequencing_summary.txt"
    resources:
        memory = 50,
        time = 4
    priority: 9
    run:
        with open(output.fq, 'w') as fout:
            for fn in input.f5:
                with open(fn, 'r') as fin:
                    shutil.copyfileobj(fin, fout)
        with open(output.txt, 'w') as fout:
            i = 0
            for fn in input.txt:
                with open(fn, 'r') as fin:
                    header = fin.readline()
                    if i == 0:
                        fout.write(header)
                    shutil.copyfileobj(fin, fout)
                i += 1

# QC
rule CheckQuality:
    input:
        "QualityCheck/{sample}/sequencing_summary.txt"
    output:
        "QualityCheck/{sample}/summary.yaml"
    priority: 8
    resources:
        memory = 10,
        time = 1
    shell:
        "Rscript scripts/MinIONQC.R -i QualityCheck/{wildcards.sample}/ -o QualityCheck"

################################################################################
################################################################################
######################################## Extract Mapping regions

# mapping to pre-computed reference: here GRCh38_p12
rule Minimap2_mapping:
    params:
        minimap2 = config["minimap2"],
        junc_bonus = config["junc_bonus"],
        gap_open_cost = config["gap_open_cost"],
        MAPQ_min = config["MAPQ"],
        reference = "Minimap2_Index/GRCh38_p12.fasta",
        junc_bed = "Minimap2_Index/junctions.bed"
    input:
        fastq = "Fastq/{sample}.fastq"
    output:
        sam = "Data/{sample}.sam",
        stat = "Data/{sample}.mapping.stat"
    resources:
        memory = 50,
        time = 4
    threads: config["threads"]
    shell:
        """
        {params.minimap2} -ax splice -ub -k14 --secondary=no -t {threads} --junc-bed {params.junc_bed} -O{params.gap_open_cost},32 --junc-bonus={params.junc_bonus} {params.reference} {input.fastq} > {wildcards.sample}.temp
        cat <(grep -P "^@" {wildcards.sample}.temp ) <(grep -P -v "^@" {wildcards.sample}.temp |
            awk "{{ if (\$2 != "4" ) print \$0 }}" | awk "{{ if ( \$5 > {params.MAPQ_min} ) print \$0 }}" ) > {output.sam}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l > {output.stat}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 == "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
        grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$5 > {params.MAPQ_min} ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
        """

# Identify Mapping reads
rule filterReads:
    input:
        mapping = "Data/" + sampleID + ".sam",
        fastq = "Fastq/" + sampleID + ".fastq"
    output:
        id = "Data/" + sampleID + ".mapped.id",
        fastq = "Data/" + sampleID + ".mapped.fastq"
    resources:
        memory = 50,
        time = 1
    shell:
        """
        grep -P -v "^@" {input.mapping} | cut -f1,2 | grep -v -P "\t4$" | cut -f1 | sort | uniq > {output.id}
        Rscript scripts/filterReads.r {input.fastq} {output.id} {output.fastq}
        """

# Identify Regions of the reads mapping the reference
rule getMappingBed:
    input:
        fastq = "Data/" + sampleID + ".mapped.fastq",
        sam = "Data/" + sampleID + ".sam"
    output:
        length = "IGV/" + sampleID + ".len.txt",
        cigar = "IGV/" + sampleID + ".cigar.txt",
        bed = "IGV/" + sampleID + ".bed"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        Rscript scripts/getReadLength.r {input.fastq} {output.length}
        paste <(grep -P -v '^@' {input.sam} | cut -f6 | grep -v '\*' | sed 's/\([MIDNSHP]\)/\\1 /g') <(grep -P -v '^@' {input.sam} | cut -f1,2,3,4 | grep -v -P '\t4\t' ) > {output.cigar}
        Rscript scripts/getBedFile.r {output.cigar} {output.length} {output.bed}
        """

################################################################################
################################################################################
######################################## Extract Adapter regions

# convert reads to fasta
rule fastqToFasta:
    input:
        fastq = "{X}.fastq"
    output:
        fasta = "{X}.fasta"
    resources:
        memory = 10,
        time = 1
    shell:
        "awk ' NR % 4 == 1 {{ print $0 ; }} NR % 4 == 2 {{print $0; }}' {input.fastq} | sed 's/@/>/g' | sed 's/ .*$//g' > {output.fasta}"

# first HMMER iteration
rule identifyAdapter_iteration1:
    params:
        barcodes = "deposit/barcodes.cDNA.fas",
        hmmer = config["hmmer"],
        prefix = "IdentifyAdapter/"
    input:
        fasta = "Fastq/" + sampleID + ".fasta"
    output:
        tab = "IdentifyAdapter/adapter.{X}.tab"
    resources:
        memory = 50,
        time = 1
    threads: config["threads"]
    shell:
        """
        Rscript scripts/getStockholmMSA.r {params.barcodes} {params.prefix}
        for msa in {params.prefix}/*.msa
        do
            {params.hmmer}hmmbuild ${{msa%.msa}}.hmm $msa
            {params.hmmer}hmmpress ${{msa%.msa}}.hmm
        done
        cat {params.prefix}/*.hmm > {params.prefix}adapter.hmm
        {params.hmmer}hmmpress {params.prefix}adapter.hmm
        {params.hmmer}nhmmscan --noali --notextw --max -E {wildcards.X} --cpu {threads} --tblout {output.tab} {params.prefix}adapter.hmm {input.fasta} > "IdentifyAdapter/temp"
        """

# second HMMER iteration
rule identifyAdapter_iteration2:
    params:
        barcodes = "deposit/barcodes.cDNA.fas",
        hmmer = config["hmmer"],
        prefix = "IdentifyAdapter/{X}/"
    input:
        fasta = "Fastq/" + sampleID + ".fasta",
        tab = "IdentifyAdapter/adapter.{X}.tab"
    output:
        tab = "IdentifyAdapter/{X}/adapter.optimized.tab"
    resources:
        memory = 50,
        time = 1
    threads: config["threads"]
    shell:
        """
        grep -P -v "^#" {input.tab} | sed -r "s/[[:space:]]+/\t/g" | cut -f1,3,7,8,13 > {params.prefix}"firstHits"
        Rscript scripts/fetchFasta.r {params.prefix}"firstHits" {input.fasta} {params.prefix} {wildcards.X}
        for fasta in {params.prefix}*.fasta
        do
            p=${{fasta%.fasta}}
            name=${{p##*/}}
            {params.hmmer}hmmalign --trim IdentifyAdapter/${{name}}.hmm $fasta > ${{fasta%.fasta}}.optimized.msa
            {params.hmmer}hmmbuild ${{fasta%.fasta}}.optimized.hmm ${{fasta%.fasta}}.optimized.msa
            {params.hmmer}hmmpress ${{fasta%.fasta}}.optimized.hmm
        done
        cat {params.prefix}*.optimized.hmm > {params.prefix}adapter.optimized.hmm
        {params.hmmer}hmmpress {params.prefix}adapter.optimized.hmm
        {params.hmmer}nhmmscan --notextw --max -E 10 --cpu {threads} --tblout {output.tab} {params.prefix}adapter.optimized.hmm {input.fasta} > {params.prefix}"temp"
        """

# get adapter positions in reads
rule convert2Bed:
    input:
        tab = "{X}.tab"
    output:
        bed = "{X}.bed"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        sed -r "s/[[:space:]]+/\t/g" {input.tab} | grep -v -P "^#" | cut -f1,3,7,8,12,13 | \
            awk '{{print $2"\t"$3"\t"$4"\t"$1"\t"$6"\t"$5}}' | sort -k1,1 -k2,2n | sed "s/\.optimized//g" > {wildcards.X}.temp
        Rscript scripts/convertEvalue2Score.r {wildcards.X}.temp {output.bed} 1
        rm {wildcards.X}.temp
        """

################################################################################
################################################################################
######################################## Classify Reads

# join adapter and mapping information
rule joinBed:
    input:
        adapter = "IdentifyAdapter/{X}/adapter.optimized.bed",
        mapping = "IGV/" + sampleID + ".bed"
    output:
        bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
    resources:
        memory = 10,
        time = 1
    shell:
        "cat <(grep -P -v '\t0\t[\+\-]+$' {input.adapter}) {input.mapping} | sort -k1,1 -k2,2n > {output.bed}"

rule mergeBed:
    params:
        overlap = 15
    input:
        bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
    output:
        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        sort -k1,1 -k2,2n {input.bed} | grep -v supplementary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_primary}
        sort -k1,1 -k2,2n {input.bed} | grep -v primary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_supplementary}
        """

# classification
rule classifyReads:
    input:
        bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
        bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed",
        len = "IGV/" + sampleID + ".len.txt"
    output:
        classification_temp = "Classifier_{X}/" + sampleID + ".classification.temp",
        classification = "Classifier_{X}/" + sampleID + ".classification.txt",
        stat = "Classifier_{X}/" + sampleID + ".classification.stat"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        Rscript scripts/classifyReads.r {input.bed_primary} {input.bed_supplementary} {input.len} {output.classification_temp} {output.stat}
        paste {output.classification_temp} <(grep -o -P "primary_.*_[0-9]+|supplementary_.*_[0-9]+" {output.classification_temp} | sed "s/\_/\t/g") | sort -k1,1n > {output.classification}
        """

# sort
rule sortByName:
    input:
        sam = "{X}.sam"
    output:
        sam_sorted = "{X}.sortedName.sam"
    resources:
        memory = 10,
        time = 1
    shell:
        """cat <(samtools view -H {input.sam}) <(grep -v -P "^@" {input.sam} | sort -k1,1n ) > {output.sam_sorted}"""

# write classification into sam file
rule modifySam:
    input:
        sam = "Data/" + sampleID + ".sortedName.sam",
        classification = "Classifier_{X}/" + sampleID + ".classification.txt"
    output:
        sam = "Classifier_{X}/" + sampleID + ".classified.sam"
    resources:
        memory = 50,
        time = 1
    shell:
        "Rscript scripts/modifySam.r {input.sam} {input.classification} {output.sam}"

################################################################################
################################################################################
######################################## Get statistics

# quantify suspicious species
rule informativeReads:
    params:
        featureCounts = config["featureCounts"],
        maskRrna = "deposit/maskRrna.txt",
        maskRegions = "deposit/maskRegion.txt"
    input:
        bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
    output:
        tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
        tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        {params.featureCounts} -F SAF -L -a {params.maskRrna} -o {output.tsv} {input.bam}
        {params.featureCounts} -F SAF -L -a {params.maskRegions} -o {output.tsv2} {input.bam}
        """

# convert to bam
rule convertIGV:
    input:
        sam = "{path}.sam"
    output:
        bam = "{path}.sorted.bam",
        bai = "{path}.sorted.bam.bai"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        samtools view -b {input.sam} > {wildcards.path}.bam
        samtools sort {wildcards.path}.bam -o {output.bam}
        samtools index {output.bam}
        """

# masked files
rule maskeBam:
    params:
        maskRrna = "deposit/maskRrna.txt",
        maskRegions = "deposit/maskRegion.txt"
    input:
        bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
    output:
        bam = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam",
        bai = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam.bai"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        cat <(tail -n +2 {params.maskRrna} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}') <(tail -n +2 {params.maskRegions} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}' ) | sort -k1,1 -k2,2n > Classifier_{wildcards.X}/maskRegions.bed
        bedtools intersect -v -abam {input.bam} -b Classifier_{wildcards.X}/maskRegions.bed > {output.bam}
        samtools index {output.bam}
        """

# get stats
rule compareMinimap2:
    input:
        sam = "Classifier_{X}/" + sampleID + ".classified.sam"
    output:
        stat = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f1) <(paste <(paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f2) <(grep -P -o "ts:A:[+-]+" {input.sam} )) <(grep -P "ts:A:[+-]+" {input.sam} | grep -o -P "ST:A:[+-\.]+")) > "Classifier_{wildcards.X}/temp"
        Rscript scripts/compareMinimapResults.r "Classifier_{wildcards.X}/temp" {output.stat}
        """

# get stats
rule getStats:
    input:
        tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
        tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv",
        stat = "Classifier_{X}/" + sampleID + ".classification.stat",
        stat2 = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
    output:
        pdf = sampleID + "_{X}.pdf"
    resources:
        memory = 10,
        time = 1
    shell:
        """
        cat <(cat <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv}.summary | awk '{{sum+=$2}} END {{print sum}}') <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv2}.summary | awk '{{sum+=$2}} END {{print sum}}')) \
            <(awk '{{sum+=$2}} END {{print sum}}' {input.tsv2}.summary) > Classifier_{wildcards.X}/stat.txt
        Rscript scripts/plotStatistics.r Classifier_{wildcards.X}/stat.txt {input.stat} {input.stat2} {output.pdf}
        """

#################################################################################