Skip to content
Permalink
Newer
Older
100755 425 lines (393 sloc) 15.1 KB
September 22, 2020 22:21
1
import os
2
import shutil
3
4
#GLOBAL VALUES
5
sampleID = config["sampleID"]
6
IDS, = glob_wildcards("RawData/"+sampleID+"/{id}.fast5")
7
WORKDIR = os.getcwd()
8
9
rule all:
10
input:
11
expand("Fastq/{sample}.fastq", sample=sampleID),
12
expand("QualityCheck/{sample}/summary.yaml", sample=sampleID),
13
sampleID + "_0.1.pdf"
14
15
################################################################################
16
################################################################################
17
######################################## Basecalling
18
# basecalling using GUPPY
19
rule Basecalling:
20
input:
21
"RawData/{sample}/{id}.fast5"
22
output:
23
"QC/{sample}/{id}.txt",
24
"Fast5/{sample}/{id}/{id}.fast5",
25
"Basecalled/{sample}/{id}.fastq"
26
resources:
27
memory = 10,
28
time = 5
29
params:
30
guppy = config["guppy"],
31
flowcell = config["flowcell"],
32
kit = config["kit"],
33
work = WORKDIR
34
priority: 10
35
group: "basecalling"
36
threads: config["threads"]
37
shell:
38
"""
39
mkdir -p Fast5/{wildcards.sample}/{wildcards.id}
40
41
ln -sf {params.work}/{input} {params.work}/Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
42
43
{params.guppy} -i Fast5/{wildcards.sample}/{wildcards.id}/ -s Basecalled/{wildcards.sample}/{wildcards.id}/ \
44
--flowcell {params.flowcell} --kit {params.kit} --num_callers 1 --cpu_threads_per_caller {threads} --fast5_out
45
46
mv -f Basecalled/{wildcards.sample}/{wildcards.id}/workspace/{wildcards.id}.fast5 Fast5/{wildcards.sample}/{wildcards.id}/{wildcards.id}.fast5
47
48
mv Basecalled/{wildcards.sample}/{wildcards.id}/sequencing_summary.txt QC/{wildcards.sample}/{wildcards.id}.txt
49
50
find Basecalled/{wildcards.sample}/{wildcards.id} -name '*.fastq' -exec mv {{}} Basecalled/{wildcards.sample}/{wildcards.id}.fastq \;
51
"""
52
53
# aggregation of fastq files
54
rule AggregateBasecalling:
55
input:
56
f5=lambda wildcards: expand("Basecalled/{{sample}}/{id}.fastq",id=IDS),
57
txt=lambda wildcards: expand("QC/{{sample}}/{id}.txt", id=IDS)
58
output:
59
fq="Fastq/{sample}.fastq",
60
txt="QualityCheck/{sample}/sequencing_summary.txt"
61
resources:
62
memory = 50,
63
time = 4
64
priority: 9
65
run:
66
with open(output.fq,'w') as fout:
67
for fn in input.f5:
68
with open(fn,'r') as fin:
69
shutil.copyfileobj(fin, fout)
70
with open(output.txt,'w') as fout:
71
i=0
72
for fn in input.txt:
73
with open(fn,'r') as fin:
74
header=fin.readline()
75
if i==0:
76
fout.write(header)
77
shutil.copyfileobj(fin, fout)
78
i+=1
79
80
# QC
81
rule CheckQuality:
82
input:
83
"QualityCheck/{sample}/sequencing_summary.txt"
84
output:
85
"QualityCheck/{sample}/summary.yaml"
86
priority: 8
87
resources:
88
memory = 10,
89
time = 1
90
shell:
91
"Rscript scripts/MinIONQC.R -i QualityCheck/{wildcards.sample}/ -o QualityCheck"
92
93
################################################################################
94
################################################################################
95
######################################## Extract Mapping regions
96
# mapping to pre-computed reference: here GRCh38_p12
97
rule Minimap2_mapping:
98
params:
99
minimap2 = config["minimap2"],
100
junc_bonus = config["junc_bonus"],
101
gap_open_cost = config["gap_open_cost"],
102
MAPQ_min = config["MAPQ"],
103
reference = "Minimap2_Index/GRCh38_p12.fasta",
104
junc_bed = "Minimap2_Index/junctions.bed"
105
input:
106
fastq = "Fastq/{sample}.fastq"
107
output:
108
sam = "Data/{sample}.sam",
109
stat = "Data/{sample}.mapping.stat"
110
resources:
111
memory = 50,
112
time = 4
113
threads: config["threads"]
114
shell:
115
"""
116
{params.minimap2} -ax splice -ub -k14 --secondary=no -t {threads} --junc-bed {params.junc_bed} -O{params.gap_open_cost},32 --junc-bonus={params.junc_bonus} {params.reference} {input.fastq} > {wildcards.sample}.temp
117
cat <(grep -P "^@" {wildcards.sample}.temp ) <(grep -P -v "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | awk "{{ if ( \$5 > {params.MAPQ_min} ) print \$0 }}" ) > {output.sam}
118
grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 != "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l > {output.stat}
119
grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$2 == "4" ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
120
grep -v -P "^@" {wildcards.sample}.temp | awk "{{ if (\$5 > {params.MAPQ_min} ) print \$0 }}" | cut -f1 | sort | uniq | wc -l >> {output.stat}
121
"""
122
123
# Identify Mapping reads
124
rule filterReads:
125
input:
126
mapping = "Data/" + sampleID + ".sam",
127
fastq = "Fastq/" + sampleID + ".fastq"
128
output:
129
id = "Data/" + sampleID + ".mapped.id",
130
fastq = "Data/" + sampleID + ".mapped.fastq"
131
resources:
132
memory = 50,
133
time = 1
134
shell:
135
"""
136
grep -P -v "^@" {input.mapping} | cut -f1,2 | grep -v -P "\t4$" | cut -f1 | sort | uniq > {output.id}
137
Rscript scripts/filterReads.r {input.fastq} {output.id} {output.fastq}
138
"""
139
140
# Identify Regions of the reads mapping the reference
141
rule getMappingBed:
142
input:
143
fastq = "Data/" + sampleID + ".mapped.fastq",
144
sam = "Data/" + sampleID + ".sam"
145
output:
146
length = "IGV/" + sampleID + ".len.txt",
147
cigar = "IGV/" + sampleID + ".cigar.txt",
148
bed = "IGV/" + sampleID + ".bed"
149
resources:
150
memory = 10,
151
time = 1
152
shell:
153
"""
154
Rscript scripts/getReadLength.r {input.fastq} {output.length}
155
paste <(grep -P -v '^@' {input.sam} | cut -f6 | grep -v '\*' | sed 's/\([MIDNSHP]\)/\\1 /g') <(grep -P -v '^@' {input.sam} | cut -f1,2,3,4 | grep -v -P '\t4\t' ) > {output.cigar}
156
Rscript scripts/getBedFile.r {output.cigar} {output.length} {output.bed}
157
"""
158
159
################################################################################
160
################################################################################
161
######################################## Extract Adapter regions
162
163
# convert reads to fasta
164
rule fastqToFasta:
165
input:
166
fastq = "{X}.fastq"
167
output:
168
fasta = "{X}.fasta"
169
resources:
170
memory = 10,
171
time = 1
172
shell:
173
"awk ' NR % 4 == 1 {{ print $0 ; }} NR % 4 == 2 {{print $0; }}' {input.fastq} |sed 's/@/>/g' | sed 's/ .*$//g' > {output.fasta}"
174
175
#first HMMER iteration
176
rule identifyAdapter_iteration1:
177
params:
178
barcodes = "deposit/barcodes.cDNA.fas",
179
hmmer = config["hmmer"],
180
prefix = "IdentifyAdapter/"
181
input:
182
fasta = "Fastq/" + sampleID + ".fasta"
183
output:
184
tab = "IdentifyAdapter/adapter.{X}.tab"
185
resources:
186
memory = 50,
187
time = 1
188
threads: config["threads"]
189
shell:
190
"""
191
Rscript scripts/getStockholmMSA.r {params.barcodes} {params.prefix}
192
for msa in {params.prefix}/*.msa
193
do
194
{params.hmmer}hmmbuild ${{msa%.msa}}.hmm $msa
195
{params.hmmer}hmmpress ${{msa%.msa}}.hmm
196
done
197
cat {params.prefix}/*.hmm > {params.prefix}adapter.hmm
198
{params.hmmer}hmmpress {params.prefix}adapter.hmm
199
200
{params.hmmer}nhmmscan --noali --notextw --max -E {wildcards.X} --cpu {threads} --tblout {output.tab} {params.prefix}adapter.hmm {input.fasta} > "IdentifyAdapter/temp"
201
"""
202
203
#second HMMER iteration
204
rule identifyAdapter_iteration2:
205
params:
206
barcodes = "deposit/barcodes.cDNA.fas",
207
hmmer = config["hmmer"],
208
prefix = "IdentifyAdapter/{X}/"
209
input:
210
fasta = "Fastq/" + sampleID + ".fasta",
211
tab = "IdentifyAdapter/adapter.{X}.tab"
212
output:
213
tab = "IdentifyAdapter/{X}/adapter.optimized.tab"
214
resources:
215
memory = 50,
216
time = 1
217
threads: config["threads"]
218
shell:
219
"""
220
grep -P -v "^#" {input.tab} | sed -r "s/[[:space:]]+/\t/g" | cut -f1,3,7,8,13 > {params.prefix}"firstHits"
221
Rscript scripts/fetchFasta.r {params.prefix}"firstHits" {input.fasta} {params.prefix} {wildcards.X}
222
for fasta in {params.prefix}*.fasta
223
do
224
p=${{fasta%.fasta}}
225
name=${{p##*/}}
226
{params.hmmer}hmmalign --trim IdentifyAdapter/${{name}}.hmm $fasta > ${{fasta%.fasta}}.optimized.msa
227
{params.hmmer}hmmbuild ${{fasta%.fasta}}.optimized.hmm ${{fasta%.fasta}}.optimized.msa
228
{params.hmmer}hmmpress ${{fasta%.fasta}}.optimized.hmm
229
done
230
cat {params.prefix}*.optimized.hmm > {params.prefix}adapter.optimized.hmm
231
{params.hmmer}hmmpress {params.prefix}adapter.optimized.hmm
232
{params.hmmer}nhmmscan --notextw --max -E 10 --cpu {threads} --tblout {output.tab} {params.prefix}adapter.optimized.hmm {input.fasta} > {params.prefix}"temp"
233
"""
234
235
#get adapter positions in reads
236
rule convert2Bed:
237
input:
238
tab = "{X}.tab"
239
output:
240
bed = "{X}.bed"
241
resources:
242
memory = 10,
243
time = 1
244
shell:
245
"""
246
sed -r "s/[[:space:]]+/\t/g" {input.tab} | grep -v -P "^#" | cut -f1,3,7,8,12,13 | \
247
awk '{{print $2"\t"$3"\t"$4"\t"$1"\t"$6"\t"$5}}' | sort -k1,1 -k2,2n | sed "s/\.optimized//g" > {wildcards.X}.temp
248
Rscript scripts/convertEvalue2Score.r {wildcards.X}.temp {output.bed} 1
249
rm {wildcards.X}.temp
250
"""
251
252
################################################################################
253
################################################################################
254
######################################## Classify Reads
255
256
# join adapter and mapping information
257
rule joinBed:
258
input:
259
adapter = "IdentifyAdapter/{X}/adapter.optimized.bed",
260
mapping = "IGV/" + sampleID + ".bed"
261
output:
262
bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
263
resources:
264
memory = 10,
265
time = 1
266
shell:
267
"cat <(grep -P -v '\t0\t[\+\-]+$' {input.adapter}) {input.mapping} | sort -k1,1 -k2,2n > {output.bed}"
268
269
rule mergeBed:
270
params:
271
overlap = 15
272
input:
273
bed = "Classifier_{X}/" + sampleID + ".mapping.adapter.bed"
274
output:
275
bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
276
bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed"
277
resources:
278
memory = 10,
279
time = 1
280
shell:
281
"""
282
sort -k1,1 -k2,2n {input.bed} | grep -v supplementary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_primary}
283
sort -k1,1 -k2,2n {input.bed} | grep -v primary | bedtools merge -delim '|' -d -{params.overlap} -c 2,3,4,5,6 -o collapse -i - > {output.bed_supplementary}
284
"""
285
286
# classification
287
rule classifyReads:
288
input:
289
bed_primary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.primary.bed",
290
bed_supplementary = "Classifier_{X}/" + sampleID + ".mapping.adapter.collapse.supplementary.bed",
291
len = "IGV/" + sampleID + ".len.txt"
292
output:
293
classification_temp = "Classifier_{X}/" + sampleID + ".classification.temp",
294
classification = "Classifier_{X}/" + sampleID + ".classification.txt",
295
stat = "Classifier_{X}/" + sampleID + ".classification.stat"
296
resources:
297
memory = 10,
298
time = 1
299
shell:
300
"""
301
Rscript scripts/classifyReads.r {input.bed_primary} {input.bed_supplementary} {input.len} {output.classification_temp} {output.stat}
302
paste {output.classification_temp} <(grep -o -P "primary_.*_[0-9]+|supplementary_.*_[0-9]+" {output.classification_temp} | sed "s/\_/\t/g") | sort -k1,1n > {output.classification}
303
"""
304
305
# sort
306
rule sortByName:
307
input:
308
sam = "{X}.sam",
309
output:
310
sam_sorted = "{X}.sortedName.sam"
311
resources:
312
memory = 10,
313
time = 1
314
shell:
315
"""cat <(samtools view -H {input.sam}) <(grep -v -P "^@" {input.sam} | sort -k1,1n ) > {output.sam_sorted}"""
316
317
# write classification into sam file
318
rule modifySam:
319
input:
320
sam = "Data/" + sampleID + ".sortedName.sam",
321
classification = "Classifier_{X}/" + sampleID + ".classification.txt"
322
output:
323
sam = "Classifier_{X}/" + sampleID + ".classified.sam"
324
resources:
325
memory = 50,
326
time = 1
327
shell:
328
"Rscript scripts/modifySam.r {input.sam} {input.classification} {output.sam}"
329
330
################################################################################
331
################################################################################
332
######################################## Get statistics
333
334
# quantify suspicious species
335
rule informativeReads:
336
params:
337
featureCounts = config["featureCounts"],
338
maskRrna = "deposit/maskRrna.txt",
339
maskRegions = "deposit/maskRegion.txt"
340
input:
341
bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
342
output:
343
tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
344
tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv"
345
resources:
346
memory = 10,
347
time = 1
348
shell:
349
"""
350
{params.featureCounts} -F SAF -L -a {params.maskRrna} -o {output.tsv} {input.bam}
351
{params.featureCounts} -F SAF -L -a {params.maskRegions} -o {output.tsv2} {input.bam}
352
"""
353
354
# convert to bam
355
rule convertIGV:
356
input:
357
sam = "{path}.sam"
358
output:
359
bam = "{path}.sorted.bam",
360
bai = "{path}.sorted.bam.bai"
361
resources:
362
memory = 10,
363
time = 1
364
shell:
365
"""
366
samtools view -b {input.sam} > {wildcards.path}.bam
367
samtools sort {wildcards.path}.bam -o {output.bam}
368
samtools index {output.bam}
369
"""
370
371
# masked files
372
rule maskeBam:
373
params:
374
maskRrna = "deposit/maskRrna.txt",
375
maskRegions = "deposit/maskRegion.txt"
376
input:
377
bam = "Classifier_{X}/" + sampleID + ".classified.sorted.bam"
378
output:
379
bam = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam",
380
bai = "Classifier_{X}/" + sampleID + ".classified.masked.sorted.bam.bai"
381
resources:
382
memory = 10,
383
time = 1
384
shell:
385
"""
386
cat <(tail -n +2 {params.maskRrna} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}') <(tail -n +2 {params.maskRegions} | awk '{{print $2"\t"$3"\t"$4"\t"$1"\t.\t"$5}}' ) | sort -k1,1 -k2,2n > Classifier_{wildcards.X}/maskRegions.bed
387
bedtools intersect -v -abam {input.bam} -b Classifier_{wildcards.X}/maskRegions.bed > {output.bam}
388
samtools index {output.bam}
389
"""
390
391
# get stats
392
rule compareMinimap2:
393
input:
394
sam = "Classifier_{X}/" + sampleID + ".classified.sam"
395
output:
396
stat = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
397
resources:
398
memory = 10,
399
time = 1
400
shell:
401
"""
402
paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f1) <(paste <(paste <(grep -P "ts:A:[+-]+" {input.sam} | cut -f2) <(grep -P -o "ts:A:[+-]+" {input.sam} )) <(grep -P "ts:A:[+-]+" {input.sam} | grep -o -P "ST:A:[+-\.]+")) > "Classifier_{wildcards.X}/temp"
403
Rscript scripts/compareMinimapResults.r "Classifier_{wildcards.X}/temp" {output.stat}
404
"""
405
406
# get stats
407
rule getStats:
408
input:
409
tsv = "Classifier_{X}/" + sampleID + ".classified.maskedRrna.tsv",
410
tsv2 = "Classifier_{X}/" + sampleID + ".classified.maskedRegion.tsv",
411
stat = "Classifier_{X}/" + sampleID + ".classification.stat",
412
stat2 = "Classifier_{X}/" + sampleID + ".classified.minimap2Comp.stat"
413
output:
414
pdf = "" + sampleID + "_{X}.pdf"
415
resources:
416
memory = 10,
417
time = 1
418
shell:
419
"""
420
cat <(cat <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv}.summary | awk '{{sum+=$2}} END {{print sum}}') <(grep "Assigned\|Unassigned_Ambiguity" {input.tsv2}.summary | awk '{{sum+=$2}} END {{print sum}}')) \
421
<(awk '{{sum+=$2}} END {{print sum}}' {input.tsv2}.summary) > Classifier_{wildcards.X}/stat.txt
422
Rscript scripts/plotStatistics.r Classifier_{wildcards.X}/stat.txt {input.stat} {input.stat2} {output.pdf}
423
"""
424
425
#################################################################################